Zhao Yanbai Git Server - minix.git/commitdiff
imported code harmonisation
author    Ben Gras <ben@minix3.org>
Tue, 13 Mar 2012 01:37:22 +0000 (02:37 +0100)
committer Ben Gras <ben@minix3.org>
Wed, 14 Mar 2012 15:02:59 +0000 (16:02 +0100)
. common/include/arch/i386 is not actually an imported
  sys/arch/i386/include but leftover Minix files;
  remove it and move its contents to include/
. move include/ufs to sys/ufs, where it came from, now that
  we have a sys/ hierarchy
. move mdocml/ to external/bsd/, now that we have that
. single sys/arch/i386/stand/ import for the boot code
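
In tree terms, the bullets above amount to moves along these lines (a sketch
only: the git mv commands are hypothetical stand-ins for however the import
was actually scripted, and the new sys/ufs .c files below come from a fresh
NetBSD source import rather than a move):

    git rm common/include/arch/i386/disklabel.h    # superseded by the copy in include/
    git mv common/include/arch/i386/*.h include/arch/i386/include/
    git mv include/ufs sys/ufs
    git mv usr.bin/mdocml external/bsd/mdocml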

233 files changed:
Makefile
common/include/arch/i386/disklabel.h [deleted file]
external/Makefile [new file with mode: 0644]
external/bsd/Makefile [new file with mode: 0644]
external/bsd/mdocml/Makefile [moved from usr.bin/mdocml/Makefile with 100% similarity]
external/bsd/mdocml/Makefile.inc [moved from usr.bin/mdocml/Makefile.inc with 95% similarity]
external/bsd/mdocml/bin/Makefile [moved from usr.bin/mdocml/bin/Makefile with 100% similarity]
external/bsd/mdocml/bin/Makefile.inc [moved from usr.bin/mdocml/bin/Makefile.inc with 100% similarity]
external/bsd/mdocml/bin/mandoc/Makefile [moved from usr.bin/mdocml/bin/mandoc/Makefile with 100% similarity]
external/bsd/mdocml/dist/Makefile [moved from usr.bin/mdocml/dist/Makefile with 100% similarity]
external/bsd/mdocml/dist/arch.c [moved from usr.bin/mdocml/dist/arch.c with 100% similarity]
external/bsd/mdocml/dist/arch.in [moved from usr.bin/mdocml/dist/arch.in with 100% similarity]
external/bsd/mdocml/dist/att.c [moved from usr.bin/mdocml/dist/att.c with 100% similarity]
external/bsd/mdocml/dist/att.in [moved from usr.bin/mdocml/dist/att.in with 100% similarity]
external/bsd/mdocml/dist/chars.c [moved from usr.bin/mdocml/dist/chars.c with 100% similarity]
external/bsd/mdocml/dist/chars.h [moved from usr.bin/mdocml/dist/chars.h with 100% similarity]
external/bsd/mdocml/dist/chars.in [moved from usr.bin/mdocml/dist/chars.in with 100% similarity]
external/bsd/mdocml/dist/compat.c [moved from usr.bin/mdocml/dist/compat.c with 100% similarity]
external/bsd/mdocml/dist/config.h.post [moved from usr.bin/mdocml/dist/config.h.post with 100% similarity]
external/bsd/mdocml/dist/config.h.pre [moved from usr.bin/mdocml/dist/config.h.pre with 100% similarity]
external/bsd/mdocml/dist/example.style.css [moved from usr.bin/mdocml/dist/example.style.css with 100% similarity]
external/bsd/mdocml/dist/external.png.uu [moved from usr.bin/mdocml/dist/external.png.uu with 100% similarity]
external/bsd/mdocml/dist/html.c [moved from usr.bin/mdocml/dist/html.c with 100% similarity]
external/bsd/mdocml/dist/html.h [moved from usr.bin/mdocml/dist/html.h with 100% similarity]
external/bsd/mdocml/dist/lib.c [moved from usr.bin/mdocml/dist/lib.c with 100% similarity]
external/bsd/mdocml/dist/lib.in [moved from usr.bin/mdocml/dist/lib.in with 100% similarity]
external/bsd/mdocml/dist/libman.h [moved from usr.bin/mdocml/dist/libman.h with 100% similarity]
external/bsd/mdocml/dist/libmandoc.h [moved from usr.bin/mdocml/dist/libmandoc.h with 100% similarity]
external/bsd/mdocml/dist/libmdoc.h [moved from usr.bin/mdocml/dist/libmdoc.h with 100% similarity]
external/bsd/mdocml/dist/libroff.h [moved from usr.bin/mdocml/dist/libroff.h with 100% similarity]
external/bsd/mdocml/dist/main.c [moved from usr.bin/mdocml/dist/main.c with 100% similarity]
external/bsd/mdocml/dist/main.h [moved from usr.bin/mdocml/dist/main.h with 100% similarity]
external/bsd/mdocml/dist/man.3 [moved from usr.bin/mdocml/dist/man.3 with 100% similarity]
external/bsd/mdocml/dist/man.7 [moved from usr.bin/mdocml/dist/man.7 with 100% similarity]
external/bsd/mdocml/dist/man.c [moved from usr.bin/mdocml/dist/man.c with 100% similarity]
external/bsd/mdocml/dist/man.h [moved from usr.bin/mdocml/dist/man.h with 100% similarity]
external/bsd/mdocml/dist/man_argv.c [moved from usr.bin/mdocml/dist/man_argv.c with 100% similarity]
external/bsd/mdocml/dist/man_hash.c [moved from usr.bin/mdocml/dist/man_hash.c with 100% similarity]
external/bsd/mdocml/dist/man_html.c [moved from usr.bin/mdocml/dist/man_html.c with 100% similarity]
external/bsd/mdocml/dist/man_macro.c [moved from usr.bin/mdocml/dist/man_macro.c with 100% similarity]
external/bsd/mdocml/dist/man_term.c [moved from usr.bin/mdocml/dist/man_term.c with 100% similarity]
external/bsd/mdocml/dist/man_validate.c [moved from usr.bin/mdocml/dist/man_validate.c with 100% similarity]
external/bsd/mdocml/dist/mandoc [moved from usr.bin/mdocml/dist/mandoc with 100% similarity]
external/bsd/mdocml/dist/mandoc.1 [moved from usr.bin/mdocml/dist/mandoc.1 with 100% similarity]
external/bsd/mdocml/dist/mandoc.c [moved from usr.bin/mdocml/dist/mandoc.c with 100% similarity]
external/bsd/mdocml/dist/mandoc.h [moved from usr.bin/mdocml/dist/mandoc.h with 100% similarity]
external/bsd/mdocml/dist/mandoc_char.7 [moved from usr.bin/mdocml/dist/mandoc_char.7 with 100% similarity]
external/bsd/mdocml/dist/mdoc.3 [moved from usr.bin/mdocml/dist/mdoc.3 with 100% similarity]
external/bsd/mdocml/dist/mdoc.7 [moved from usr.bin/mdocml/dist/mdoc.7 with 100% similarity]
external/bsd/mdocml/dist/mdoc.c [moved from usr.bin/mdocml/dist/mdoc.c with 100% similarity]
external/bsd/mdocml/dist/mdoc.h [moved from usr.bin/mdocml/dist/mdoc.h with 100% similarity]
external/bsd/mdocml/dist/mdoc_argv.c [moved from usr.bin/mdocml/dist/mdoc_argv.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_hash.c [moved from usr.bin/mdocml/dist/mdoc_hash.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_html.c [moved from usr.bin/mdocml/dist/mdoc_html.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_macro.c [moved from usr.bin/mdocml/dist/mdoc_macro.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_strings.c [moved from usr.bin/mdocml/dist/mdoc_strings.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_term.c [moved from usr.bin/mdocml/dist/mdoc_term.c with 100% similarity]
external/bsd/mdocml/dist/mdoc_validate.c [moved from usr.bin/mdocml/dist/mdoc_validate.c with 100% similarity]
external/bsd/mdocml/dist/msec.c [moved from usr.bin/mdocml/dist/msec.c with 100% similarity]
external/bsd/mdocml/dist/msec.in [moved from usr.bin/mdocml/dist/msec.in with 100% similarity]
external/bsd/mdocml/dist/out.c [moved from usr.bin/mdocml/dist/out.c with 100% similarity]
external/bsd/mdocml/dist/out.h [moved from usr.bin/mdocml/dist/out.h with 100% similarity]
external/bsd/mdocml/dist/roff.3 [moved from usr.bin/mdocml/dist/roff.3 with 100% similarity]
external/bsd/mdocml/dist/roff.7 [moved from usr.bin/mdocml/dist/roff.7 with 100% similarity]
external/bsd/mdocml/dist/roff.c [moved from usr.bin/mdocml/dist/roff.c with 100% similarity]
external/bsd/mdocml/dist/roff.h [moved from usr.bin/mdocml/dist/roff.h with 100% similarity]
external/bsd/mdocml/dist/st.c [moved from usr.bin/mdocml/dist/st.c with 100% similarity]
external/bsd/mdocml/dist/st.in [moved from usr.bin/mdocml/dist/st.in with 100% similarity]
external/bsd/mdocml/dist/tbl.7 [moved from usr.bin/mdocml/dist/tbl.7 with 100% similarity]
external/bsd/mdocml/dist/tbl.c [moved from usr.bin/mdocml/dist/tbl.c with 100% similarity]
external/bsd/mdocml/dist/tbl_data.c [moved from usr.bin/mdocml/dist/tbl_data.c with 100% similarity]
external/bsd/mdocml/dist/tbl_html.c [moved from usr.bin/mdocml/dist/tbl_html.c with 100% similarity]
external/bsd/mdocml/dist/tbl_layout.c [moved from usr.bin/mdocml/dist/tbl_layout.c with 100% similarity]
external/bsd/mdocml/dist/tbl_opts.c [moved from usr.bin/mdocml/dist/tbl_opts.c with 100% similarity]
external/bsd/mdocml/dist/tbl_term.c [moved from usr.bin/mdocml/dist/tbl_term.c with 100% similarity]
external/bsd/mdocml/dist/term.c [moved from usr.bin/mdocml/dist/term.c with 100% similarity]
external/bsd/mdocml/dist/term.h [moved from usr.bin/mdocml/dist/term.h with 100% similarity]
external/bsd/mdocml/dist/term_ascii.c [moved from usr.bin/mdocml/dist/term_ascii.c with 100% similarity]
external/bsd/mdocml/dist/term_ps.c [moved from usr.bin/mdocml/dist/term_ps.c with 100% similarity]
external/bsd/mdocml/dist/test-strlcat.c [moved from usr.bin/mdocml/dist/test-strlcat.c with 100% similarity]
external/bsd/mdocml/dist/test-strlcpy.c [moved from usr.bin/mdocml/dist/test-strlcpy.c with 100% similarity]
external/bsd/mdocml/dist/tree.c [moved from usr.bin/mdocml/dist/tree.c with 100% similarity]
external/bsd/mdocml/dist/vol.c [moved from usr.bin/mdocml/dist/vol.c with 100% similarity]
external/bsd/mdocml/dist/vol.in [moved from usr.bin/mdocml/dist/vol.in with 100% similarity]
external/bsd/mdocml/lib/Makefile [moved from usr.bin/mdocml/lib/Makefile with 100% similarity]
external/bsd/mdocml/lib/Makefile.inc [moved from usr.bin/mdocml/lib/Makefile.inc with 100% similarity]
external/bsd/mdocml/lib/libman/Makefile [moved from usr.bin/mdocml/lib/libman/Makefile with 100% similarity]
external/bsd/mdocml/lib/libmdoc/Makefile [moved from usr.bin/mdocml/lib/libmdoc/Makefile with 100% similarity]
external/bsd/mdocml/lib/libroff/Makefile [moved from usr.bin/mdocml/lib/libroff/Makefile with 100% similarity]
external/bsd/mdocml/man/Makefile [moved from usr.bin/mdocml/man/Makefile with 100% similarity]
external/bsd/mdocml/prepare-import.sh [moved from usr.bin/mdocml/prepare-import.sh with 100% similarity]
include/Makefile
include/arch/i386/include/Makefile
include/arch/i386/include/Makefile.inc [moved from common/include/arch/i386/Makefile.inc with 100% similarity]
include/arch/i386/include/archtypes.h [moved from common/include/arch/i386/archtypes.h with 100% similarity]
include/arch/i386/include/bios.h [moved from common/include/arch/i386/bios.h with 100% similarity]
include/arch/i386/include/cmos.h [moved from common/include/arch/i386/cmos.h with 100% similarity]
include/arch/i386/include/cpu.h [moved from common/include/arch/i386/cpu.h with 100% similarity]
include/arch/i386/include/disklabel.h
include/arch/i386/include/diskparm.h [moved from common/include/arch/i386/diskparm.h with 100% similarity]
include/arch/i386/include/elf.h [moved from common/include/arch/i386/elf.h with 100% similarity]
include/arch/i386/include/elf_machdep.h [moved from common/include/arch/i386/elf_machdep.h with 100% similarity]
include/arch/i386/include/fpu.h [moved from common/include/arch/i386/fpu.h with 100% similarity]
include/arch/i386/include/int86.h [moved from common/include/arch/i386/int86.h with 100% similarity]
include/arch/i386/include/interrupt.h [moved from common/include/arch/i386/interrupt.h with 100% similarity]
include/arch/i386/include/memory.h [moved from common/include/arch/i386/memory.h with 100% similarity]
include/arch/i386/include/multiboot.h [moved from common/include/arch/i386/multiboot.h with 100% similarity]
include/arch/i386/include/mutex.h [moved from common/include/arch/i386/mutex.h with 100% similarity]
include/arch/i386/include/partition.h [moved from common/include/arch/i386/partition.h with 100% similarity]
include/arch/i386/include/pci.h [moved from common/include/arch/i386/pci.h with 100% similarity]
include/arch/i386/include/pci_amd.h [moved from common/include/arch/i386/pci_amd.h with 100% similarity]
include/arch/i386/include/pci_intel.h [moved from common/include/arch/i386/pci_intel.h with 100% similarity]
include/arch/i386/include/pci_sis.h [moved from common/include/arch/i386/pci_sis.h with 100% similarity]
include/arch/i386/include/pci_via.h [moved from common/include/arch/i386/pci_via.h with 100% similarity]
include/arch/i386/include/ports.h [moved from common/include/arch/i386/ports.h with 100% similarity]
include/arch/i386/include/stackframe.h [moved from common/include/arch/i386/stackframe.h with 100% similarity]
include/arch/i386/include/vm.h [moved from common/include/arch/i386/vm.h with 100% similarity]
sys/Makefile
sys/ufs/Makefile [new file with mode: 0644]
sys/ufs/chfs/chfs.h [moved from include/ufs/chfs/chfs.h with 100% similarity]
sys/ufs/chfs/chfs_args.h [moved from include/ufs/chfs/chfs_args.h with 100% similarity]
sys/ufs/chfs/chfs_build.c [new file with mode: 0644]
sys/ufs/chfs/chfs_erase.c [new file with mode: 0644]
sys/ufs/chfs/chfs_gc.c [new file with mode: 0644]
sys/ufs/chfs/chfs_ihash.c [new file with mode: 0644]
sys/ufs/chfs/chfs_inode.h [moved from include/ufs/chfs/chfs_inode.h with 100% similarity]
sys/ufs/chfs/chfs_malloc.c [new file with mode: 0644]
sys/ufs/chfs/chfs_nodeops.c [new file with mode: 0644]
sys/ufs/chfs/chfs_pool.c [new file with mode: 0644]
sys/ufs/chfs/chfs_pool.h [moved from include/ufs/chfs/chfs_pool.h with 100% similarity]
sys/ufs/chfs/chfs_readinode.c [new file with mode: 0644]
sys/ufs/chfs/chfs_scan.c [new file with mode: 0644]
sys/ufs/chfs/chfs_subr.c [new file with mode: 0644]
sys/ufs/chfs/chfs_vfsops.c [new file with mode: 0644]
sys/ufs/chfs/chfs_vnode.c [new file with mode: 0644]
sys/ufs/chfs/chfs_vnode_cache.c [new file with mode: 0644]
sys/ufs/chfs/chfs_vnops.c [new file with mode: 0644]
sys/ufs/chfs/chfs_wbuf.c [new file with mode: 0644]
sys/ufs/chfs/chfs_write.c [new file with mode: 0644]
sys/ufs/chfs/debug.c [new file with mode: 0644]
sys/ufs/chfs/debug.h [moved from include/ufs/chfs/debug.h with 100% similarity]
sys/ufs/chfs/ebh.c [new file with mode: 0644]
sys/ufs/chfs/ebh.h [moved from include/ufs/chfs/ebh.h with 100% similarity]
sys/ufs/chfs/ebh_media.h [moved from include/ufs/chfs/ebh_media.h with 100% similarity]
sys/ufs/chfs/ebh_misc.h [moved from include/ufs/chfs/ebh_misc.h with 100% similarity]
sys/ufs/chfs/media.h [moved from include/ufs/chfs/media.h with 100% similarity]
sys/ufs/ext2fs/Makefile [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs.h [moved from include/ufs/ext2fs/ext2fs.h with 100% similarity]
sys/ufs/ext2fs/ext2fs_alloc.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_balloc.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_bmap.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_bswap.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_dinode.h [moved from include/ufs/ext2fs/ext2fs_dinode.h with 100% similarity]
sys/ufs/ext2fs/ext2fs_dir.h [moved from include/ufs/ext2fs/ext2fs_dir.h with 100% similarity]
sys/ufs/ext2fs/ext2fs_extern.h [moved from include/ufs/ext2fs/ext2fs_extern.h with 100% similarity]
sys/ufs/ext2fs/ext2fs_inode.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_lookup.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_readwrite.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_subr.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_vfsops.c [new file with mode: 0644]
sys/ufs/ext2fs/ext2fs_vnops.c [new file with mode: 0644]
sys/ufs/ffs/Makefile [new file with mode: 0644]
sys/ufs/ffs/ffs_alloc.c [new file with mode: 0644]
sys/ufs/ffs/ffs_appleufs.c [new file with mode: 0644]
sys/ufs/ffs/ffs_balloc.c [new file with mode: 0644]
sys/ufs/ffs/ffs_bswap.c [new file with mode: 0644]
sys/ufs/ffs/ffs_extern.h [moved from include/ufs/ffs/ffs_extern.h with 100% similarity]
sys/ufs/ffs/ffs_inode.c [new file with mode: 0644]
sys/ufs/ffs/ffs_quota2.c [new file with mode: 0644]
sys/ufs/ffs/ffs_snapshot.c [new file with mode: 0644]
sys/ufs/ffs/ffs_subr.c [new file with mode: 0644]
sys/ufs/ffs/ffs_tables.c [new file with mode: 0644]
sys/ufs/ffs/ffs_vfsops.c [new file with mode: 0644]
sys/ufs/ffs/ffs_vnops.c [new file with mode: 0644]
sys/ufs/ffs/ffs_wapbl.c [new file with mode: 0644]
sys/ufs/ffs/fs.h [moved from include/ufs/ffs/fs.h with 100% similarity]
sys/ufs/files.ufs [new file with mode: 0644]
sys/ufs/lfs/CHANGES [new file with mode: 0644]
sys/ufs/lfs/Makefile [new file with mode: 0644]
sys/ufs/lfs/README [new file with mode: 0644]
sys/ufs/lfs/TODO [new file with mode: 0644]
sys/ufs/lfs/lfs.h [moved from include/ufs/lfs/lfs.h with 100% similarity]
sys/ufs/lfs/lfs_alloc.c [new file with mode: 0644]
sys/ufs/lfs/lfs_balloc.c [new file with mode: 0644]
sys/ufs/lfs/lfs_bio.c [new file with mode: 0644]
sys/ufs/lfs/lfs_cksum.c [new file with mode: 0644]
sys/ufs/lfs/lfs_debug.c [new file with mode: 0644]
sys/ufs/lfs/lfs_extern.h [moved from include/ufs/lfs/lfs_extern.h with 100% similarity]
sys/ufs/lfs/lfs_inode.c [new file with mode: 0644]
sys/ufs/lfs/lfs_itimes.c [new file with mode: 0644]
sys/ufs/lfs/lfs_rfw.c [new file with mode: 0644]
sys/ufs/lfs/lfs_segment.c [new file with mode: 0644]
sys/ufs/lfs/lfs_subr.c [new file with mode: 0644]
sys/ufs/lfs/lfs_syscalls.c [new file with mode: 0644]
sys/ufs/lfs/lfs_vfsops.c [new file with mode: 0644]
sys/ufs/lfs/lfs_vnops.c [new file with mode: 0644]
sys/ufs/mfs/Makefile [new file with mode: 0644]
sys/ufs/mfs/mfs_extern.h [moved from include/ufs/mfs/mfs_extern.h with 100% similarity]
sys/ufs/mfs/mfs_miniroot.c [new file with mode: 0644]
sys/ufs/mfs/mfs_vfsops.c [new file with mode: 0644]
sys/ufs/mfs/mfs_vnops.c [new file with mode: 0644]
sys/ufs/mfs/mfsnode.h [moved from include/ufs/mfs/mfsnode.h with 100% similarity]
sys/ufs/ufs/Makefile [new file with mode: 0644]
sys/ufs/ufs/dinode.h [moved from include/ufs/ufs/dinode.h with 100% similarity]
sys/ufs/ufs/dir.h [moved from include/ufs/ufs/dir.h with 100% similarity]
sys/ufs/ufs/dirhash.h [moved from include/ufs/ufs/dirhash.h with 100% similarity]
sys/ufs/ufs/extattr.h [moved from include/ufs/ufs/extattr.h with 100% similarity]
sys/ufs/ufs/inode.h [moved from include/ufs/ufs/inode.h with 100% similarity]
sys/ufs/ufs/quota.h [moved from include/ufs/ufs/quota.h with 100% similarity]
sys/ufs/ufs/quota1.h [moved from include/ufs/ufs/quota1.h with 100% similarity]
sys/ufs/ufs/quota1_subr.c [new file with mode: 0644]
sys/ufs/ufs/quota2.h [moved from include/ufs/ufs/quota2.h with 100% similarity]
sys/ufs/ufs/quota2_subr.c [new file with mode: 0644]
sys/ufs/ufs/ufs_bmap.c [new file with mode: 0644]
sys/ufs/ufs/ufs_bswap.h [moved from include/ufs/ufs/ufs_bswap.h with 100% similarity]
sys/ufs/ufs/ufs_dirhash.c [new file with mode: 0644]
sys/ufs/ufs/ufs_extattr.c [new file with mode: 0644]
sys/ufs/ufs/ufs_extern.h [moved from include/ufs/ufs/ufs_extern.h with 100% similarity]
sys/ufs/ufs/ufs_ihash.c [new file with mode: 0644]
sys/ufs/ufs/ufs_inode.c [new file with mode: 0644]
sys/ufs/ufs/ufs_lookup.c [new file with mode: 0644]
sys/ufs/ufs/ufs_quota.c [new file with mode: 0644]
sys/ufs/ufs/ufs_quota.h [moved from include/ufs/ufs/ufs_quota.h with 100% similarity]
sys/ufs/ufs/ufs_quota1.c [new file with mode: 0644]
sys/ufs/ufs/ufs_quota2.c [new file with mode: 0644]
sys/ufs/ufs/ufs_readwrite.c [new file with mode: 0644]
sys/ufs/ufs/ufs_vfsops.c [new file with mode: 0644]
sys/ufs/ufs/ufs_vnops.c [new file with mode: 0644]
sys/ufs/ufs/ufs_wapbl.c [new file with mode: 0644]
sys/ufs/ufs/ufs_wapbl.h [moved from include/ufs/ufs/ufs_wapbl.h with 100% similarity]
sys/ufs/ufs/ufsmount.h [moved from include/ufs/ufs/ufsmount.h with 100% similarity]
tools/nbsd_ports
usr.bin/Makefile

index e9e111758a74bc3389fe8a80e107c4e6c269da14..fa49ec775761e90d5f39a0f4cafd0a546de2d96d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ mkfiles:
 includes:
        $(MAKE) -C include includes
        $(MAKE) -C lib includes NBSD_LIBC=yes
+       $(MAKE) -C sys includes
 
 MKHEADERSS=/usr/pkg/gcc*/libexec/gcc/*/*/install-tools/mkheaders
 gnu-includes: includes
@@ -50,6 +51,7 @@ commands: includes libraries
        $(MAKE) -C bin all
        $(MAKE) -C sbin all
        $(MAKE) -C usr.bin all
+       $(MAKE) -C external all
        $(MAKE) -C libexec all
        $(MAKE) -C usr.sbin all
 
@@ -59,6 +61,7 @@ dep-all:
        $(MAKE) -C bin dependall
        $(MAKE) -C sbin dependall
        $(MAKE) -C usr.bin dependall
+       $(MAKE) -C external dependall
        $(MAKE) -C libexec dependall
        $(MAKE) -C usr.sbin dependall
        $(MAKE) -C kernel dependall
@@ -77,6 +80,7 @@ all:
        $(MAKE) -C bin all
        $(MAKE) -C sbin all
        $(MAKE) -C usr.bin all
+       $(MAKE) -C external all
        $(MAKE) -C libexec all
        $(MAKE) -C usr.sbin all
        $(MAKE) -C tools all
@@ -89,6 +93,7 @@ install:
        $(MAKE) -C bin install
        $(MAKE) -C sbin install
        $(MAKE) -C usr.bin install
+       $(MAKE) -C external install
        $(MAKE) -C usr.sbin install
        $(MAKE) -C servers install
        $(MAKE) -C share install
@@ -100,6 +105,7 @@ clean: mkfiles
        $(MAKE) -C bin clean
        $(MAKE) -C sbin clean
        $(MAKE) -C usr.bin clean
+       $(MAKE) -C external clean
        $(MAKE) -C libexec clean
        $(MAKE) -C usr.sbin clean
        $(MAKE) -C share clean
@@ -114,6 +120,7 @@ cleandepend: mkfiles
        $(MAKE) -C bin cleandepend
        $(MAKE) -C sbin cleandepend
        $(MAKE) -C usr.bin cleandepend
+       $(MAKE) -C external cleandepend
        $(MAKE) -C libexec cleandepend
        $(MAKE) -C usr.sbin cleandepend
        $(MAKE) -C tools cleandepend
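
With the hunks above applied, the two new subtrees participate in every phase
of the tree build. Illustrative top-level invocations (not commands from the
commit itself):

    make includes   # now also runs $(MAKE) -C sys includes for the sys/ headers
    make commands   # builds external/ (mdocml) alongside bin, sbin, usr.bin, ...
    make clean      # external/ is likewise wired into install, clean, cleandepend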
diff --git a/common/include/arch/i386/disklabel.h b/common/include/arch/i386/disklabel.h
deleted file mode 100644 (file)
index e7d5246..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-/*     $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $    */
-
-/*
- * Copyright (c) 1994 Christopher G. Demetriou
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *      This product includes software developed by Christopher G. Demetriou.
- * 4. The name of the author may not be used to endorse or promote products
- *    derived from this software without specific prior written permission
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _I386_DISKLABEL_H_
-#define _I386_DISKLABEL_H_
-
-#define LABELUSESMBR           1       /* use MBR partitionning */
-#define        LABELSECTOR             1       /* sector containing label */
-#define        LABELOFFSET             0       /* offset of label in sector */
-#define        MAXPARTITIONS           16      /* number of partitions */
-#define        OLDMAXPARTITIONS        8       /* number of partitions before 1.6 */
-#define        RAW_PART                3       /* raw partition: XX?d (XXX) */
-
-/*
- * We use the highest bit of the minor number for the partition number.
- * This maintains backward compatibility with device nodes created before
- * MAXPARTITIONS was increased.
- */
-#define __I386_MAXDISKS        ((1 << 20) / MAXPARTITIONS)
-#define DISKUNIT(dev)  ((minor(dev) / OLDMAXPARTITIONS) % __I386_MAXDISKS)
-#define DISKPART(dev)  ((minor(dev) % OLDMAXPARTITIONS) + \
-    ((minor(dev) / (__I386_MAXDISKS * OLDMAXPARTITIONS)) * OLDMAXPARTITIONS))
-#define        DISKMINOR(unit, part) \
-    (((unit) * OLDMAXPARTITIONS) + ((part) % OLDMAXPARTITIONS) + \
-     ((part) / OLDMAXPARTITIONS) * (__I386_MAXDISKS * OLDMAXPARTITIONS))
-
-/* Pull in MBR partition definitions. */
-#if HAVE_NBTOOL_CONFIG_H
-#include <nbinclude/sys/bootblock.h>
-#else
-#include <sys/bootblock.h>
-#endif /* HAVE_NBTOOL_CONFIG_H */
-
-#ifndef __ASSEMBLER__
-#if HAVE_NBTOOL_CONFIG_H
-#include <nbinclude/sys/dkbad.h>
-#else
-#include <sys/dkbad.h>
-#endif /* HAVE_NBTOOL_CONFIG_H */
-struct cpu_disklabel {
-#define __HAVE_DISKLABEL_DKBAD
-       struct dkbad bad;
-};
-#endif
-
-#endif /* _I386_DISKLABEL_H_ */
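
For reference, the minor-number encoding this header carries (and which
survives at its new home, include/arch/i386/include/disklabel.h, below): with
MAXPARTITIONS = 16, OLDMAXPARTITIONS = 8 and __I386_MAXDISKS = (1 << 20)/16 =
65536 (so __I386_MAXDISKS * OLDMAXPARTITIONS = 524288), a worked example of
the macros:

    DISKMINOR(1, 3)  = 1*8 + 3%8  + (3/8)*524288    = 11      (parts 0-7: old encoding)
    DISKMINOR(1, 10) = 1*8 + 10%8 + (10/8)*524288   = 524298  (parts 8-15: high bits)
    DISKUNIT(524298) = (524298/8) % 65536           = 1
    DISKPART(524298) = 524298%8 + (524298/524288)*8 = 10

so device nodes created before MAXPARTITIONS was raised (partitions 0-7) keep
their old minor numbers, exactly as the comment in the header promises.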
diff --git a/external/Makefile b/external/Makefile
new file mode 100644 (file)
index 0000000..4072adc
--- /dev/null
@@ -0,0 +1,3 @@
+SUBDIR=bsd
+
+.include <bsd.subdir.mk>
diff --git a/external/bsd/Makefile b/external/bsd/Makefile
new file mode 100644 (file)
index 0000000..5874067
--- /dev/null
@@ -0,0 +1,3 @@
+.include <bsd.own.mk>
+SUBDIR=mdocml
+.include <bsd.subdir.mk>
similarity index 95%
rename from usr.bin/mdocml/Makefile.inc
rename to external/bsd/mdocml/Makefile.inc
index 7368b890c5cfcf0a95e9682b4e454ef837c1285d..6dd59dd61db9b4b21c4b7e50be0c1a90b60eba7a 100644 (file)
@@ -1,7 +1,6 @@
 # $NetBSD: Makefile.inc,v 1.12 2010/07/25 19:16:18 joerg Exp $
 
 .include <bsd.own.mk>
-.include "../Makefile.inc"
 
 VERSION!=      cd ${.PARSEDIR}/dist && ${MAKE} -V VERSION
 
index e95fdcda45a1d68cff2d9908e52443e3c3886d24..5a120d103ab8f7a6b5dfa43d558f5e8aff6403eb 100644 (file)
@@ -25,19 +25,6 @@ INCS=        a.out.h aio.h ar.h assert.h atomic.h \
        ttyent.h tzfile.h ucontext.h ulimit.h unistd.h util.h utime.h utmp.h \
        utmpx.h uuid.h varargs.h vis.h wchar.h wctype.h wordexp.h
 
-INCS +=        ufs/chfs/chfs.h ufs/chfs/chfs_args.h ufs/chfs/chfs_inode.h \
-       ufs/chfs/chfs_pool.h ufs/chfs/debug.h ufs/chfs/ebh.h \
-       ufs/chfs/ebh_media.h ufs/chfs/ebh_misc.h ufs/chfs/media.h \
-       ufs/ext2fs/ext2fs.h ufs/ext2fs/ext2fs_dinode.h \
-       ufs/ext2fs/ext2fs_dir.h ufs/ext2fs/ext2fs_extern.h \
-       ufs/ffs/ffs_extern.h ufs/ffs/fs.h ufs/lfs/lfs.h \
-       ufs/lfs/lfs_extern.h ufs/mfs/mfs_extern.h ufs/mfs/mfsnode.h \
-       ufs/ufs/dinode.h ufs/ufs/dir.h ufs/ufs/dirhash.h \
-       ufs/ufs/extattr.h ufs/ufs/inode.h ufs/ufs/quota.h \
-       ufs/ufs/quota1.h ufs/ufs/quota2.h ufs/ufs/ufs_bswap.h \
-       ufs/ufs/ufs_extern.h ufs/ufs/ufs_quota.h ufs/ufs/ufs_wapbl.h \
-       ufs/ufs/ufsmount.h \
-
 .else
 INCS=  a.out.h aio.h ar.h assert.h atomic.h \
        bitstring.h bm.h cdbr.h cdbw.h complex.h cpio.h ctype.h \
index 93281aee8a48139ed9a10ede7b3ac8025ea309c2..8a1150c6552b3f0297625f94692c774466862681 100644 (file)
@@ -9,8 +9,12 @@ INCS=  ansi.h asm.h bswap.h byte_swap.h cdefs.h \
        int_mwgwtypes.h int_types.h limits.h \
        math.h mcontext.h npx.h param.h profile.h \
        setjmp.h signal.h stdarg.h types.h \
-       vmparam.h wchar_limits.h
+       vmparam.h wchar_limits.h \
+       archtypes.h bios.h cmos.h cpu.h diskparm.h fpu.h int86.h \
+       interrupt.h memory.h multiboot.h partition.h \
+       pci.h pci_amd.h pci_intel.h pci_sis.h pci_via.h \
+       ports.h stackframe.h vm.h elf.h elf_machdep.h mutex.h \
+       disklabel.h
 
 
-.include "${MINIXSRCDIR}/common/include/arch/i386/Makefile.inc"
 .include <bsd.kinc.mk>
index bf567de6d5da9394f5da1a06f9356edc5c39e6cd..e7d5246bc31230cc8848a502edf3596452717748 100644 (file)
@@ -1,4 +1,4 @@
-/*     $NetBSD: disklabel.h,v 1.15 2009/11/23 13:40:10 pooka Exp $     */
+/*     $NetBSD: disklabel.h,v 1.16 2011/08/30 12:39:55 bouyer Exp $    */
 
 /*
  * Copyright (c) 1994 Christopher G. Demetriou
@@ -33,6 +33,7 @@
 #ifndef _I386_DISKLABEL_H_
 #define _I386_DISKLABEL_H_
 
+#define LABELUSESMBR           1       /* use MBR partitionning */
 #define        LABELSECTOR             1       /* sector containing label */
 #define        LABELOFFSET             0       /* offset of label in sector */
 #define        MAXPARTITIONS           16      /* number of partitions */
index 3bee04ec74c301897c3d3086fa9c92d0d0ed1f83..1861b4903bba37265bd1774a540fa729e4c23d38 100644 (file)
@@ -6,5 +6,6 @@ SUBDIR= arch/i386/stand/mbr
 SUBDIR+= arch/i386/stand/bootxx
 SUBDIR+= arch/i386/stand/boot
 SUBDIR+= arch/i386/stand/cdboot
+SUBDIR+= ufs
 
 .include <bsd.subdir.mk>
diff --git a/sys/ufs/Makefile b/sys/ufs/Makefile
new file mode 100644 (file)
index 0000000..c06bbf8
--- /dev/null
@@ -0,0 +1,7 @@
+#      $NetBSD: Makefile,v 1.2 2002/11/26 23:30:35 lukem Exp $
+
+SUBDIR=        ffs lfs mfs ufs ext2fs
+
+INCSDIR= /usr/include/ufs
+
+.include <bsd.kinc.mk>
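
The seven-line Makefile above hooks the moved headers into the "make includes"
pass: bsd.kinc.mk installs each subdirectory's INCS under INCSDIR, so the ufs
headers keep landing in /usr/include/ufs just as they did from their old
include/ufs home. Roughly (a sketch; expected result, not captured output):

    make -C sys includes     # descends into sys/ufs/{ffs,lfs,mfs,ufs,ext2fs}
    ls /usr/include/ufs/ufs  # dinode.h dir.h inode.h quota.h ... as before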
similarity index 100%
rename from include/ufs/chfs/chfs.h
rename to sys/ufs/chfs/chfs.h
diff --git a/sys/ufs/chfs/chfs_build.c b/sys/ufs/chfs/chfs_build.c
new file mode 100644 (file)
index 0000000..3904b02
--- /dev/null
@@ -0,0 +1,405 @@
+/*     $NetBSD: chfs_build.c,v 1.2 2011/11/24 21:22:39 agc Exp $       */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+
+void
+chfs_calc_trigger_levels(struct chfs_mount *chmp)
+{
+       uint32_t size;
+
+       chmp->chm_resv_blocks_deletion = 2;
+
+       size = chmp->chm_ebh->flash_size / 50;  //2% of flash size
+       size += chmp->chm_ebh->peb_nr * 100;
+       size += chmp->chm_ebh->eb_size - 1;
+
+       chmp->chm_resv_blocks_write =
+           chmp->chm_resv_blocks_deletion + (size / chmp->chm_ebh->eb_size);
+       chmp->chm_resv_blocks_gctrigger = chmp->chm_resv_blocks_write + 1;
+       chmp->chm_resv_blocks_gcmerge = chmp->chm_resv_blocks_deletion + 1;
+       chmp->chm_vdirty_blocks_gctrigger = chmp->chm_resv_blocks_gctrigger * 10;
+
+       chmp->chm_nospc_dirty =
+           chmp->chm_ebh->eb_size + (chmp->chm_ebh->flash_size / 100);
+}
+
+
+/**
+ * chfs_build_set_vnodecache_nlink - set pvno and nlink in vnodecaches
+ * @chmp: CHFS main descriptor structure
+ * @vc: vnode cache
+ * This function travels @vc's directory entries and sets the pvno and nlink
+ * attribute of the vnode where the dirent's vno points.
+ */
+void
+chfs_build_set_vnodecache_nlink(struct chfs_mount *chmp,
+    struct chfs_vnode_cache *vc)
+{
+       struct chfs_dirent *fd;
+       //dbg("set nlink\n");
+
+//     for (fd = vc->scan_dirents; fd; fd = fd->next) {
+       TAILQ_FOREACH(fd, &vc->scan_dirents, fds) {
+               struct chfs_vnode_cache *child_vc;
+
+               if (!fd->vno)
+                       continue;
+
+               mutex_enter(&chmp->chm_lock_vnocache);
+               child_vc = chfs_vnode_cache_get(chmp, fd->vno);
+               mutex_exit(&chmp->chm_lock_vnocache);
+               if (!child_vc) {
+                       chfs_mark_node_obsolete(chmp, fd->nref);
+                       continue;
+               }
+               if (fd->type == VDIR) {
+                       if (child_vc->nlink < 1)
+                               child_vc->nlink = 1;
+
+                       if (child_vc->pvno) {
+                               chfs_err("found a hard link: child dir: %s"
+                                   ", (vno: %llu) of dir vno: %llu\n",
+                                   fd->name, (unsigned long long)fd->vno,
+                                   (unsigned long long)vc->vno);
+                       } else {
+                               //dbg("child_vc->pvno =
+                               //      vc->vno; pvno = %d\n", child_vc->pvno);
+                               child_vc->pvno = vc->vno;
+                       }
+               }
+               child_vc->nlink++;
+               //dbg("child_vc->nlink++;\n");
+               //child_vc->nlink++;
+               vc->nlink++;
+       }
+}
+
+/**
+ * chfs_build_remove_unlinked vnode
+ */
+/* static */
+void
+chfs_build_remove_unlinked_vnode(struct chfs_mount *chmp,
+    struct chfs_vnode_cache *vc,
+//    struct chfs_dirent **unlinked)
+    struct chfs_dirent_list *unlinked)
+{
+       struct chfs_node_ref *nref;
+       struct chfs_dirent *fd, *tmpfd;
+
+       dbg("START\n");
+       dbg("vno: %llu\n", (unsigned long long)vc->vno);
+
+       nref = vc->dnode;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       // The vnode cache is at the end of the data node's chain
+       while (nref != (struct chfs_node_ref *)vc) {
+               struct chfs_node_ref *next = nref->nref_next;
+               dbg("mark dnode\n");
+               chfs_mark_node_obsolete(chmp, nref);
+               nref = next;
+       }
+       nref = vc->dirents;
+       // The vnode cache is at the end of the dirent node's chain
+       while (nref != (struct chfs_node_ref *)vc) {
+               struct chfs_node_ref *next = nref->nref_next;
+               dbg("mark dirent\n");
+               chfs_mark_node_obsolete(chmp, nref);
+               nref = next;
+       }
+       if (!TAILQ_EMPTY(&vc->scan_dirents)) {
+               TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) {
+//             while (vc->scan_dirents) {
+                       struct chfs_vnode_cache *child_vc;
+//                     fd = vc->scan_dirents;
+                       dbg("dirent dump:\n");
+                       dbg(" ->vno:     %llu\n", (unsigned long long)fd->vno);
+                       dbg(" ->version: %llu\n", (unsigned long long)fd->version);
+                       dbg(" ->nhash:   0x%x\n", fd->nhash);
+                       dbg(" ->nsize:   %d\n", fd->nsize);
+                       dbg(" ->name:    %s\n", fd->name);
+                       dbg(" ->type:    %d\n", fd->type);
+//                     vc->scan_dirents = fd->next;
+                       TAILQ_REMOVE(&vc->scan_dirents, fd, fds);
+
+                       if (!fd->vno) {
+                               chfs_free_dirent(fd);
+                               continue;
+                       }
+                       mutex_enter(&chmp->chm_lock_vnocache);
+                       child_vc = chfs_vnode_cache_get(chmp, fd->vno);
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       if (!child_vc) {
+                               chfs_free_dirent(fd);
+                               continue;
+                       }
+                       /**
+                        * Decrease nlink in child. If it is 0, add to unlinked
+                        * dirents or just free it otherwise.
+                        */
+                       child_vc->nlink--;
+
+                       if (!child_vc->nlink) {
+                               //dbg("nlink is 0\n");
+//                             fd->next = *unlinked;
+//                             *unlinked = fd;
+                               // XXX HEAD or TAIL?
+                               // original code did HEAD, but we could add
+                               // it to the TAIL easily with TAILQ.
+                               TAILQ_INSERT_TAIL(unlinked, fd, fds);
+                       } else {
+                               chfs_free_dirent(fd);
+                       }
+               }
+       } else {
+               dbg("there are no scan dirents\n");
+       }
+
+       nref = vc->v;
+       while ((struct chfs_vnode_cache *)nref != vc) {
+               if (!CHFS_REF_OBSOLETE(nref))
+                       chfs_mark_node_obsolete(chmp, nref);
+               nref = nref->nref_next;
+       }
+
+       mutex_enter(&chmp->chm_lock_vnocache);
+       if (vc->vno != CHFS_ROOTINO)
+               chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_UNCHECKED);
+       mutex_exit(&chmp->chm_lock_vnocache);
+       dbg("END\n");
+}
+
+/**
+ * chfs_build_filesystem - build in-memory representation of filesystem
+ * @chmp: super block information
+ *
+ * Step 1:
+ * This function scans through the eraseblocks mapped in EBH.
+ * During scan builds up the map of vnodes and directory entries and puts them
+ * into the vnode_cache.
+ * Step 2:
+ * Scans the directory tree and set the nlink in the vnode caches.
+ * Step 3:
+ * Scans vnode caches with nlink = 0
+ */
+int
+chfs_build_filesystem(struct chfs_mount *chmp)
+{
+       int i,err = 0;
+       struct chfs_vnode_cache *vc;
+       struct chfs_dirent *fd, *tmpfd;
+//     struct chfs_dirent *unlinked = NULL;
+       struct chfs_node_ref **nref;
+       struct chfs_dirent_list unlinked;
+       struct chfs_vnode_cache *notregvc;
+
+       TAILQ_INIT(&unlinked);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+
+       /**
+        * Step 1
+        */
+       chmp->chm_flags |= CHFS_MP_FLAG_SCANNING;
+       for (i = 0; i < chmp->chm_ebh->peb_nr; i++) {
+               //dbg("processing block: %d\n", i);
+               chmp->chm_blocks[i].lnr = i;
+               chmp->chm_blocks[i].free_size = chmp->chm_ebh->eb_size;
+               //If the LEB is add to free list skip it.
+               if (chmp->chm_ebh->lmap[i] < 0) {
+                       //dbg("block %d is unmapped\n", i);
+                       TAILQ_INSERT_TAIL(&chmp->chm_free_queue,
+                           &chmp->chm_blocks[i], queue);
+                       chmp->chm_nr_free_blocks++;
+                       continue;
+               }
+
+               err = chfs_scan_eraseblock(chmp, &chmp->chm_blocks[i]);
+               switch (err) {
+               case CHFS_BLK_STATE_FREE:
+                       chmp->chm_nr_free_blocks++;
+                       TAILQ_INSERT_TAIL(&chmp->chm_free_queue,
+                           &chmp->chm_blocks[i], queue);
+                       break;
+               case CHFS_BLK_STATE_CLEAN:
+                       TAILQ_INSERT_TAIL(&chmp->chm_clean_queue,
+                           &chmp->chm_blocks[i], queue);
+                       break;
+               case CHFS_BLK_STATE_PARTDIRTY:
+                       //dbg("free size: %d\n", chmp->chm_blocks[i].free_size);
+                       if (chmp->chm_blocks[i].free_size > chmp->chm_wbuf_pagesize &&
+                           (!chmp->chm_nextblock ||
+                               chmp->chm_blocks[i].free_size >
+                               chmp->chm_nextblock->free_size)) {
+                               /* convert the old nextblock's free size to
+                                * dirty and put it on a list */
+                               if (chmp->chm_nextblock) {
+                                       err = chfs_close_eraseblock(chmp,
+                                           chmp->chm_nextblock);
+                                       if (err)
+                                               return err;
+                               }
+                               chmp->chm_nextblock = &chmp->chm_blocks[i];
+                       } else {
+                               /* convert the scanned block's free size to
+                                * dirty and put it on a list */
+                               err = chfs_close_eraseblock(chmp,
+                                   &chmp->chm_blocks[i]);
+                               if (err)
+                                       return err;
+                       }
+                       break;
+               case CHFS_BLK_STATE_ALLDIRTY:
+                       /*
+                        * The block has a valid EBH header, but it doesn't
+                        * contain any valid data.
+                        */
+                       TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+                           &chmp->chm_blocks[i], queue);
+                       chmp->chm_nr_erasable_blocks++;
+                       break;
+               default:
+                       /* It was an error, unknown  state */
+                       break;
+               }
+
+       }
+       chmp->chm_flags &= ~CHFS_MP_FLAG_SCANNING;
+
+
+       //TODO need bad block check (and bad block handling in EBH too!!)
+       /* Now EBH only checks block is bad  during its scan operation.
+        * Need check at erase + write + read...
+        */
+
+       /**
+        * Step 2
+        */
+       chmp->chm_flags |= CHFS_MP_FLAG_BUILDING;
+       for (i = 0; i < VNODECACHE_SIZE; i++) {
+               vc = chmp->chm_vnocache_hash[i];
+               while (vc) {
+                       dbg("vc->vno: %llu\n", (unsigned long long)vc->vno);
+                       if (!TAILQ_EMPTY(&vc->scan_dirents))
+                               chfs_build_set_vnodecache_nlink(chmp, vc);
+                       vc = vc->next;
+               }
+       }
+
+       /**
+        * Step 3
+        * Scan for vnodes with 0 nlink.
+        */
+       for (i =  0; i < VNODECACHE_SIZE; i++) {
+               vc = chmp->chm_vnocache_hash[i];
+               while (vc) {
+                       if (vc->nlink) {
+                               vc = vc->next;
+                               continue;
+                       }
+
+                       //dbg("remove unlinked start i: %d\n", i);
+                       chfs_build_remove_unlinked_vnode(chmp,
+                           vc, &unlinked);
+                       //dbg("remove unlinked end\n");
+                       vc = vc->next;
+               }
+       }
+       /* Remove the newly unlinked vnodes. They are on the unlinked list */
+       TAILQ_FOREACH_SAFE(fd, &unlinked, fds, tmpfd) {
+//     while (unlinked) {
+//             fd = unlinked;
+//             unlinked = fd->next;
+               TAILQ_REMOVE(&unlinked, fd, fds);
+               mutex_enter(&chmp->chm_lock_vnocache);
+               vc = chfs_vnode_cache_get(chmp, fd->vno);
+               mutex_exit(&chmp->chm_lock_vnocache);
+               if (vc) {
+                       chfs_build_remove_unlinked_vnode(chmp,
+                           vc, &unlinked);
+               }
+               chfs_free_dirent(fd);
+       }
+
+       chmp->chm_flags &= ~CHFS_MP_FLAG_BUILDING;
+
+       /* Free all dirents */
+       for (i =  0; i < VNODECACHE_SIZE; i++) {
+               vc = chmp->chm_vnocache_hash[i];
+               while (vc) {
+                       TAILQ_FOREACH_SAFE(fd, &vc->scan_dirents, fds, tmpfd) {
+//                     while (vc->scan_dirents) {
+//                             fd = vc->scan_dirents;
+//                             vc->scan_dirents = fd->next;
+                               TAILQ_REMOVE(&vc->scan_dirents, fd, fds);
+                               if (fd->vno == 0) {
+                                       //for (nref = &vc->dirents;
+                                       //     *nref != fd->nref;
+                                       //     nref = &((*nref)->next));
+
+                                       nref = &fd->nref;
+                                       *nref = fd->nref->nref_next;
+                                       //fd->nref->nref_next = NULL;
+                               } else if (fd->type == VDIR) {
+                                       //set state every non-VREG file's vc
+                                       mutex_enter(&chmp->chm_lock_vnocache);
+                                       notregvc =
+                                           chfs_vnode_cache_get(chmp,
+                                               fd->vno);
+                                       chfs_vnode_cache_set_state(chmp,
+                                           notregvc, VNO_STATE_PRESENT);
+                                       mutex_exit(&chmp->chm_lock_vnocache);
+                               }
+                               chfs_free_dirent(fd);
+                       }
+//                     vc->scan_dirents = NULL;
+                       KASSERT(TAILQ_EMPTY(&vc->scan_dirents));
+                       vc = vc->next;
+               }
+       }
+
+       //Set up chmp->chm_wbuf_ofs for the first write
+       if (chmp->chm_nextblock) {
+               dbg("free_size: %d\n", chmp->chm_nextblock->free_size);
+               chmp->chm_wbuf_ofs = chmp->chm_ebh->eb_size -
+                   chmp->chm_nextblock->free_size;
+       } else {
+               chmp->chm_wbuf_ofs = 0xffffffff;
+       }
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       return 0;
+}
+
diff --git a/sys/ufs/chfs/chfs_erase.c b/sys/ufs/chfs/chfs_erase.c
new file mode 100644 (file)
index 0000000..9ae49c3
--- /dev/null
@@ -0,0 +1,137 @@
+/*     $NetBSD: chfs_erase.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $     */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (c) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_erase.c
+ *
+ * Copyright (C) 2010  David Tengeri <dtengeri@inf.u-szeged.hu>,
+ *                     ...
+ *                     University of Szeged, Hungary
+ */
+
+#include "chfs.h"
+
+
+/**
+ * chfs_remap_leb - unmap and then map a leb
+ * @chmp: chfs mount structure
+ *
+ * This function gets an eraseblock from the erasable queue, unmaps it through
+ * EBH and maps another eraseblock to the same LNR.
+ * EBH will find a free eraseblock if any or will erase one if there isn't any
+ * free, just dirty block.
+ *
+ * Returns zero on case of success, errorcode otherwise.
+ *
+ * Needs more brainstorming here.
+ */
+int
+chfs_remap_leb(struct chfs_mount *chmp)
+{
+       int err;
+       struct chfs_eraseblock *cheb;
+       dbg("chfs_remap_leb\n");
+       uint32_t dirty, unchecked, used, free, wasted;
+
+       //dbg("chmp->chm_nr_erasable_blocks: %d\n", chmp->chm_nr_erasable_blocks);
+       //dbg("ltree: %p ecl: %p\n", &chmp->chm_ebh->ltree_lock, &chmp->chm_lock_sizes);
+       KASSERT(!rw_write_held(&chmp->chm_lock_wbuf));
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+
+       if (!chmp->chm_nr_erasable_blocks) {
+               //TODO
+               /* We don't have any erasable blocks, need to check if there are
+                * blocks on erasable_pending_wbuf_queue, flush the data and then
+                * we can remap it.
+                * If there aren't any blocks on that list too, we need to GC?
+                */
+               if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) {
+                       cheb = TAILQ_FIRST(&chmp->chm_erasable_pending_wbuf_queue);
+                       TAILQ_REMOVE(&chmp->chm_erasable_pending_wbuf_queue, cheb, queue);
+                       if (chmp->chm_wbuf_len) {
+                               mutex_exit(&chmp->chm_lock_sizes);
+                               chfs_flush_pending_wbuf(chmp);
+                               mutex_enter(&chmp->chm_lock_sizes);
+                       }
+                       TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue, cheb, queue);
+                       chmp->chm_nr_erasable_blocks++;
+               } else {
+                       /* We can't delete any block. */
+                       //FIXME should we return ENOSPC?
+                       return ENOSPC;
+               }
+       }
+       cheb = TAILQ_FIRST(&chmp->chm_erase_pending_queue);
+       TAILQ_REMOVE(&chmp->chm_erase_pending_queue, cheb, queue);
+       chmp->chm_nr_erasable_blocks--;
+       
+       dirty = cheb->dirty_size;
+       unchecked = cheb->unchecked_size;
+       used = cheb->used_size;
+       free = cheb->free_size;
+       wasted = cheb->wasted_size;
+
+       // Free allocated node references for this eraseblock
+       chfs_free_node_refs(cheb);
+
+       err = chfs_unmap_leb(chmp, cheb->lnr);
+       if (err)
+               return err;
+
+       err = chfs_map_leb(chmp, cheb->lnr);
+       if (err)
+               return err;
+       // Reset state to default and change chmp sizes too 
+       chfs_change_size_dirty(chmp, cheb, -dirty);
+       chfs_change_size_unchecked(chmp, cheb, -unchecked);
+       chfs_change_size_used(chmp, cheb, -used);
+       chfs_change_size_free(chmp, cheb, chmp->chm_ebh->eb_size - free);
+       chfs_change_size_wasted(chmp, cheb, -wasted);
+
+       KASSERT(cheb->dirty_size == 0);
+       KASSERT(cheb->unchecked_size == 0);
+       KASSERT(cheb->used_size == 0);
+       KASSERT(cheb->free_size == chmp->chm_ebh->eb_size);
+       KASSERT(cheb->wasted_size == 0);
+
+       cheb->first_node = NULL;
+       cheb->last_node  = NULL;
+       //put it to free_queue
+       TAILQ_INSERT_TAIL(&chmp->chm_free_queue, cheb, queue);
+       chmp->chm_nr_free_blocks++;
+       dbg("remaped (free: %d, erasable: %d)\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks);
+       KASSERT(!TAILQ_EMPTY(&chmp->chm_free_queue));
+
+       return 0;
+}
diff --git a/sys/ufs/chfs/chfs_gc.c b/sys/ufs/chfs/chfs_gc.c
new file mode 100644 (file)
index 0000000..aa32d64
--- /dev/null
@@ -0,0 +1,1238 @@
+/*     $NetBSD: chfs_gc.c,v 1.2 2011/11/24 21:09:37 agc Exp $  */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (c) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (c) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+
+void chfs_gc_release_inode(struct chfs_mount *,
+    struct chfs_inode *);
+struct chfs_inode *chfs_gc_fetch_inode(struct chfs_mount *,
+    ino_t, uint32_t);
+int chfs_check(struct chfs_mount *, struct chfs_vnode_cache *);
+void chfs_clear_inode(struct chfs_mount *, struct chfs_inode *);
+
+
+struct chfs_eraseblock *find_gc_block(struct chfs_mount *);
+int chfs_gcollect_pristine(struct chfs_mount *,
+    struct chfs_eraseblock *,
+    struct chfs_vnode_cache *, struct chfs_node_ref *);
+int chfs_gcollect_live(struct chfs_mount *,
+    struct chfs_eraseblock *, struct chfs_node_ref *,
+    struct chfs_inode *);
+int chfs_gcollect_vnode(struct chfs_mount *, struct chfs_inode *);
+int chfs_gcollect_dirent(struct chfs_mount *,
+    struct chfs_eraseblock *, struct chfs_inode *,
+    struct chfs_dirent *);
+int chfs_gcollect_deletion_dirent(struct chfs_mount *,
+    struct chfs_eraseblock *, struct chfs_inode *,
+    struct chfs_dirent *);
+int chfs_gcollect_dnode(struct chfs_mount *,
+    struct chfs_eraseblock *, struct chfs_inode *,
+    struct chfs_full_dnode *, uint32_t, uint32_t);
+
+/* must be called with chm_lock_mountfields held */
+void
+chfs_gc_trigger(struct chfs_mount *chmp)
+{
+       struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+       //mutex_enter(&chmp->chm_lock_sizes);
+       if (gc->gcth_running &&
+           chfs_gc_thread_should_wake(chmp)) {
+               cv_signal(&gc->gcth_wakeup);
+       }
+       //mutex_exit(&chmp->chm_lock_sizes);
+}
+
+
+void
+chfs_gc_thread(void *data)
+{
+       struct chfs_mount *chmp = data;
+       struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+       dbg_gc("[GC THREAD] thread started\n");
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+       while (gc->gcth_running) {
+               /* we must call chfs_gc_thread_should_wake with chm_lock_mountfields
+                * held, which is a bit awkwardly done here, but we cant relly
+                * do it otherway with the current design...
+                */
+               if (chfs_gc_thread_should_wake(chmp)) {
+//                     mutex_exit(&chmp->chm_lock_mountfields);
+                       if (chfs_gcollect_pass(chmp) == ENOSPC) {
+                               dbg_gc("No space for garbage collection\n");
+                               panic("No space for garbage collection\n");
+                               /* XXX why break here? i have added a panic
+                                * here to see if it gets triggered -ahoka
+                                */
+                               break;
+                       }
+                       /* XXX gcollect_pass drops the mutex */
+                       mutex_enter(&chmp->chm_lock_mountfields);
+               }
+
+               cv_timedwait_sig(&gc->gcth_wakeup,
+                   &chmp->chm_lock_mountfields, mstohz(100));
+       }
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       dbg_gc("[GC THREAD] thread stopped\n");
+       kthread_exit(0);
+}
+
+void
+chfs_gc_thread_start(struct chfs_mount *chmp)
+{
+       struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+       cv_init(&gc->gcth_wakeup, "chfsgccv");
+
+       gc->gcth_running = true;
+       kthread_create(PRI_NONE, /*KTHREAD_MPSAFE |*/ KTHREAD_MUSTJOIN,
+           NULL, chfs_gc_thread, chmp, &gc->gcth_thread,
+           "chfsgcth");
+}
+
+void
+chfs_gc_thread_stop(struct chfs_mount *chmp)
+{
+       struct garbage_collector_thread *gc = &chmp->chm_gc_thread;
+
+       /* check if it is actually running. if not, do nothing */
+       if (gc->gcth_running) {
+               gc->gcth_running = false;
+       } else {
+               return;
+       }
+       cv_signal(&gc->gcth_wakeup);
+       dbg_gc("[GC THREAD] stop signal sent\n");
+
+       kthread_join(gc->gcth_thread);
+#ifdef BROKEN_KTH_JOIN
+       kpause("chfsthjoin", false, mstohz(1000), NULL);
+#endif
+
+       cv_destroy(&gc->gcth_wakeup);
+}
+
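+/*
+ * Decide whether the GC thread has work to do.  It should wake when
+ * blocks are waiting to be erased, when unchecked nodes are left over
+ * from mount, when free space has dropped below the GC reserve while
+ * enough dirty space exists to reclaim, or when too many blocks have
+ * become very dirty.
+ */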
+/* must be called with chm_lock_mountfields held */
+int
+chfs_gc_thread_should_wake(struct chfs_mount *chmp)
+{
+       int nr_very_dirty = 0;
+       struct chfs_eraseblock *cheb;
+       uint32_t dirty;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+               dbg_gc("erase_pending\n");
+               return 1;
+       }
+
+       if (chmp->chm_unchecked_size) {
+               dbg_gc("unchecked\n");
+               return 1;
+       }
+
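+       /* Dirty space that GC itself would have to reclaim: blocks that
+        * are already fully erasable are not counted here. */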
+       dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks *
+           chmp->chm_ebh->eb_size;
+
+       if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks <
+           chmp->chm_resv_blocks_gctrigger && (dirty > chmp->chm_nospc_dirty)) {
+               dbg_gc("free: %d + erasable: %d < resv: %d\n",
+                   chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks,
+                   chmp->chm_resv_blocks_gctrigger);
+               dbg_gc("dirty: %d > nospc_dirty: %d\n",
+                   dirty, chmp->chm_nospc_dirty);
+
+               return 1;
+       }
+
+       TAILQ_FOREACH(cheb, &chmp->chm_very_dirty_queue, queue) {
+               nr_very_dirty++;
+               if (nr_very_dirty == chmp->chm_vdirty_blocks_gctrigger) {
+                       dbg_gc("nr_very_dirty\n");
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+void
+chfs_gc_release_inode(struct chfs_mount *chmp,
+    struct chfs_inode *ip)
+{
+       dbg_gc("release inode\n");
+       //mutex_exit(&ip->inode_lock);
+       //vput(ITOV(ip));
+}
+
+struct chfs_inode *
+chfs_gc_fetch_inode(struct chfs_mount *chmp, ino_t vno,
+    uint32_t unlinked)
+{
+       struct vnode *vp = NULL;
+       struct chfs_vnode_cache *vc;
+       struct chfs_inode *ip;
+       dbg_gc("fetch inode %llu\n", (unsigned long long)vno);
+
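+       /* An unlinked inode may have no in-core vnode at all.  If its
+        * vnode cache entry is still being checked, back off briefly and
+        * let the caller retry. */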
+       if (unlinked) {
+               dbg_gc("unlinked\n");
+               vp = chfs_vnode_lookup(chmp, vno);
+               if (!vp) {
+                       mutex_enter(&chmp->chm_lock_vnocache);
+                       vc = chfs_vnode_cache_get(chmp, vno);
+                       if (!vc) {
+                               mutex_exit(&chmp->chm_lock_vnocache);
+                               return NULL;
+                       }
+                       if (vc->state != VNO_STATE_CHECKEDABSENT) {
+                               //sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+                               mutex_exit(&chmp->chm_lock_vnocache);
+                               /* XXX why do we need the delay here?! */
+//                             kpause("chvncabs", true, mstohz(50), NULL);
+                               KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+                               cv_timedwait_sig(
+                                       &chmp->chm_gc_thread.gcth_wakeup,
+                                       &chmp->chm_lock_mountfields, mstohz(50));
+
+//                             KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+                       } else {
+                               mutex_exit(&chmp->chm_lock_vnocache);
+                       }
+                       return NULL;
+               }
+       } else {
+               dbg_gc("vnode lookup\n");
+               vp = chfs_vnode_lookup(chmp, vno);
+               //VFS_VGET(chmp->chm_fsmp, vno, &vp);
+       }
+       dbg_gc("vp to ip\n");
+       ip = VTOI(vp);
+       KASSERT(ip);
+       //mutex_enter(&ip->inode_lock);
+
+       return ip;
+}
+
+extern rb_tree_ops_t frag_rbtree_ops;
+
+int
+chfs_check(struct chfs_mount *chmp, struct chfs_vnode_cache *chvc)
+{
+       struct chfs_inode *ip;
+       struct vnode *vp;
+       int ret;
+
+       ip = pool_get(&chfs_inode_pool, PR_WAITOK);
+       if (!ip) {
+               return ENOMEM;
+       }
+
+       vp = kmem_zalloc(sizeof(struct vnode), KM_SLEEP);
+
+       ip->chvc = chvc;
+       ip->vp = vp;
+
+       vp->v_data = ip;
+
+       rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+       TAILQ_INIT(&ip->dents);
+
+       ret = chfs_read_inode_internal(chmp, ip);
+       if (!ret) {
+               chfs_clear_inode(chmp, ip);
+       }
+
+       pool_put(&chfs_inode_pool, ip);
+
+       return ret;
+}
+
+void
+chfs_clear_inode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+       struct chfs_dirent *fd, *tmpfd;
+       struct chfs_vnode_cache *chvc;
+
+
+       /* XXX not sure if this is the correct locking */
+//     mutex_enter(&chmp->chm_lock_vnocache);
+       chvc = ip->chvc;
+       /* Shouldn't this use logical OR instead? */
+       //bool deleted = (chvc && !(chvc->pvno || chvc->nlink));
+       int deleted = (chvc && !(chvc->pvno | chvc->nlink));
+
+       if (chvc && chvc->state != VNO_STATE_CHECKING) {
+//             chfs_vnode_cache_state_set(chmp, chvc, VNO_STATE_CLEARING);
+               chvc->state = VNO_STATE_CLEARING;
+       }
+
+       if (chvc->v && ((struct chfs_vnode_cache *)chvc->v != chvc)) {
+               if (deleted)
+                       chfs_mark_node_obsolete(chmp, chvc->v);
+               //chfs_free_refblock(chvc->v);
+       }
+//     mutex_enter(&chmp->chm_lock_vnocache);
+
+       chfs_kill_fragtree(&ip->fragtree);
+/*
+       fd = TAILQ_FIRST(&ip->dents);
+       while (fd) {
+               TAILQ_REMOVE(&ip->dents, fd, fds);
+               chfs_free_dirent(fd);
+               fd = TAILQ_FIRST(&ip->dents);
+       }
+*/
+
+       TAILQ_FOREACH_SAFE(fd, &ip->dents, fds, tmpfd) {
+               chfs_free_dirent(fd);
+       }
+
+       if (chvc && chvc->state == VNO_STATE_CHECKING) {
+               chfs_vnode_cache_set_state(chmp,
+                   chvc, VNO_STATE_CHECKEDABSENT);
+               if ((struct chfs_vnode_cache *)chvc->v == chvc &&
+                   (struct chfs_vnode_cache *)chvc->dirents == chvc &&
+                   (struct chfs_vnode_cache *)chvc->dnode == chvc)
+                       chfs_vnode_cache_remove(chmp, chvc);
+       }
+
+}
+
+struct chfs_eraseblock *
+find_gc_block(struct chfs_mount *chmp)
+{
+       struct chfs_eraseblock *ret;
+       struct chfs_eraseblock_queue *nextqueue;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       struct timespec now;
+       vfs_timestamp(&now);
+
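+       /* Use the timestamp's nanoseconds as a cheap pseudo-random value
+        * in [0, 127] to weight the queue selection below: roughly 50/128
+        * erase_pending, 60/128 very_dirty, 16/128 dirty and 2/128 clean,
+        * with fallbacks when the chosen queue is empty. */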
+       int n = now.tv_nsec % 128;
+
+       //dbg_gc("n = %d\n", n);
+again:
+/*    if (!TAILQ_EMPTY(&chmp->chm_bad_used_queue) && chmp->chm_nr_free_blocks > chmp->chm_nr_resv_blocks_gcbad) {
+      dbg_gc("Picking block from bad_used_queue to GC next\n");
+      nextqueue = &chmp->chm_bad_used_queue;
+      } else */if (n < 50 && !TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+               dbg_gc("Picking block from erase_pending_queue to GC next\n");
+               nextqueue = &chmp->chm_erase_pending_queue;
+       } else if (n < 110 && !TAILQ_EMPTY(&chmp->chm_very_dirty_queue)) {
+               dbg_gc("Picking block from very_dirty_queue to GC next\n");
+               nextqueue = &chmp->chm_very_dirty_queue;
+       } else if (n < 126 && !TAILQ_EMPTY(&chmp->chm_dirty_queue)) {
+               dbg_gc("Picking block from dirty_queue to GC next\n");
+               nextqueue = &chmp->chm_dirty_queue;
+       } else if (!TAILQ_EMPTY(&chmp->chm_clean_queue)) {
+               dbg_gc("Picking block from clean_queue to GC next\n");
+               nextqueue = &chmp->chm_clean_queue;
+       } else if (!TAILQ_EMPTY(&chmp->chm_dirty_queue)) {
+               dbg_gc("Picking block from dirty_queue to GC next"
+                   " (clean_queue was empty)\n");
+               nextqueue = &chmp->chm_dirty_queue;
+       } else if (!TAILQ_EMPTY(&chmp->chm_very_dirty_queue)) {
+               dbg_gc("Picking block from very_dirty_queue to GC next"
+                   " (clean_queue and dirty_queue were empty)\n");
+               nextqueue = &chmp->chm_very_dirty_queue;
+       } else if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+               dbg_gc("Picking block from erase_pending_queue to GC next"
+                   " (clean_queue and {very_,}dirty_queue were empty)\n");
+               nextqueue = &chmp->chm_erase_pending_queue;
+       } else if (!TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue)) {
+               dbg_gc("Syncing wbuf in order to reuse "
+                   "erasable_pending_wbuf_queue blocks\n");
+               rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+               chfs_flush_pending_wbuf(chmp);
+               rw_exit(&chmp->chm_lock_wbuf);
+               goto again;
+       } else {
+               dbg_gc("CHFS: no clean, dirty _or_ erasable"
+                   " blocks to GC from! Where are they all?\n");
+               return NULL;
+       }
+
+       ret = TAILQ_FIRST(nextqueue);
+       if (chmp->chm_nextblock) {
+               dbg_gc("nextblock num: %u - gcblock num: %u\n",
+                   chmp->chm_nextblock->lnr, ret->lnr);
+               if (ret == chmp->chm_nextblock)
+                       goto again;
+               //KASSERT(ret != chmp->chm_nextblock);
+               //dbg_gc("first node lnr: %u ofs: %u\n", ret->first_node->lnr, ret->first_node->offset);
+               //dbg_gc("last node lnr: %u ofs: %u\n", ret->last_node->lnr, ret->last_node->offset);
+       }
+       TAILQ_REMOVE(nextqueue, ret, queue);
+       chmp->chm_gcblock = ret;
+       ret->gc_node = ret->first_node;
+
+       if (!ret->gc_node) {
+               dbg_gc("Oops! ret->gc_node at LEB: %u is NULL\n", ret->lnr);
+               panic("CHFS BUG - one LEB's gc_node is NULL\n");
+       }
+
+       /* TODO wasted size? */
+       return ret;
+}
+
+
+int
+chfs_gcollect_pass(struct chfs_mount *chmp)
+{
+       struct chfs_vnode_cache *vc;
+       struct chfs_eraseblock *eb;
+       struct chfs_node_ref *nref;
+       uint32_t gcblock_dirty;
+       struct chfs_inode *ip;
+       ino_t vno, pvno;
+       uint32_t nlink;
+       int ret = 0;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+//     mutex_enter(&chmp->chm_lock_mountfields);
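+       /* Phase one: while unchecked space remains, walk the vnode
+        * numbers and CRC-check one vnode's nodes per call. */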
+       for (;;) {
+               mutex_enter(&chmp->chm_lock_sizes);
+
+               dbg_gc("unchecked size == %u\n", chmp->chm_unchecked_size);
+               if (!chmp->chm_unchecked_size)
+                       break;
+
+               if (chmp->chm_checked_vno > chmp->chm_max_vno) {
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       dbg_gc("checked_vno (#%llu) > max_vno (#%llu)\n",
+                           (unsigned long long)chmp->chm_checked_vno,
+                           (unsigned long long)chmp->chm_max_vno);
+                       return ENOSPC;
+               }
+
+               mutex_exit(&chmp->chm_lock_sizes);
+
+               mutex_enter(&chmp->chm_lock_vnocache);
+               dbg_gc("checking vno #%llu\n",
+                       (unsigned long long)chmp->chm_checked_vno);
+               dbg_gc("get vnode cache\n");
+               vc = chfs_vnode_cache_get(chmp, chmp->chm_checked_vno++);
+
+               if (!vc) {
+                       dbg_gc("!vc\n");
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       continue;
+               }
+
+               if ((vc->pvno | vc->nlink) == 0) {
+                       dbg_gc("(pvno | nlink) == 0\n");
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       continue;
+               }
+
+               dbg_gc("switch\n");
+               switch (vc->state) {
+               case VNO_STATE_CHECKEDABSENT:
+               case VNO_STATE_PRESENT:
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       continue;
+
+               case VNO_STATE_GC:
+               case VNO_STATE_CHECKING:
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       dbg_gc("VNO_STATE GC or CHECKING\n");
+                       panic("CHFS BUG - vc state gc or checking\n");
+
+               case VNO_STATE_READING:
+                       chmp->chm_checked_vno--;
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       /* XXX why do we need the delay here?! */
+                       kpause("chvncrea", true, mstohz(50), NULL);
+
+//                     sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+//                     KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       return 0;
+
+               default:
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       dbg_gc("default\n");
+                       panic("CHFS BUG - vc state is other than what we"
+                           " checked\n");
+
+               case VNO_STATE_UNCHECKED:
+                       ;
+               }
+
+               chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_CHECKING);
+
+               /* XXX check if this is too heavy to call under
+                * chm_lock_vnocache
+                */
+               ret = chfs_check(chmp, vc);
+               dbg_gc("set state\n");
+               chfs_vnode_cache_set_state(chmp,
+                   vc, VNO_STATE_CHECKEDABSENT);
+
+               mutex_exit(&chmp->chm_lock_vnocache);
+               mutex_exit(&chmp->chm_lock_mountfields);
+
+               return ret;
+       }
+
+
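+       /* Phase two: pick (or continue with) an eraseblock and collect
+        * the next live node from it. */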
+       eb = chmp->chm_gcblock;
+
+       if (!eb) {
+               eb = find_gc_block(chmp);
+       }
+
+       if (!eb) {
+               dbg_gc("!eb\n");
+               if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       return EAGAIN;
+               }
+               mutex_exit(&chmp->chm_lock_sizes);
+               mutex_exit(&chmp->chm_lock_mountfields);
+               return EIO;
+       }
+
+       if (!eb->used_size) {
+               dbg_gc("!eb->used_size\n");
+               goto eraseit;
+       }
+
+       nref = eb->gc_node;
+       //dbg_gc("gc use: %u\n", chmp->chm_nextblock->lnr);
+       //dbg_gc("nref: %u %u\n", nref->nref_lnr, nref->nref_offset);
+       gcblock_dirty = eb->dirty_size;
+
+       while (CHFS_REF_OBSOLETE(nref)) {
+               //dbg_gc("obsoleted nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+#ifdef DBG_MSG_GC
+               if (nref == chmp->chm_blocks[nref->nref_lnr].last_node) {
+                       dbg_gc("THIS NODE IS THE LAST NODE OF ITS EB\n");
+               }
+#endif
+               nref = node_next(nref);
+               if (!nref) {
+                       //dbg_gc("!nref\n");
+                       eb->gc_node = nref;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       panic("CHFS BUG - nref is NULL)\n");
+               }
+       }
+       eb->gc_node = nref;
+       //dbg_gc("nref the chosen one lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+       KASSERT(nref->nref_lnr == chmp->chm_gcblock->lnr);
+
+       if (!nref->nref_next) {
+               //dbg_gc("!nref->nref_next\n");
+               mutex_exit(&chmp->chm_lock_sizes);
+               if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+                       chfs_gcollect_pristine(chmp, eb, NULL, nref);
+               } else {
+                       chfs_mark_node_obsolete(chmp, nref);
+               }
+               goto lock_size;
+       }
+
+       dbg_gc("nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+       vc = chfs_nref_to_vc(nref);
+
+       mutex_exit(&chmp->chm_lock_sizes);
+
+       //dbg_gc("enter vnocache lock on #%llu\n", vc->vno);
+       mutex_enter(&chmp->chm_lock_vnocache);
+
+       dbg_gc("switch\n");
+       switch (vc->state) {
+       case VNO_STATE_CHECKEDABSENT:
+               if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+                       chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_GC);
+               }
+               break;
+
+       case VNO_STATE_PRESENT:
+               break;
+
+       case VNO_STATE_UNCHECKED:
+       case VNO_STATE_CHECKING:
+       case VNO_STATE_GC:
+               mutex_exit(&chmp->chm_lock_vnocache);
+               mutex_exit(&chmp->chm_lock_mountfields);
+               panic("CHFS BUG - vc state unchecked,"
+                   " checking or gc (vno #%llu, num #%d)\n",
+                   (unsigned long long)vc->vno, vc->state);
+
+       case VNO_STATE_READING:
+               mutex_exit(&chmp->chm_lock_vnocache);
+               /* XXX why do we need the delay here?! */
+               kpause("chvncrea", true, mstohz(50), NULL);
+
+//             sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+//             KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+               mutex_exit(&chmp->chm_lock_mountfields);
+               return 0;
+       }
+
+       if (vc->state == VNO_STATE_GC) {
+               dbg_gc("vc->state == VNO_STATE_GC\n");
+               mutex_exit(&chmp->chm_lock_vnocache);
+               ret = chfs_gcollect_pristine(chmp, eb, NULL, nref);
+
+//             chfs_vnode_cache_state_set(chmp,
+//                 vc, VNO_STATE_CHECKEDABSENT);
+               /* XXX locking? */
+               vc->state = VNO_STATE_CHECKEDABSENT;
+               //TODO wake_up(&chmp->chm_vnocache_wq);
+               if (ret != EBADF)
+                       goto test_gcnode;
+               mutex_enter(&chmp->chm_lock_vnocache);
+       }
+
+       vno = vc->vno;
+       pvno = vc->pvno;
+       nlink = vc->nlink;
+       mutex_exit(&chmp->chm_lock_vnocache);
+
+       ip = chfs_gc_fetch_inode(chmp, vno, !(pvno | nlink));
+
+       if (!ip) {
+               dbg_gc("!ip\n");
+               ret = 0;
+               goto lock_size;
+       }
+
+       chfs_gcollect_live(chmp, eb, nref, ip);
+
+       chfs_gc_release_inode(chmp, ip);
+
+test_gcnode:
+       if (eb->dirty_size == gcblock_dirty &&
+           !CHFS_REF_OBSOLETE(eb->gc_node)) {
+               dbg_gc("ERROR collecting node at %u failed.\n",
+                   CHFS_GET_OFS(eb->gc_node->nref_offset));
+
+               ret = ENOSPC;
+       }
+
+lock_size:
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       mutex_enter(&chmp->chm_lock_sizes);
+eraseit:
+       dbg_gc("eraseit\n");
+
+       if (chmp->chm_gcblock) {
+               dbg_gc("eb used size = %u\n", chmp->chm_gcblock->used_size);
+               dbg_gc("eb free size = %u\n", chmp->chm_gcblock->free_size);
+               dbg_gc("eb dirty size = %u\n", chmp->chm_gcblock->dirty_size);
+               dbg_gc("eb unchecked size = %u\n",
+                   chmp->chm_gcblock->unchecked_size);
+               dbg_gc("eb wasted size = %u\n", chmp->chm_gcblock->wasted_size);
+
+               KASSERT(chmp->chm_gcblock->used_size + chmp->chm_gcblock->free_size +
+                   chmp->chm_gcblock->dirty_size +
+                   chmp->chm_gcblock->unchecked_size +
+                   chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size);
+
+       }
+
+       if (chmp->chm_gcblock && chmp->chm_gcblock->dirty_size +
+           chmp->chm_gcblock->wasted_size == chmp->chm_ebh->eb_size) {
+               dbg_gc("Block at leb #%u completely obsoleted by GC, "
+                   "Moving to erase_pending_queue\n", chmp->chm_gcblock->lnr);
+               TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+                   chmp->chm_gcblock, queue);
+               chmp->chm_gcblock = NULL;
+               chmp->chm_nr_erasable_blocks++;
+               if (!TAILQ_EMPTY(&chmp->chm_erase_pending_queue)) {
+                       ret = chfs_remap_leb(chmp);
+               }
+       }
+
+       mutex_exit(&chmp->chm_lock_sizes);
+       mutex_exit(&chmp->chm_lock_mountfields);
+       dbg_gc("return\n");
+       return ret;
+}
+
+
+int
+chfs_gcollect_pristine(struct chfs_mount *chmp, struct chfs_eraseblock *cheb,
+    struct chfs_vnode_cache *chvc, struct chfs_node_ref *nref)
+{
+       struct chfs_node_ref *newnref;
+       struct chfs_flash_node_hdr *nhdr;
+       struct chfs_flash_vnode *fvnode;
+       struct chfs_flash_dirent_node *fdirent;
+       struct chfs_flash_data_node *fdata;
+       int ret, retries = 0;
+       uint32_t ofs, crc;
+       size_t totlen = chfs_nref_len(chmp, cheb, nref);
+       char *data;
+       struct iovec vec;
+       size_t retlen;
+
+       dbg_gc("gcollect_pristine\n");
+
+       data = kmem_alloc(totlen, KM_SLEEP);
+       if (!data)
+               return ENOMEM;
+
+       ofs = CHFS_GET_OFS(nref->nref_offset);
+
+       ret = chfs_read_leb(chmp, nref->nref_lnr, data, ofs, totlen, &retlen);
+       if (ret) {
+               dbg_gc("reading error\n");
+               kmem_free(data, totlen);
+               return ret;
+       }
+       if (retlen != totlen) {
+               dbg_gc("read size error\n");
+               kmem_free(data, totlen);
+               return EIO;
+       }
+       nhdr = (struct chfs_flash_node_hdr *)data;
+       /* check the header */
+       if (le16toh(nhdr->magic) != CHFS_FS_MAGIC_BITMASK) {
+               dbg_gc("node header magic number error\n");
+               kmem_free(data, totlen);
+               return EBADF;
+       }
+       crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4);
+       if (crc != le32toh(nhdr->hdr_crc)) {
+               dbg_gc("node header crc error\n");
+               kmem_free(data, totlen);
+               return EBADF;
+       }
+
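+       /* Each per-type CRC below is computed over the structure minus
+        * its trailing 4-byte CRC field. */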
+       switch (le16toh(nhdr->type)) {
+       case CHFS_NODETYPE_VNODE:
+               fvnode = (struct chfs_flash_vnode *)data;
+               crc = crc32(0, (uint8_t *)fvnode, sizeof(struct chfs_flash_vnode) - 4);
+               if (crc != le32toh(fvnode->node_crc)) {
+                       dbg_gc("vnode crc error\n");
+                       kmem_free(data, totlen);
+                       return EBADF;
+               }
+               break;
+       case CHFS_NODETYPE_DIRENT:
+               fdirent = (struct chfs_flash_dirent_node *)data;
+               crc = crc32(0, (uint8_t *)fdirent, sizeof(struct chfs_flash_dirent_node) - 4);
+               if (crc != le32toh(fdirent->node_crc)) {
+                       dbg_gc("dirent crc error\n");
+                       kmem_free(data, totlen);
+                       return EBADF;
+               }
+               crc = crc32(0, fdirent->name, fdirent->nsize);
+               if (crc != le32toh(fdirent->name_crc)) {
+                       dbg_gc("dirent name crc error\n");
+                       kmem_free(data, totlen);
+                       return EBADF;
+               }
+               break;
+       case CHFS_NODETYPE_DATA:
+               fdata = (struct chfs_flash_data_node *)data;
+               crc = crc32(0, (uint8_t *)fdata, sizeof(struct chfs_flash_data_node) - 4);
+               if (crc != le32toh(fdata->node_crc)) {
+                       dbg_gc("data node crc error\n");
+                       kmem_free(data, totlen);
+                       return EBADF;
+               }
+               break;
+       default:
+               if (chvc) {
+                       dbg_gc("unknown node type with a vnode cache\n");
+                       kmem_free(data, totlen);
+                       return EBADF;
+               }
+       }
+       /* CRC's OK, write node to its new place */
+retry:
+       ret = chfs_reserve_space_gc(chmp, totlen);
+       if (ret) {
+               kmem_free(data, totlen);
+               return ret;
+       }
+
+       newnref = chfs_alloc_node_ref(chmp->chm_nextblock);
+       if (!newnref) {
+               kmem_free(data, totlen);
+               return ENOMEM;
+       }
+
+       ofs = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+       newnref->nref_offset = ofs;
+
+       vec.iov_base = (void *)data;
+       vec.iov_len = totlen;
+       mutex_enter(&chmp->chm_lock_sizes);
+       ret = chfs_write_wbuf(chmp, &vec, 1, ofs, &retlen);
+
+       if (ret || retlen != totlen) {
+               chfs_err("error while writing out to the media\n");
+               chfs_err("err: %d | size: %zu | retlen : %zu\n",
+                   ret, totlen, retlen);
+
+               chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen);
+               if (retries) {
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       kmem_free(data, totlen);
+                       return EIO;
+               }
+
+               retries++;
+               mutex_exit(&chmp->chm_lock_sizes);
+               goto retry;
+       }
+
+       mutex_exit(&chmp->chm_lock_sizes);
+       //TODO should we set free_size?
+       chfs_mark_node_obsolete(chmp, nref);
+       chfs_add_vnode_ref_to_vc(chmp, chvc, newnref);
+       kmem_free(data, totlen);
+       return 0;
+}
+
+
+int
+chfs_gcollect_live(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, struct chfs_node_ref *nref,
+    struct chfs_inode *ip)
+{
+       struct chfs_node_frag *frag;
+       struct chfs_full_dnode *fn = NULL;
+       int start = 0, end = 0, nrfrags = 0;
+       struct chfs_dirent *fd = NULL;
+       int ret = 0;
+       bool is_dirent;
+
+       dbg_gc("gcollect_live\n");
+
+       if (chmp->chm_gcblock != cheb) {
+               dbg_gc("GC block is no longer gcblock. Restart.\n");
+               goto upnout;
+       }
+
+       if (CHFS_REF_OBSOLETE(nref)) {
+               dbg_gc("node to be GC'd was obsoleted in the meantime.\n");
+               goto upnout;
+       }
+
+       /* It's a vnode? */
+       if (ip->chvc->v == nref) {
+               chfs_gcollect_vnode(chmp, ip);
+               goto upnout;
+       }
+
+       /* find fn */
+       dbg_gc("find full dnode\n");
+       for (frag = frag_first(&ip->fragtree);
+           frag; frag = frag_next(&ip->fragtree, frag)) {
+               if (frag->node && frag->node->nref == nref) {
+                       fn = frag->node;
+                       end = frag->ofs + frag->size;
+                       if (!nrfrags++)
+                               start = frag->ofs;
+                       if (nrfrags == frag->node->frags)
+                               break;
+               }
+       }
+
+       /* It's a pristine node, or dnode (or hole? XXX do we have hole nodes?) */
+       if (fn) {
+               if (CHFS_REF_FLAGS(nref) == CHFS_PRISTINE_NODE_MASK) {
+                       ret = chfs_gcollect_pristine(chmp,
+                           cheb, ip->chvc, nref);
+                       if (!ret) {
+                               frag->node->nref = ip->chvc->v;
+                       }
+                       if (ret != EBADF)
+                               goto upnout;
+               }
+               //ret = chfs_gcollect_hole(chmp, cheb, ip, fn, start, end);
+               ret = chfs_gcollect_dnode(chmp, cheb, ip, fn, start, end);
+               goto upnout;
+       }
+
+
+       /* It's a dirent? */
+       dbg_gc("find full dirent\n");
+       is_dirent = false;
+       TAILQ_FOREACH(fd, &ip->dents, fds) {
+               if (fd->nref == nref) {
+                       is_dirent = true;
+                       break;
+               }
+       }
+
+       if (is_dirent && fd->vno) {
+               ret = chfs_gcollect_dirent(chmp, cheb, ip, fd);
+       } else if (is_dirent) {
+               ret = chfs_gcollect_deletion_dirent(chmp, cheb, ip, fd);
+       } else {
+               dbg_gc("Nref at leb #%u offset 0x%08x wasn't in node list"
+                   " for ino #%llu\n",
+                   nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset),
+                   (unsigned long long)ip->ino);
+               if (CHFS_REF_OBSOLETE(nref)) {
+                       dbg_gc("But it's obsolete so we don't mind"
+                           " too much.\n");
+               }
+       }
+
+upnout:
+       return ret;
+}
+
+int
+chfs_gcollect_vnode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+       int ret;
+       dbg_gc("gcollect_vnode\n");
+
+       ret = chfs_write_flash_vnode(chmp, ip, ALLOC_GC);
+
+       return ret;
+}
+
+int
+chfs_gcollect_dirent(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, struct chfs_inode *parent,
+    struct chfs_dirent *fd)
+{
+       struct vnode *vnode = NULL;
+       struct chfs_inode *ip;
+       struct chfs_node_ref *prev;
+       dbg_gc("gcollect_dirent\n");
+
+       vnode = chfs_vnode_lookup(chmp, fd->vno);
+
+       /* XXX maybe KASSERT or panic on this? */
+       if (vnode == NULL) {
+               return ENOENT;
+       }
+
+       ip = VTOI(vnode);
+
+       prev = parent->chvc->dirents;
+       if (prev == fd->nref) {
+               parent->chvc->dirents = prev->nref_next;
+               dbg_gc("fd nref removed from dirents list\n");
+               prev = NULL;
+       }
+       while (prev) {
+               if (prev->nref_next == fd->nref) {
+                       prev->nref_next = fd->nref->nref_next;
+                       dbg_gc("fd nref removed from dirents list\n");
+                       break;
+               }
+               prev = prev->nref_next;
+       }
+
+       prev = fd->nref;
+       chfs_mark_node_obsolete(chmp, fd->nref);
+       return chfs_write_flash_dirent(chmp,
+           parent, ip, fd, fd->vno, ALLOC_GC);
+}
+
+/* Check dirents that are marked as deleted. */
+int
+chfs_gcollect_deletion_dirent(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, struct chfs_inode *parent,
+    struct chfs_dirent *fd)
+{
+       struct chfs_flash_dirent_node chfdn;
+       struct chfs_node_ref *nref;
+       size_t retlen, name_len, nref_len;
+       uint32_t name_crc;
+
+       int ret;
+
+       struct vnode *vnode = NULL;
+
+       dbg_gc("gcollect_deletion_dirent\n");
+
+       name_len = strlen(fd->name);
+       name_crc = crc32(0, fd->name, name_len);
+
+       nref_len = chfs_nref_len(chmp, cheb, fd->nref);
+
+       vnode = chfs_vnode_lookup(chmp, fd->vno);
+
+       //dbg_gc("ip from vnode\n");
+       //VFS_VGET(chmp->chm_fsmp, fd->vno, &vnode);
+       //ip = VTOI(vnode);
+       //vput(vnode);
+
+       //dbg_gc("mutex enter erase_completion_lock\n");
+
+//     dbg_gc("alloc chfdn\n");
+//     chfdn = kmem_alloc(nref_len, KM_SLEEP);
+//     if (!chfdn)
+//             return ENOMEM;
+
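+       /* Keep the deletion dirent only while an obsolete dirent with the
+        * same name could still be read back from the medium; otherwise it
+        * can simply be dropped. */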
+       for (nref = parent->chvc->dirents;
+            nref != (void*)parent->chvc;
+            nref = nref->nref_next) {
+
+               if (!CHFS_REF_OBSOLETE(nref))
+                       continue;
+
+               /* if node refs have different length, skip */
+               if (chfs_nref_len(chmp, NULL, nref) != nref_len)
+                       continue;
+
+               if (CHFS_GET_OFS(nref->nref_offset) ==
+                   CHFS_GET_OFS(fd->nref->nref_offset)) {
+                       continue;
+               }
+
+               ret = chfs_read_leb(chmp,
+                   nref->nref_lnr, (void*)&chfdn, CHFS_GET_OFS(nref->nref_offset),
+                   nref_len, &retlen);
+
+               if (ret) {
+                       dbg_gc("Read error: %d\n", ret);
+                       continue;
+               }
+
+               if (retlen != nref_len) {
+                       dbg_gc("Error reading node:"
+                           " read: %zu instead of: %zu\n", retlen, nref_len);
+                       continue;
+               }
+
+               /* if node type doesn't match, skip */
+               if (le16toh(chfdn.type) != CHFS_NODETYPE_DIRENT)
+                       continue;
+
+               /* if crc doesn't match, skip */
+               if (le32toh(chfdn.name_crc) != name_crc)
+                       continue;
+
+               /* if the name length differs, or this is another deletion
+                * dirent, skip
+                */
+               if (chfdn.nsize != name_len || !le64toh(chfdn.vno))
+                       continue;
+
+               /* check actual name */
+               if (memcmp(chfdn.name, fd->name, name_len))
+                       continue;
+
+//             kmem_free(chfdn, nref_len);
+
+               chfs_mark_node_obsolete(chmp, fd->nref);
+               return chfs_write_flash_dirent(chmp,
+                   parent, NULL, fd, fd->vno, ALLOC_GC);
+       }
+
+//     kmem_free(chfdn, nref_len);
+
+       TAILQ_REMOVE(&parent->dents, fd, fds);
+       chfs_free_dirent(fd);
+       return 0;
+}
+
+int
+chfs_gcollect_dnode(struct chfs_mount *chmp,
+    struct chfs_eraseblock *orig_cheb, struct chfs_inode *ip,
+    struct chfs_full_dnode *fn, uint32_t orig_start, uint32_t orig_end)
+{
+       struct chfs_node_ref *nref, *prev;
+       struct chfs_full_dnode *newfn;
+       struct chfs_flash_data_node *fdnode;
+       int ret = 0, retries = 0;
+       uint32_t totlen;
+       char *data = NULL;
+       struct iovec vec;
+       size_t retlen;
+       dbg_gc("gcollect_dnode\n");
+
+       //uint32_t used_size;
+
+/* TODO GC merging frags, should we use it?
+
+   uint32_t start, end;
+
+   start = orig_start;
+   end = orig_end;
+
+   if (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks > chmp->chm_resv_blocks_gcmerge) {
+   struct chfs_node_frag *frag;
+   uint32_t min, max;
+
+   min = start & (PAGE_CACHE_SIZE-1);
+   max = min + PAGE_CACHE_SIZE;
+
+   frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &start);
+   KASSERT(frag->ofs == start);
+
+   while ((frag = frag_prev(&ip->i_chfs_ext.fragtree, frag)) && frag->ofs >= min) {
+   if (frag->ofs > min) {
+   start = frag->ofs;
+   continue;
+   }
+
+   if (!frag->node || !frag->node->nref) {
+   break;
+   } else {
+   struct chfs_node_ref *nref = frag->node->nref;
+   struct chfs_eraseblock *cheb;
+
+   cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+   if (cheb == chmp->chm_gcblock)
+   start = frag->ofs;
+
+   //TODO is this a clean block?
+
+   start = frag->ofs;
+   break;
+   }
+   }
+
+   end--;
+   frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->i_chfs_ext.fragtree, &(end));
+
+   while ((frag = frag_next(&ip->i_chfs_ext.fragtree, frag)) && (frag->ofs + frag->size <= max)) {
+   if (frag->ofs + frag->size < max) {
+   end = frag->ofs + frag->size;
+   continue;
+   }
+
+   if (!frag->node || !frag->node->nref) {
+   break;
+   } else {
+   struct chfs_node_ref *nref = frag->node->nref;
+   struct chfs_eraseblock *cheb;
+
+   cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+   if (cheb == chmp->chm_gcblock)
+   end = frag->ofs + frag->size;
+
+   //TODO is this a clean block?
+
+   end = frag->ofs + frag->size;
+   break;
+   }
+   }
+
+   KASSERT(end <=
+   frag_last(&ip->i_chfs_ext.fragtree)->ofs +
+   frag_last(&ip->i_chfs_ext.fragtree)->size);
+   KASSERT(end >= orig_end);
+   KASSERT(start <= orig_start);
+   }
+*/
+       KASSERT(orig_cheb->lnr == fn->nref->nref_lnr);
+       totlen = chfs_nref_len(chmp, orig_cheb, fn->nref);
+       data = kmem_alloc(totlen, KM_SLEEP);
+
+       ret = chfs_read_leb(chmp, fn->nref->nref_lnr, data, fn->nref->nref_offset,
+           totlen, &retlen);
+
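+       /* Re-stamp the node with a fresh version number and recompute the
+        * node CRC before writing it to its new place. */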
+       fdnode = (struct chfs_flash_data_node *)data;
+       fdnode->version = htole64(++ip->chvc->highest_version);
+       fdnode->node_crc = htole32(crc32(0, (uint8_t *)fdnode,
+               sizeof(*fdnode) - 4));
+
+       vec.iov_base = (void *)data;
+       vec.iov_len = totlen;
+
+retry:
+       ret = chfs_reserve_space_gc(chmp, totlen);
+       if (ret)
+               goto out;
+
+       nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+       if (!nref) {
+               ret = ENOMEM;
+               goto out;
+       }
+
+       mutex_enter(&chmp->chm_lock_sizes);
+
+       nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+       KASSERT(nref->nref_offset % 4 == 0);
+       chfs_change_size_free(chmp, chmp->chm_nextblock, -totlen);
+
+       ret = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen);
+       if (ret || retlen != totlen) {
+               chfs_err("error while writing out to the media\n");
+               chfs_err("err: %d | size: %d | retlen : %zu\n",
+                   ret, totlen, retlen);
+               chfs_change_size_dirty(chmp, chmp->chm_nextblock, totlen);
+               if (retries) {
+                       ret = EIO;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               retries++;
+               mutex_exit(&chmp->chm_lock_sizes);
+               goto retry;
+       }
+
+       dbg_gc("new nref lnr: %u - offset: %u\n", nref->nref_lnr, nref->nref_offset);
+
+       chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen);
+       mutex_exit(&chmp->chm_lock_sizes);
+       KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+
+       newfn = chfs_alloc_full_dnode();
+       newfn->nref = nref;
+       newfn->ofs = fn->ofs;
+       newfn->size = fn->size;
+       newfn->frags = fn->frags;
+
+       //TODO should we remove fd from dnode list?
+
+       prev = ip->chvc->dnode;
+       if (prev == fn->nref) {
+               ip->chvc->dnode = prev->nref_next;
+               prev = NULL;
+       }
+       while (prev) {
+               if (prev->nref_next == fn->nref) {
+                       prev->nref_next = fn->nref->nref_next;
+                       break;
+               }
+               prev = prev->nref_next;
+       }
+
+       chfs_add_full_dnode_to_inode(chmp, ip, newfn);
+       chfs_add_node_to_list(chmp,
+           ip->chvc, newfn->nref, &ip->chvc->dnode);
+
+out:
+       kmem_free(data, totlen);
+       return ret;
+}
diff --git a/sys/ufs/chfs/chfs_ihash.c b/sys/ufs/chfs/chfs_ihash.c
new file mode 100644 (file)
index 0000000..b16b00c
--- /dev/null
@@ -0,0 +1,220 @@
+/*     $NetBSD: chfs_ihash.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $     */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+/*
+ * Structures associated with inode caching.
+ */
+static LIST_HEAD(ihashhead, chfs_inode) *chfs_ihashtbl;
+static u_long  chfs_ihash;             /* size of hash table - 1 */
+#define INOHASH(device, inum)  (((device) + (inum)) & chfs_ihash)
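+/*
+ * Example: with a 256-chain table, chfs_ihash is 0xff and
+ * INOHASH(dev, inum) simply folds (dev + inum) into one of the
+ * 256 hash chains.
+ */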
+
+kmutex_t       chfs_ihash_lock;
+kmutex_t       chfs_hashlock;
+
+/*
+ * Initialize inode hash table.
+ */
+void
+chfs_ihashinit(void)
+{
+       dbg("initing\n");
+
+       mutex_init(&chfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&chfs_ihash_lock, MUTEX_DEFAULT, IPL_NONE);
+       chfs_ihashtbl = hashinit(desiredvnodes,
+           HASH_LIST, true, &chfs_ihash);
+}
+
+/*
+ * Reinitialize inode hash table.
+ */
+
+void
+chfs_ihashreinit(void)
+{
+       struct chfs_inode *ip;
+       struct ihashhead *oldhash, *hash;
+       u_long oldmask, mask, val;
+       int i;
+
+       dbg("reiniting\n");
+
+       hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+       mutex_enter(&chfs_ihash_lock);
+       oldhash = chfs_ihashtbl;
+       oldmask = chfs_ihash;
+       chfs_ihashtbl = hash;
+       chfs_ihash = mask;
+       for (i = 0; i <= oldmask; i++) {
+               while ((ip = LIST_FIRST(&oldhash[i])) != NULL) {
+                       LIST_REMOVE(ip, hash_entry);
+                       val = INOHASH(ip->dev, ip->ino);
+                       LIST_INSERT_HEAD(&hash[val], ip, hash_entry);
+               }
+       }
+       mutex_exit(&chfs_ihash_lock);
+       hashdone(oldhash, HASH_LIST, oldmask);
+}
+
+/*
+ * Free inode hash table.
+ */
+void
+chfs_ihashdone(void)
+{
+       dbg("destroying\n");
+
+       hashdone(chfs_ihashtbl, HASH_LIST, chfs_ihash);
+       mutex_destroy(&chfs_hashlock);
+       mutex_destroy(&chfs_ihash_lock);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, return it, even if it is locked.
+ */
+struct vnode *
+chfs_ihashlookup(dev_t dev, ino_t inum)
+{
+       struct chfs_inode *ip;
+       struct ihashhead *ipp;
+
+       dbg("dev: %ju, inum: %ju\n", (uintmax_t)dev, (uintmax_t)inum);
+
+       KASSERT(mutex_owned(&chfs_ihash_lock));
+
+       ipp = &chfs_ihashtbl[INOHASH(dev, inum)];
+       LIST_FOREACH(ip, ipp, hash_entry) {
+               if (inum == ip->ino && dev == ip->dev) {
+                       break;
+               }
+       }
+
+       if (ip) {
+               return (ITOV(ip));
+       }
+
+       return (NULLVP);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, but locked, wait for it.
+ */
+struct vnode *
+chfs_ihashget(dev_t dev, ino_t inum, int flags)
+{
+       struct ihashhead *ipp;
+       struct chfs_inode *ip;
+       struct vnode *vp;
+
+       dbg("search for ino\n");
+
+loop:
+       mutex_enter(&chfs_ihash_lock);
+       ipp = &chfs_ihashtbl[INOHASH(dev, inum)];
+       dbg("ipp: %p, chfs_ihashtbl: %p, ihash: %lu\n",
+           ipp, chfs_ihashtbl, chfs_ihash);
+       LIST_FOREACH(ip, ipp, hash_entry) {
+               dbg("ip: %p\n", ip);
+               if (inum == ip->ino && dev == ip->dev) {
+//                     printf("chfs_ihashget: found inode: %p\n", ip);
+                       vp = ITOV(ip);
+                       KASSERT(vp != NULL);
+                       //dbg("found\n");
+                       if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
+                               //dbg("wait for #%llu\n", ip->ino);
+                               mutex_exit(&chfs_ihash_lock);
+                               goto loop;
+                       }
+                       /*
+                       if (VOP_ISLOCKED(vp))
+                               dbg("locked\n");
+                       else
+                               dbg("isn't locked\n");
+                       */
+                       if (flags == 0) {
+                               //dbg("no flags\n");
+                               mutex_exit(&chfs_ihash_lock);
+                       } else {
+                               //dbg("vget\n");
+                               mutex_enter(vp->v_interlock);
+                               mutex_exit(&chfs_ihash_lock);
+                               if (vget(vp, flags)) {
+                                       goto loop;
+                               }
+                               //dbg("got it\n");
+                       }
+                       //dbg("return\n");
+                       return (vp);
+               }
+       }
+       //dbg("not found\n");
+       mutex_exit(&chfs_ihash_lock);
+       return (NULL);
+}
+
+/*
+ * Insert the inode into the hash table, and return it locked.
+ */
+void
+chfs_ihashins(struct chfs_inode *ip)
+{
+       struct ihashhead *ipp;
+
+       dbg("ip: %p\n", ip);
+
+       KASSERT(mutex_owned(&chfs_hashlock));
+
+       /* lock the inode, then put it on the appropriate hash list */
+       VOP_LOCK(ITOV(ip), LK_EXCLUSIVE);
+
+       mutex_enter(&chfs_ihash_lock);
+       ipp = &chfs_ihashtbl[INOHASH(ip->dev, ip->ino)];
+       LIST_INSERT_HEAD(ipp, ip, hash_entry);
+       mutex_exit(&chfs_ihash_lock);
+}
+
+/*
+ * Remove the inode from the hash table.
+ */
+void
+chfs_ihashrem(struct chfs_inode *ip)
+{
+       dbg("ip: %p\n", ip);
+
+       mutex_enter(&chfs_ihash_lock);
+       LIST_REMOVE(ip, hash_entry);
+       mutex_exit(&chfs_ihash_lock);
+}
+
diff --git a/sys/ufs/chfs/chfs_malloc.c b/sys/ufs/chfs/chfs_malloc.c
new file mode 100644 (file)
index 0000000..3138acc
--- /dev/null
@@ -0,0 +1,396 @@
+/*     $NetBSD: chfs_malloc.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $    */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include <sys/pool.h>
+
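+/* One pool cache per frequently allocated CHFS object type. */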
+pool_cache_t chfs_vnode_cache;
+pool_cache_t chfs_nrefs_cache;
+pool_cache_t chfs_flash_vnode_cache;
+pool_cache_t chfs_flash_dirent_cache;
+pool_cache_t chfs_flash_dnode_cache;
+pool_cache_t chfs_node_frag_cache;
+pool_cache_t chfs_tmp_dnode_cache;
+pool_cache_t chfs_tmp_dnode_info_cache;
+
+int
+chfs_alloc_pool_caches(void)
+{
+       chfs_vnode_cache = pool_cache_init(
+               sizeof(struct chfs_vnode_cache),
+               0, 0, 0, "chfs_vnode_cache", NULL, IPL_NONE, NULL, NULL,
+               NULL);
+       if (!chfs_vnode_cache)
+               goto err_vnode;
+
+       chfs_nrefs_cache = pool_cache_init(
+               (REFS_BLOCK_LEN + 1) * sizeof(struct chfs_node_ref), 0, 0,
+               0, "chfs_nrefs_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_nrefs_cache)
+               goto err_nrefs;
+
+       chfs_flash_vnode_cache = pool_cache_init(
+               sizeof(struct chfs_flash_vnode), 0, 0, 0,
+               "chfs_flash_vnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_flash_vnode_cache)
+               goto err_flash_vnode;
+
+       chfs_flash_dirent_cache = pool_cache_init(
+               sizeof(struct chfs_flash_dirent_node), 0, 0, 0,
+               "chfs_flash_dirent_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_flash_dirent_cache)
+               goto err_flash_dirent;
+
+       chfs_flash_dnode_cache = pool_cache_init(
+               sizeof(struct chfs_flash_data_node), 0, 0, 0,
+               "chfs_flash_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_flash_dnode_cache)
+               goto err_flash_dnode;
+
+       chfs_node_frag_cache = pool_cache_init(
+               sizeof(struct chfs_node_frag), 0, 0, 0,
+               "chfs_node_frag_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_node_frag_cache)
+               goto err_node_frag;
+
+       chfs_tmp_dnode_cache = pool_cache_init(
+               sizeof(struct chfs_tmp_dnode), 0, 0, 0,
+               "chfs_tmp_dnode_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_tmp_dnode_cache)
+               goto err_tmp_dnode;
+
+       chfs_tmp_dnode_info_cache = pool_cache_init(
+               sizeof(struct chfs_tmp_dnode_info), 0, 0, 0,
+               "chfs_tmp_dnode_info_pool", NULL, IPL_NONE, NULL, NULL, NULL);
+       if (!chfs_tmp_dnode_info_cache)
+               goto err_tmp_dnode_info;
+
+       return 0;
+
+err_tmp_dnode_info:
+       pool_cache_destroy(chfs_tmp_dnode_cache);
+err_tmp_dnode:
+       pool_cache_destroy(chfs_node_frag_cache);
+err_node_frag:
+       pool_cache_destroy(chfs_flash_dnode_cache);
+err_flash_dnode:
+       pool_cache_destroy(chfs_flash_dirent_cache);
+err_flash_dirent:
+       pool_cache_destroy(chfs_flash_vnode_cache);
+err_flash_vnode:
+       pool_cache_destroy(chfs_nrefs_cache);
+err_nrefs:
+       pool_cache_destroy(chfs_vnode_cache);
+err_vnode:
+
+       return ENOMEM;
+}
+
+void
+chfs_destroy_pool_caches(void)
+{
+       if (chfs_vnode_cache)
+               pool_cache_destroy(chfs_vnode_cache);
+
+       if (chfs_nrefs_cache)
+               pool_cache_destroy(chfs_nrefs_cache);
+
+       if (chfs_flash_vnode_cache)
+               pool_cache_destroy(chfs_flash_vnode_cache);
+
+       if (chfs_flash_dirent_cache)
+               pool_cache_destroy(chfs_flash_dirent_cache);
+
+       if (chfs_flash_dnode_cache)
+               pool_cache_destroy(chfs_flash_dnode_cache);
+
+       if (chfs_node_frag_cache)
+               pool_cache_destroy(chfs_node_frag_cache);
+
+       if (chfs_tmp_dnode_cache)
+               pool_cache_destroy(chfs_tmp_dnode_cache);
+
+       if (chfs_tmp_dnode_info_cache)
+               pool_cache_destroy(chfs_tmp_dnode_info_cache);
+}
+
+struct chfs_vnode_cache *
+chfs_vnode_cache_alloc(ino_t vno)
+{
+       struct chfs_vnode_cache* vc;
+       vc = pool_cache_get(chfs_vnode_cache, PR_WAITOK);
+
+       memset(vc, 0, sizeof(*vc));
+       vc->vno = vno;
+       vc->v = (void *)vc;
+       vc->dirents = (void *)vc;
+       vc->dnode = (void *)vc;
+       TAILQ_INIT(&vc->scan_dirents);
+       vc->highest_version = 0;
+
+       return vc;
+}
+
+void
+chfs_vnode_cache_free(struct chfs_vnode_cache *vc)
+{
+       //kmem_free(vc->vno_version, sizeof(uint64_t));
+       pool_cache_put(chfs_vnode_cache, vc);
+}
+
+/**
+ * chfs_alloc_refblock - allocate a refblock
+ *
+ * Returns a pointer to the first element of the block.
+ *
+ * We do not allocate just one node ref; instead we allocate REFS_BLOCK_LEN
+ * node refs at once, with the last element serving as a pointer to the
+ * next block.  We do this because we need a chain of nodes ordered by
+ * their physical address.
+ *
+ */
+struct chfs_node_ref*
+chfs_alloc_refblock(void)
+{
+       int i;
+       struct chfs_node_ref *nref;
+       nref = pool_cache_get(chfs_nrefs_cache, PR_WAITOK);
+
+       for (i = 0; i < REFS_BLOCK_LEN; i++) {
+               nref[i].nref_lnr = REF_EMPTY_NODE;
+               nref[i].nref_next = NULL;
+       }
+       i = REFS_BLOCK_LEN;
+       nref[i].nref_lnr = REF_LINK_TO_NEXT;
+       nref[i].nref_next = NULL;
+
+       return nref;
+}
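+
+/*
+ * A minimal traversal sketch for such a chain (compare
+ * chfs_free_node_refs() below): entries whose nref_lnr is
+ * REF_LINK_TO_NEXT are chain links rather than real node refs,
+ * and visit() stands in for whatever the caller does per ref:
+ *
+ *     for (nref = cheb->first_node; nref != NULL;) {
+ *             if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+ *                     nref = nref->nref_next;
+ *                     continue;
+ *             }
+ *             if (nref->nref_lnr != REF_EMPTY_NODE)
+ *                     visit(nref);
+ *             nref++;
+ *     }
+ */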
+
+/**
+ * chfs_free_refblock - free a refblock
+ */
+void
+chfs_free_refblock(struct chfs_node_ref *nref)
+{
+       pool_cache_put(chfs_nrefs_cache, nref);
+}
+
+/**
+ * chfs_alloc_node_ref - allocate a node ref from a refblock
+ * @cheb: eraseblock information structure
+ *
+ * Allocates a node ref from a refblock; if there isn't any free element
+ * in the block, a new block is allocated and linked to the current one.
+ */
+struct chfs_node_ref*
+chfs_alloc_node_ref(struct chfs_eraseblock *cheb)
+{
+       struct chfs_node_ref *nref, *new, *old;
+       old = cheb->last_node;
+       nref = cheb->last_node;
+
+       if (!nref) {
+               //No nref has been allocated for this block yet
+               nref = chfs_alloc_refblock();
+
+               cheb->first_node = nref;
+               cheb->last_node = nref;
+               nref->nref_lnr = cheb->lnr;
+               KASSERT(cheb->lnr == nref->nref_lnr);
+
+               return nref;
+       }
+
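+       /* Advance to the next free slot; if it is the chain-link slot,
+        * allocate a fresh refblock and continue there. */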
+       nref++;
+       if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+               new = chfs_alloc_refblock();
+               nref->nref_next = new;
+               nref = new;
+       }
+
+       cheb->last_node = nref;
+       nref->nref_lnr = cheb->lnr;
+
+       KASSERT(old->nref_lnr == nref->nref_lnr &&
+           nref->nref_lnr == cheb->lnr);
+
+       return nref;
+}
+
+/**
+ * chfs_free_node_refs - free an eraseblock's node refs
+ * @cheb: eraseblock information structure
+ */
+void
+chfs_free_node_refs(struct chfs_eraseblock *cheb)
+{
+       struct chfs_node_ref *nref, *block;
+
+       block = nref = cheb->first_node;
+
+       while (nref) {
+               if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+                       nref = nref->nref_next;
+                       chfs_free_refblock(block);
+                       block = nref;
+                       continue;
+               }
+               nref++;
+       }
+}
+
+struct chfs_dirent*
+chfs_alloc_dirent(int namesize)
+{
+       struct chfs_dirent *ret;
+       size_t size = sizeof(struct chfs_dirent) + namesize;
+
+       ret = kmem_alloc(size, KM_SLEEP);
+       //ret->alloc_size = size;
+
+       return ret;
+}
+
+void
+chfs_free_dirent(struct chfs_dirent *dirent)
+{
+       //size_t size = dirent->alloc_size;
+       size_t size = sizeof(struct chfs_dirent) + dirent->nsize + 1;
+
+       kmem_free(dirent, size);
+}
+
+struct chfs_full_dnode*
+chfs_alloc_full_dnode(void)
+{
+       struct chfs_full_dnode *ret;
+       ret = kmem_alloc(sizeof(struct chfs_full_dnode), KM_SLEEP);
+       return ret;
+}
+
+void
+chfs_free_full_dnode(struct chfs_full_dnode *fd)
+{
+       kmem_free(fd,(sizeof(struct chfs_full_dnode)));
+}
+
+struct chfs_flash_vnode*
+chfs_alloc_flash_vnode(void)
+{
+       struct chfs_flash_vnode *ret;
+       ret = pool_cache_get(chfs_flash_vnode_cache, 0);
+       return ret;
+}
+
+void
+chfs_free_flash_vnode(struct chfs_flash_vnode *fvnode)
+{
+       pool_cache_put(chfs_flash_vnode_cache, fvnode);
+}
+
+struct chfs_flash_dirent_node*
+chfs_alloc_flash_dirent(void)
+{
+       struct chfs_flash_dirent_node *ret;
+       ret = pool_cache_get(chfs_flash_dirent_cache, 0);
+       return ret;
+}
+
+void
+chfs_free_flash_dirent(struct chfs_flash_dirent_node *fdnode)
+{
+       pool_cache_put(chfs_flash_dirent_cache, fdnode);
+}
+
+struct chfs_flash_data_node*
+chfs_alloc_flash_dnode(void)
+{
+       struct chfs_flash_data_node *ret;
+       ret = pool_cache_get(chfs_flash_dnode_cache, 0);
+       return ret;
+}
+
+void
+chfs_free_flash_dnode(struct chfs_flash_data_node *fdnode)
+{
+       pool_cache_put(chfs_flash_dnode_cache, fdnode);
+}
+
+
+struct chfs_node_frag*
+chfs_alloc_node_frag(void)
+{
+       struct chfs_node_frag *ret;
+       ret = pool_cache_get(chfs_node_frag_cache, 0);
+       return ret;
+}
+
+void
+chfs_free_node_frag(struct chfs_node_frag *frag)
+{
+       pool_cache_put(chfs_node_frag_cache, frag);
+}
+
+struct chfs_tmp_dnode *
+chfs_alloc_tmp_dnode(void)
+{
+       struct chfs_tmp_dnode *ret;
+       ret = pool_cache_get(chfs_tmp_dnode_cache, 0);
+       ret->next = NULL;
+       return ret;
+}
+
+void
+chfs_free_tmp_dnode(struct chfs_tmp_dnode *td)
+{
+       pool_cache_put(chfs_tmp_dnode_cache, td);
+}
+
+struct chfs_tmp_dnode_info *
+chfs_alloc_tmp_dnode_info(void)
+{
+       struct chfs_tmp_dnode_info *ret;
+       ret = pool_cache_get(chfs_tmp_dnode_info_cache, 0);
+       ret->tmpnode = NULL;
+       return ret;
+}
+
+void
+chfs_free_tmp_dnode_info(struct chfs_tmp_dnode_info *di)
+{
+       pool_cache_put(chfs_tmp_dnode_info_cache, di);
+}
+
diff --git a/sys/ufs/chfs/chfs_nodeops.c b/sys/ufs/chfs/chfs_nodeops.c
new file mode 100644 (file)
index 0000000..bf761dd
--- /dev/null
@@ -0,0 +1,570 @@
+/*     $NetBSD: chfs_nodeops.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $   */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+
+/**
+ * chfs_update_eb_dirty - converts free space to dirty space on an
+ *                           eraseblock
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock to update
+ * @size: number of bytes to move from free to dirty
+ * Returns zero on success, 1 on failure.
+ */
+int
+chfs_update_eb_dirty(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, uint32_t size)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+       if (!size)
+               return 0;
+
+       if (size > cheb->free_size) {
+               chfs_err("free_size (%d) is less then dirty space (%d) "
+                   "on block (%d)\n", cheb->free_size, size, cheb->lnr);
+               return 1;
+       }
+       mutex_enter(&chmp->chm_lock_sizes);
+       //dbg("BEFORE: free_size: %d\n", cheb->free_size);
+       chfs_change_size_free(chmp, cheb, -size);
+       chfs_change_size_dirty(chmp, cheb, size);
+       //dbg(" AFTER: free_size: %d\n", cheb->free_size);
+       mutex_exit(&chmp->chm_lock_sizes);
+       return 0;
+}
+
+/**
+ * chfs_add_node_to_list - adds a data node ref to a vnode cache's dnode list
+ * @chmp: CHFS main descriptor structure
+ * @vc: vnode cache to add the node to
+ * @new: node ref to insert
+ * @list: head of the list
+ * This function inserts a data node ref into the list of the vnode cache.
+ * The list is kept sorted by the data node's lnr and offset.
+ */
+void
+chfs_add_node_to_list(struct chfs_mount *chmp,
+    struct chfs_vnode_cache *vc,
+    struct chfs_node_ref *new, struct chfs_node_ref **list)
+{
+       struct chfs_node_ref *nextref = *list;
+       struct chfs_node_ref *prevref = NULL;
+
+       while (nextref && nextref != (struct chfs_node_ref *)vc &&
+           (nextref->nref_lnr <= new->nref_lnr)) {
+               if (nextref->nref_lnr == new->nref_lnr) {
+                       while (nextref && nextref !=
+                           (struct chfs_node_ref *)vc &&
+                           (CHFS_GET_OFS(nextref->nref_offset) <
+                               CHFS_GET_OFS(new->nref_offset))) {
+                               prevref = nextref;
+                               nextref = nextref->nref_next;
+                       }
+                       break;
+               }
+               prevref = nextref;
+               nextref = nextref->nref_next;
+       }
+
+       if (nextref && nextref != (struct chfs_node_ref *)vc &&
+           nextref->nref_lnr == new->nref_lnr &&
+           CHFS_GET_OFS(nextref->nref_offset) ==
+           CHFS_GET_OFS(new->nref_offset)) {
+               new->nref_next = nextref->nref_next;
+       } else {
+               new->nref_next = nextref;
+       }
+
+       if (prevref) {
+               prevref->nref_next = new;
+       } else {
+               *list = new;
+       }
+}
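+
+/*
+ * A hedged usage sketch for chfs_add_node_to_list() (the call site is not
+ * part of this change): after a data node has been written to flash, its
+ * ref would be linked into the per-vnode data node list with something like
+ *
+ *     chfs_add_node_to_list(chmp, ip->chvc, nref, &ip->chvc->dnode);
+ *
+ * which keeps the list ordered by (lnr, offset) as described above.
+ */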
+
+void
+chfs_add_fd_to_inode(struct chfs_mount *chmp,
+    struct chfs_inode *parent, struct chfs_dirent *new)
+{
+       struct chfs_dirent *fd, *tmpfd;
+
+       if (new->version > parent->chvc->highest_version) {
+               parent->chvc->highest_version = new->version;
+       }
+
+       //mutex_enter(&parent->inode_lock);
+       TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) {
+               if (fd->nhash > new->nhash) {
+                       /* insert new before fd */
+                       TAILQ_INSERT_BEFORE(fd, new, fds);
+                       return;
+               } else if (fd->nhash == new->nhash &&
+                   !strcmp(fd->name, new->name)) {
+                       if (new->version > fd->version) {
+                               /* replace fd with new */
+                               TAILQ_INSERT_BEFORE(fd, new, fds);
+                               TAILQ_REMOVE(&parent->dents, fd, fds);
+                               if (fd->nref) {
+                                       chfs_mark_node_obsolete(chmp,
+                                           fd->nref);
+                               }
+                               chfs_free_dirent(fd);
+                       } else {
+                               chfs_mark_node_obsolete(chmp, new->nref);
+                               chfs_free_dirent(new);
+                       }
+                       return;
+               }
+       }
+       /* If we couldn't fit it elsewhere, add it to the head of the list. */
+       /* FIXME insert tail or insert head? */
+       TAILQ_INSERT_HEAD(&parent->dents, new, fds);
+       //mutex_exit(&parent->inode_lock);
+}
+
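+/*
+ * chfs_add_vnode_ref_to_vc - replaces the vnode information node of a
+ * vnode cache with a new ref. The vc->v chain is circular through the
+ * cache itself, so if vc->v does not point back at the cache there is a
+ * previous vnode node, which becomes obsolete now.
+ */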
+void
+chfs_add_vnode_ref_to_vc(struct chfs_mount *chmp,
+    struct chfs_vnode_cache *vc, struct chfs_node_ref *new)
+{
+       if ((struct chfs_vnode_cache*)(vc->v) != vc) {
+               chfs_mark_node_obsolete(chmp, vc->v);
+               new->nref_next = vc->v->nref_next;
+       } else {
+               new->nref_next = vc->v;
+       }
+       vc->v = new;
+}
+
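+/*
+ * Node refs are allocated in arrays ("refblocks"); within a block the
+ * next ref is simply the next array slot. A slot whose nref_lnr is
+ * REF_LINK_TO_NEXT links to the next refblock, and one set to
+ * REF_EMPTY_NODE marks the end of the chain.
+ */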
+struct chfs_node_ref *
+chfs_nref_next(struct chfs_node_ref *nref)
+{
+//     dbg("check nref: %u - %u\n", nref->nref_lnr, nref->nref_offset);
+       nref++;
+//     dbg("next nref: %u - %u\n", nref->nref_lnr, nref->nref_offset);
+       if (nref->nref_lnr == REF_LINK_TO_NEXT) {
+               //End of chain
+               if (!nref->nref_next)
+                       return NULL;
+
+               nref = nref->nref_next;
+       }
+       //end of chain
+       if (nref->nref_lnr == REF_EMPTY_NODE)
+               return NULL;
+
+       return nref;
+}
+
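+/*
+ * chfs_nref_len - calculates the length of a node: the distance to the
+ * next ref's offset within the eraseblock, or, for the last node, the
+ * distance to the end of the block's used space.
+ */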
+int
+chfs_nref_len(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, struct chfs_node_ref *nref)
+{
+       struct chfs_node_ref *next;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       if (!cheb)
+               cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+       next = chfs_nref_next(nref);
+
+       if (!next) {
+               //dbg("next null\n");
+               return chmp->chm_ebh->eb_size - cheb->free_size -
+                   CHFS_GET_OFS(nref->nref_offset);
+       }
+       //dbg("size: %d\n", CHFS_GET_OFS(next->nref_offset) - CHFS_GET_OFS(nref->nref_offset));
+       return CHFS_GET_OFS(next->nref_offset) -
+           CHFS_GET_OFS(nref->nref_offset);
+}
+
+/**
+ * chfs_mark_node_obsolete - marks a node obsolete
+ * @chmp: CHFS main descriptor structure
+ * @nref: node ref to mark
+ * Moves the node's space from used (or unchecked) to dirty and, unless a
+ * scan is in progress, requeues its eraseblock according to how dirty the
+ * block has become.
+ */
+void
+chfs_mark_node_obsolete(struct chfs_mount *chmp,
+    struct chfs_node_ref *nref)
+{
+       int len;
+       struct chfs_eraseblock *cheb;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       KASSERT(!CHFS_REF_OBSOLETE(nref));
+
+       KASSERT(nref->nref_lnr <= chmp->chm_ebh->peb_nr);
+       cheb = &chmp->chm_blocks[nref->nref_lnr];
+
+#ifdef DIAGNOSTIC
+       if (cheb->used_size + cheb->free_size + cheb->dirty_size +
+           cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) {
+               dbg("eraseblock leak detected!\nused: %u\nfree: %u\n"
+                   "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n",
+                   cheb->used_size, cheb->free_size, cheb->dirty_size,
+                   cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size +
+                   cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size,
+                   chmp->chm_ebh->eb_size);
+       }
+#endif
+
+       len = chfs_nref_len(chmp, cheb, nref);
+       //dbg("len: %u\n", len);
+       //dbg("1. used: %u\n", cheb->used_size);
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       
+       if (CHFS_REF_FLAGS(nref) == CHFS_UNCHECKED_NODE_MASK) {
+               //dbg("UNCHECKED mark an unchecked node\n");
+               chfs_change_size_unchecked(chmp, cheb, -len);
+               //dbg("unchecked: %u\n", chmp->chm_unchecked_size);
+       } else {
+               chfs_change_size_used(chmp, cheb, -len);
+
+               //dbg("2. used: %u\n", cheb->used_size);
+               KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+       }
+       chfs_change_size_dirty(chmp, cheb, len);
+
+#ifdef DIAGNOSTIC
+       if (cheb->used_size + cheb->free_size + cheb->dirty_size +
+           cheb->unchecked_size + cheb->wasted_size != chmp->chm_ebh->eb_size) {
+               panic("eraseblock leak detected!\nused: %u\nfree: %u\n"
+                   "dirty: %u\nunchecked: %u\nwasted: %u\ntotal: %u\nshould be: %zu\n",
+                   cheb->used_size, cheb->free_size, cheb->dirty_size,
+                   cheb->unchecked_size, cheb->wasted_size, cheb->used_size + cheb->free_size +
+                   cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size,
+                   chmp->chm_ebh->eb_size);
+       }
+#endif
+       nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+           CHFS_OBSOLETE_NODE_MASK;
+
+       if (chmp->chm_flags & CHFS_MP_FLAG_SCANNING) {
+               /* Scan is in progress, do nothing now. */
+               mutex_exit(&chmp->chm_lock_sizes);
+               return;
+       }
+
+       if (cheb == chmp->chm_nextblock) {
+               dbg("Not moving nextblock to dirty/erase_pending list\n");
+       } else if (!cheb->used_size && !cheb->unchecked_size) {
+               if (cheb == chmp->chm_gcblock) {
+                       dbg("gcblock is completely dirtied\n");
+                       chmp->chm_gcblock = NULL;
+               } else {
+                       /*
+                        * The block must be removed from whichever queue it
+                        * currently sits on, but we don't know which one that
+                        * is, so search the free, dirty, very dirty and clean
+                        * queues in turn.
+                        */
+                       int removed = 0;
+                       struct chfs_eraseblock *eb, *tmpeb;
+                       TAILQ_FOREACH_SAFE(eb, &chmp->chm_free_queue, queue, tmpeb) {
+                               if (eb == cheb) {
+                                       TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue);
+                                       removed = 1;
+                                       break;
+                               }
+                       }
+                       if (removed == 0) {
+                               TAILQ_FOREACH_SAFE(eb, &chmp->chm_dirty_queue, queue, tmpeb) {
+                                       if (eb == cheb) {
+                                               TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue);
+                                               removed = 1;
+                                               break;
+                                       }
+                               }
+                       }
+                       if (removed == 0) {
+                               TAILQ_FOREACH_SAFE(eb, &chmp->chm_very_dirty_queue, queue, tmpeb) {
+                                       if (eb == cheb) {
+                                               TAILQ_REMOVE(&chmp->chm_very_dirty_queue, cheb, queue);
+                                               removed = 1;
+                                               break;
+                                       }
+                               }
+                       }
+                       if (removed == 0) {
+                               TAILQ_FOREACH_SAFE(eb, &chmp->chm_clean_queue, queue, tmpeb) {
+                                       if (eb == cheb) {
+                                               TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue);
+                                               removed = 1;
+                                               break;
+                                       }
+                               }
+                       }
+               }
+               if (chmp->chm_wbuf_len) {
+                       dbg("Adding block to erasable pending wbuf queue\n");
+                       TAILQ_INSERT_TAIL(&chmp->chm_erasable_pending_wbuf_queue,
+                           cheb, queue);
+               } else {
+                       TAILQ_INSERT_TAIL(&chmp->chm_erase_pending_queue,
+                           cheb, queue);
+                       chmp->chm_nr_erasable_blocks++;
+               }
+               chfs_remap_leb(chmp);
+       } else if (cheb == chmp->chm_gcblock) {
+               dbg("Not moving gcblock to dirty list\n");
+       } else if (cheb->dirty_size > MAX_DIRTY_TO_CLEAN &&
+           cheb->dirty_size - len <= MAX_DIRTY_TO_CLEAN) {
+               dbg("Freshly dirtied, remove it from clean queue and "
+                   "add it to dirty\n");
+               TAILQ_REMOVE(&chmp->chm_clean_queue, cheb, queue);
+               TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue);
+       } else if (VERY_DIRTY(chmp, cheb->dirty_size) &&
+           !VERY_DIRTY(chmp, cheb->dirty_size - len)) {
+               dbg("Becomes now very dirty, remove it from dirty "
+                   "queue and add it to very dirty\n");
+               TAILQ_REMOVE(&chmp->chm_dirty_queue, cheb, queue);
+               TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue);
+       } else {
+               dbg("Leave cheb where it is\n");
+       }
+       mutex_exit(&chmp->chm_lock_sizes);
+       return;
+}
+
+/**
+ * chfs_close_eraseblock - close an eraseblock
+ * @chmp: chfs mount structure
+ * @cheb: eraseblock information
+ *
+ * This function closes the physical chain of the nodes on the eraseblock,
+ * converts its remaining free space to dirty and adds the block to the
+ * clean, dirty or very dirty queue.
+ */
+int
+chfs_close_eraseblock(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb)
+{
+       uint32_t offset;
+       struct chfs_node_ref *nref;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       offset = chmp->chm_ebh->eb_size - cheb->free_size;
+
+       // Close the chain
+       nref = chfs_alloc_node_ref(cheb);
+       if (!nref)
+               return ENOMEM;
+
+       nref->nref_next = NULL;
+       nref->nref_offset = offset;
+
+       // Mark space as dirty
+       chfs_update_eb_dirty(chmp, cheb, cheb->free_size);
+
+       if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN) {
+               TAILQ_INSERT_TAIL(&chmp->chm_clean_queue, cheb, queue);
+       } else if (VERY_DIRTY(chmp, cheb->dirty_size)) {
+               TAILQ_INSERT_TAIL(&chmp->chm_very_dirty_queue, cheb, queue);
+       } else {
+               TAILQ_INSERT_TAIL(&chmp->chm_dirty_queue, cheb, queue);
+       }
+       return 0;
+}
+
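+/*
+ * chfs_reserve_space_normal - reserves space for a normal (or deletion
+ * priority) write. While the number of free plus erasable blocks is below
+ * the write reservation, garbage collector passes are run and erasable
+ * blocks are remapped; deletion-priority writes are allowed to dip down
+ * to the smaller deletion reservation.
+ */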
+int
+chfs_reserve_space_normal(struct chfs_mount *chmp, uint32_t size, int prio)
+{
+       int ret;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       while (chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks < chmp->chm_resv_blocks_write) {
+               dbg("free: %d, erasable: %d, resv: %d\n", chmp->chm_nr_free_blocks, chmp->chm_nr_erasable_blocks, chmp->chm_resv_blocks_write);
+               uint32_t avail, dirty;
+               if (prio == ALLOC_DELETION && chmp->chm_nr_free_blocks + chmp->chm_nr_erasable_blocks >= chmp->chm_resv_blocks_deletion)
+                       break;
+
+               dirty = chmp->chm_dirty_size - chmp->chm_nr_erasable_blocks * chmp->chm_ebh->eb_size + chmp->chm_unchecked_size;
+               if (dirty < chmp->chm_nospc_dirty) {
+                       dbg("dirty: %u < nospc_dirty: %u\n", dirty, chmp->chm_nospc_dirty);
+                       ret = ENOSPC;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               avail = chmp->chm_free_size - (chmp->chm_resv_blocks_write * chmp->chm_ebh->eb_size);
+               if (size > avail) {
+                       dbg("size: %u > avail: %u\n", size, avail);
+                       ret = ENOSPC;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               mutex_exit(&chmp->chm_lock_sizes);
+               ret = chfs_gcollect_pass(chmp);
+               /* gcollect_pass exits chm_lock_mountfields */
+               mutex_enter(&chmp->chm_lock_mountfields);
+               mutex_enter(&chmp->chm_lock_sizes);
+
+               if (chmp->chm_nr_erasable_blocks ||
+                   !TAILQ_EMPTY(&chmp->chm_erasable_pending_wbuf_queue) ||
+                   ret == EAGAIN) {
+                       ret = chfs_remap_leb(chmp);
+               }
+
+               if (ret) {
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+       }
+
+       mutex_exit(&chmp->chm_lock_sizes);
+       ret = chfs_reserve_space(chmp, size);
+out:
+       return ret;
+}
+
+
+int
+chfs_reserve_space_gc(struct chfs_mount *chmp, uint32_t size)
+{
+       int ret;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       chfs_remap_leb(chmp);
+
+       if (size > chmp->chm_free_size) {
+               dbg("size: %u\n", size);
+               mutex_exit(&chmp->chm_lock_sizes);
+               return ENOSPC;
+       }
+
+       mutex_exit(&chmp->chm_lock_sizes);
+       ret = chfs_reserve_space(chmp, size);
+       return ret;
+}
+
+/**
+ * chfs_reserve_space - finds a block whose free size is >= the requested size
+ * @chmp: chfs mount point
+ * @size: requested size
+ * Returns zero on success, an error code on failure.
+ */
+int
+chfs_reserve_space(struct chfs_mount *chmp, uint32_t size)
+{
+       //TODO define minimum reserved blocks, which is needed for writing
+       //TODO check we have enough free blocks to write
+       //TODO if no: need erase and GC
+
+       int err;
+       struct chfs_eraseblock *cheb;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+       cheb = chmp->chm_nextblock;
+       if (cheb && size > cheb->free_size) {
+               dbg("size: %u > free_size: %u\n", size, cheb->free_size);
+               /*
+                * There isn't enough space on this eraseblock, so we mark it
+                * dirty and close the physical chain of the node refs.
+                */
+               /* Write out pending data, if any. */
+               if (chmp->chm_wbuf_len) {
+                       chfs_flush_pending_wbuf(chmp);
+                       //FIXME need goto restart here?
+               }
+
+               while (chmp->chm_wbuf_ofs < chmp->chm_ebh->eb_size) {
+                       dbg("wbuf ofs: %zu - eb_size: %zu\n",
+                           chmp->chm_wbuf_ofs, chmp->chm_ebh->eb_size);
+                       chfs_flush_pending_wbuf(chmp);
+               }
+
+               if (!(chmp->chm_wbuf_ofs % chmp->chm_ebh->eb_size) && !chmp->chm_wbuf_len)
+                       chmp->chm_wbuf_ofs = 0xffffffff;
+
+               err = chfs_close_eraseblock(chmp, cheb);
+               if (err)
+                       return err;
+
+               cheb = NULL;
+       }
+       if (!cheb) {
+               //get a block for nextblock
+               if (TAILQ_EMPTY(&chmp->chm_free_queue)) {
+                       // If this succeeds there will be a block on free_queue
+                       dbg("cheb remap (free: %d)\n", chmp->chm_nr_free_blocks);
+                       err = chfs_remap_leb(chmp);
+                       if (err)
+                               return err;
+               }
+               cheb = TAILQ_FIRST(&chmp->chm_free_queue);
+               TAILQ_REMOVE(&chmp->chm_free_queue, cheb, queue);
+               chmp->chm_nextblock = cheb;
+               chmp->chm_nr_free_blocks--;
+       }
+
+       return 0;
+}
+
diff --git a/sys/ufs/chfs/chfs_pool.c b/sys/ufs/chfs/chfs_pool.c
new file mode 100644 (file)
index 0000000..6e25d17
--- /dev/null
@@ -0,0 +1,211 @@
+/*     $NetBSD: chfs_pool.c,v 1.1 2011/11/24 15:51:31 ahoka Exp $      */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Pool allocator and convenience routines for chfs.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/pool.h>
+#include <sys/atomic.h>
+
+#include <uvm/uvm.h>
+
+#include "chfs.h"
+
+/* --------------------------------------------------------------------- */
+
+void * chfs_pool_page_alloc(struct pool *, int);
+void   chfs_pool_page_free(struct pool *, void *);
+
+extern void*   pool_page_alloc_nointr(struct pool *, int);
+extern void    pool_page_free_nointr(struct pool *, void *);
+
+/* --------------------------------------------------------------------- */
+
+struct pool_allocator chfs_pool_allocator = {
+       .pa_alloc = chfs_pool_page_alloc,
+       .pa_free = chfs_pool_page_free,
+};
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_init(struct chfs_pool *chpp, size_t size, const char *what,
+    struct chfs_mount *chmp)
+{
+       int cnt;
+
+       cnt = snprintf(chpp->chp_name, sizeof(chpp->chp_name),
+           "%s_chfs_%p", what, chmp);
+       KASSERT(cnt < sizeof(chpp->chp_name));
+
+       pool_init(&chpp->chp_pool, size, 0, 0, 0, chpp->chp_name,
+           &chfs_pool_allocator, IPL_NONE);
+       chpp->chp_mount = chmp;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_destroy(struct chfs_pool *chpp)
+{
+       pool_destroy((struct pool *)chpp);
+}
+
+/* --------------------------------------------------------------------- */
+
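+/*
+ * Page allocation backend for the chfs pools: a per-mount page counter is
+ * maintained atomically so that a mount cannot take more than
+ * CHFS_PAGES_MAX(chmp) pages; the counter is rolled back if the limit is
+ * hit or the underlying allocation fails.
+ */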
+void *
+chfs_pool_page_alloc(struct pool *pp, int flags)
+{
+       struct chfs_pool *chpp;
+       struct chfs_mount *chmp;
+       unsigned int pages;
+       void *page;
+       dbg("CHFS: pool_page_alloc()\n");
+
+       chpp = (struct chfs_pool *)pp;
+       chmp = chpp->chp_mount;
+
+       pages = atomic_inc_uint_nv(&chmp->chm_pages_used);
+       if (pages >= CHFS_PAGES_MAX(chmp)) {
+               atomic_dec_uint(&chmp->chm_pages_used);
+               return NULL;
+       }
+       page = pool_page_alloc_nointr(pp, flags | PR_WAITOK);
+       if (page == NULL) {
+               atomic_dec_uint(&chmp->chm_pages_used);
+       }
+
+       return page;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_pool_page_free(struct pool *pp, void *v)
+{
+       struct chfs_pool *chpp;
+       struct chfs_mount *chmp;
+       dbg("CHFS: pool_page_free()\n");
+
+       chpp = (struct chfs_pool *)pp;
+       chmp = chpp->chp_mount;
+
+       atomic_dec_uint(&chmp->chm_pages_used);
+       pool_page_free_nointr(pp, v);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_init(struct chfs_str_pool *chsp, struct chfs_mount *chmp)
+{
+       dbg("CHFS: str_pool_init()\n");
+
+       chfs_pool_init(&chsp->chsp_pool_16,   16,   "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_32,   32,   "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_64,   64,   "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_128,  128,  "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_256,  256,  "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_512,  512,  "str", chmp);
+       chfs_pool_init(&chsp->chsp_pool_1024, 1024, "str", chmp);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_destroy(struct chfs_str_pool *chsp)
+{
+       dbg("CHFS: str_pool_destroy()\n");
+
+       chfs_pool_destroy(&chsp->chsp_pool_16);
+       chfs_pool_destroy(&chsp->chsp_pool_32);
+       chfs_pool_destroy(&chsp->chsp_pool_64);
+       chfs_pool_destroy(&chsp->chsp_pool_128);
+       chfs_pool_destroy(&chsp->chsp_pool_256);
+       chfs_pool_destroy(&chsp->chsp_pool_512);
+       chfs_pool_destroy(&chsp->chsp_pool_1024);
+}
+
+/* --------------------------------------------------------------------- */
+
+char *
+chfs_str_pool_get(struct chfs_str_pool *chsp, size_t len, int flags)
+{
+       struct chfs_pool *p;
+       dbg("CHFS: str_pool_get()\n");
+
+       KASSERT(len <= 1024);
+
+       if      (len <= 16)   p = &chsp->chsp_pool_16;
+       else if (len <= 32)   p = &chsp->chsp_pool_32;
+       else if (len <= 64)   p = &chsp->chsp_pool_64;
+       else if (len <= 128)  p = &chsp->chsp_pool_128;
+       else if (len <= 256)  p = &chsp->chsp_pool_256;
+       else if (len <= 512)  p = &chsp->chsp_pool_512;
+       else if (len <= 1024) p = &chsp->chsp_pool_1024;
+       else {
+               KASSERT(0);
+               p = NULL; /* Silence compiler warnings */
+       }
+
+       return (char *)CHFS_POOL_GET(p, flags);
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_str_pool_put(struct chfs_str_pool *chsp, char *str, size_t len)
+{
+       struct chfs_pool *p;
+       dbg("CHFS: str_pool_put()\n");
+
+       KASSERT(len <= 1024);
+
+       if      (len <= 16)   p = &chsp->chsp_pool_16;
+       else if (len <= 32)   p = &chsp->chsp_pool_32;
+       else if (len <= 64)   p = &chsp->chsp_pool_64;
+       else if (len <= 128)  p = &chsp->chsp_pool_128;
+       else if (len <= 256)  p = &chsp->chsp_pool_256;
+       else if (len <= 512)  p = &chsp->chsp_pool_512;
+       else if (len <= 1024) p = &chsp->chsp_pool_1024;
+       else {
+               KASSERT(0);
+               p = NULL; /* Silence compiler warnings */
+       }
+
+       CHFS_POOL_PUT(p, str);
+}
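+
+/*
+ * A hedged usage sketch (not part of this change): strings come from the
+ * smallest size class that fits and must be put back with the same length;
+ * flags are those accepted by pool_get(9), e.g. PR_WAITOK:
+ *
+ *     name = chfs_str_pool_get(chsp, len, PR_WAITOK);
+ *     ...
+ *     chfs_str_pool_put(chsp, name, len);
+ */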
diff --git a/sys/ufs/chfs/chfs_readinode.c b/sys/ufs/chfs/chfs_readinode.c
new file mode 100644 (file)
index 0000000..3ae626f
--- /dev/null
@@ -0,0 +1,1136 @@
+/*     $NetBSD: chfs_readinode.c,v 1.2 2011/11/24 21:09:37 agc Exp $   */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_readinode.c
+ *
+ *  Created on: 2010.05.31.
+ *      Author: dtengeri
+ */
+
+#include <sys/buf.h>
+
+#include "chfs.h"
+
+/* tmp node operations */
+int chfs_check_td_data(struct chfs_mount *,
+    struct chfs_tmp_dnode *);
+int chfs_check_td_node(struct chfs_mount *,
+    struct chfs_tmp_dnode *);
+struct chfs_node_ref *chfs_first_valid_data_ref(struct chfs_node_ref *);
+int chfs_add_tmp_dnode_to_tree(struct chfs_mount *,
+    struct chfs_readinode_info *,
+    struct chfs_tmp_dnode *);
+void chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *,
+       struct chfs_tmp_dnode *);
+void chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *,
+       struct chfs_tmp_dnode *);
+static void chfs_kill_td(struct chfs_mount *,
+    struct chfs_tmp_dnode *);
+static void chfs_kill_tdi(struct chfs_mount *,
+    struct chfs_tmp_dnode_info *);
+/* frag node operations */
+struct chfs_node_frag *new_fragment(struct chfs_full_dnode *,
+    uint32_t,
+    uint32_t);
+int no_overlapping_node(struct rb_tree *, struct chfs_node_frag *,
+    struct chfs_node_frag *, uint32_t);
+int chfs_add_frag_to_fragtree(struct chfs_mount *,
+    struct rb_tree *,
+    struct chfs_node_frag *);
+void chfs_obsolete_node_frag(struct chfs_mount *,
+    struct chfs_node_frag *);
+/* general node operations */
+int chfs_get_data_nodes(struct chfs_mount *,
+    struct chfs_inode *,
+    struct chfs_readinode_info *);
+int chfs_build_fragtree(struct chfs_mount *,
+    struct chfs_inode *,
+    struct chfs_readinode_info *);
+
+
+
+/*
+ * --------------------------
+ * tmp node rbtree operations
+ * --------------------------
+ */
+static signed int
+tmp_node_compare_nodes(void *ctx, const void *n1, const void *n2)
+{
+       const struct chfs_tmp_dnode_info *tdi1 = n1;
+       const struct chfs_tmp_dnode_info *tdi2 = n2;
+
+       /* The offsets are 64 bits wide; comparing by subtraction could
+        * truncate, so compare explicitly. */
+       if (tdi1->tmpnode->node->ofs < tdi2->tmpnode->node->ofs)
+               return -1;
+       return (tdi1->tmpnode->node->ofs > tdi2->tmpnode->node->ofs);
+}
+
+static signed int
+tmp_node_compare_key(void *ctx, const void *n, const void *key)
+{
+       const struct chfs_tmp_dnode_info *tdi = n;
+       uint64_t ofs = *(const uint64_t *)key;
+
+       if (tdi->tmpnode->node->ofs < ofs)
+               return -1;
+       return (tdi->tmpnode->node->ofs > ofs);
+}
+
+const rb_tree_ops_t tmp_node_rbtree_ops = {
+       .rbto_compare_nodes = tmp_node_compare_nodes,
+       .rbto_compare_key = tmp_node_compare_key,
+       .rbto_node_offset = offsetof(struct chfs_tmp_dnode_info, rb_node),
+       .rbto_context = NULL
+};
+
+
+/*
+ * ---------------------------
+ * frag node rbtree operations
+ * ---------------------------
+ */
+static signed int
+frag_compare_nodes(void *ctx, const void *n1, const void *n2)
+{
+       const struct chfs_node_frag *frag1 = n1;
+       const struct chfs_node_frag *frag2 = n2;
+
+       /* Compare explicitly instead of by subtraction, which could wrap
+        * for large offsets. */
+       if (frag1->ofs < frag2->ofs)
+               return -1;
+       return (frag1->ofs > frag2->ofs);
+}
+
+static signed int
+frag_compare_key(void *ctx, const void *n, const void *key)
+{
+       const struct chfs_node_frag *frag = n;
+       uint64_t ofs = *(const uint64_t *)key;
+
+       if (frag->ofs < ofs)
+               return -1;
+       return (frag->ofs > ofs);
+}
+
+const rb_tree_ops_t frag_rbtree_ops = {
+       .rbto_compare_nodes = frag_compare_nodes,
+       .rbto_compare_key   = frag_compare_key,
+       .rbto_node_offset = offsetof(struct chfs_node_frag, rb_node),
+       .rbto_context = NULL
+};
+
+
+/*
+ * -------------------
+ * tmp node operations
+ * -------------------
+ */
+/*
+ * Check the data CRC of the node.
+ *
+ * Returns: 0 - if everything is OK;
+ *          1 - if the CRC is incorrect;
+ *          2 - on any other error (e.g. a short or failed read).
+ */
+int
+chfs_check_td_data(struct chfs_mount *chmp,
+    struct chfs_tmp_dnode *td)
+{
+       int err;
+       size_t retlen, len, totlen;
+       uint32_t crc;
+       uint64_t ofs;
+       char *buf;
+       struct chfs_node_ref *nref = td->node->nref;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(!mutex_owned(&chmp->chm_lock_sizes));
+
+       ofs = CHFS_GET_OFS(nref->nref_offset) + sizeof(struct chfs_flash_data_node);
+       len = td->node->size;
+       if (!len)
+               return 0;
+
+       /* KM_SLEEP allocations cannot fail. */
+       buf = kmem_alloc(len, KM_SLEEP);
+       err = chfs_read_leb(chmp, nref->nref_lnr, buf, ofs, len, &retlen);
+       if (err) {
+               dbg("error wile reading: %d\n", err);
+               err = 2;
+               goto out;
+       }
+
+       if (len != retlen) {
+               dbg("len:%zu, retlen:%zu\n", len, retlen);
+               err = 2;
+               goto out;
+       }
+       crc = crc32(0, (uint8_t *)buf, len);
+
+       if (crc != td->data_crc) {
+               dbg("crc failed, calculated: 0x%x, orig: 0x%x\n", crc, td->data_crc);
+               kmem_free(buf, len);
+               return 1;
+       }
+
+       nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) | CHFS_NORMAL_NODE_MASK;
+       totlen = CHFS_PAD(sizeof(struct chfs_flash_data_node) + len);
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       chfs_change_size_unchecked(chmp, &chmp->chm_blocks[nref->nref_lnr], -totlen);
+       chfs_change_size_used(chmp, &chmp->chm_blocks[nref->nref_lnr], totlen);
+       mutex_exit(&chmp->chm_lock_sizes);
+       KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+
+       err = 0;
+out:
+       kmem_free(buf, len);
+       return err;
+}
+
+int
+chfs_check_td_node(struct chfs_mount *chmp, struct chfs_tmp_dnode *td)
+{
+       int ret;
+
+       if (CHFS_REF_FLAGS(td->node->nref) != CHFS_UNCHECKED_NODE_MASK)
+               return 0;
+
+       ret = chfs_check_td_data(chmp, td);
+       if (ret == 1) {
+               chfs_mark_node_obsolete(chmp, td->node->nref);
+       }
+       return ret;
+}
+
+
+struct chfs_node_ref *
+chfs_first_valid_data_ref(struct chfs_node_ref *nref)
+{
+       while (nref) {
+               if (!CHFS_REF_OBSOLETE(nref)) {
+#ifdef DBG_MSG_GC
+                       if (nref->nref_lnr == REF_EMPTY_NODE) {
+                               dbg("FIRST VALID IS EMPTY!\n");
+                       }
+#endif
+                       return nref;
+               }
+
+               if (nref->nref_next) {
+                       nref = nref->nref_next;
+               } else
+                       break;
+       }
+       return NULL;
+}
+
+void
+chfs_add_tmp_dnode_to_tdi(struct chfs_tmp_dnode_info *tdi,
+       struct chfs_tmp_dnode *td)
+{
+       if (!tdi->tmpnode) {
+               tdi->tmpnode = td;
+       } else {
+               struct chfs_tmp_dnode *tmp = tdi->tmpnode;
+               while (tmp->next) {
+                       tmp = tmp->next;
+               }
+               tmp->next = td;
+       }
+}
+
+void
+chfs_remove_tmp_dnode_from_tdi(struct chfs_tmp_dnode_info *tdi,
+       struct chfs_tmp_dnode *td)
+{
+       if (tdi->tmpnode == td) {
+               tdi->tmpnode = tdi->tmpnode->next;
+       } else {
+               /* Start at the head so the second element can be unlinked too. */
+               struct chfs_tmp_dnode *tmp = tdi->tmpnode;
+               while (tmp->next && tmp->next != td) {
+                       tmp = tmp->next;
+               }
+               if (tmp->next) {
+                       tmp->next = td->next;
+               }
+       }
+}
+
+static void
+chfs_kill_td(struct chfs_mount *chmp,
+    struct chfs_tmp_dnode *td)
+{
+       /* check if we need to mark as obsolete, to avoid double mark */
+       if (!CHFS_REF_OBSOLETE(td->node->nref)) {
+               chfs_mark_node_obsolete(chmp, td->node->nref);
+       }
+
+       chfs_free_tmp_dnode(td);
+}
+
+static void
+chfs_kill_tdi(struct chfs_mount *chmp,
+    struct chfs_tmp_dnode_info *tdi)
+{
+       struct chfs_tmp_dnode *next, *tmp = tdi->tmpnode;
+
+       while (tmp) {
+               next = tmp->next;
+               chfs_kill_td(chmp, tmp);
+               tmp = next;
+       }
+
+       chfs_free_tmp_dnode_info(tdi);
+}
+
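+/*
+ * chfs_add_tmp_dnode_to_tree - inserts a tmp dnode into the red-black tree
+ * of tmp dnode infos, resolving overlaps by version number: a new node that
+ * entirely covers older (lower version) nodes kills them, and a new node
+ * entirely covered by a valid newer (higher version) node is killed itself.
+ * Nodes that merely overlap are flagged as overlapped, to be resolved when
+ * the inode's fragtree is built (chfs_build_fragtree).
+ */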
+int
+chfs_add_tmp_dnode_to_tree(struct chfs_mount *chmp,
+    struct chfs_readinode_info *rii,
+    struct chfs_tmp_dnode *newtd)
+{
+       uint64_t end_ofs = newtd->node->ofs + newtd->node->size;
+       struct chfs_tmp_dnode_info *this;
+       struct rb_node *node, *prev_node;
+       struct chfs_tmp_dnode_info *newtdi;
+
+       node = rb_tree_find_node(&rii->tdi_root, &newtd->node->ofs);
+       if (node) {
+               this = (struct chfs_tmp_dnode_info *)node;
+               while (this->tmpnode->overlapped) {
+                       prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+                       if (!prev_node) {
+                               this->tmpnode->overlapped = 0;
+                               break;
+                       }
+                       node = prev_node;
+                       this = (struct chfs_tmp_dnode_info *)node;
+               }
+       }
+       while (node) {
+               this = (struct chfs_tmp_dnode_info *)node;
+               if (this->tmpnode->node->ofs > end_ofs)
+                       break;
+               
+               struct chfs_tmp_dnode *tmp_td = this->tmpnode;
+               while (tmp_td) {
+                       if (tmp_td->version == newtd->version) {
+                               if (!chfs_check_td_node(chmp, tmp_td)) {
+                                       dbg("calling kill td 0\n");
+                                       chfs_kill_td(chmp, newtd);
+                                       return 0;
+                               } else {
+                                       chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                                       chfs_kill_td(chmp, tmp_td);
+                                       chfs_add_tmp_dnode_to_tdi(this, newtd);
+                                       return 0;
+                               }
+                       }
+                       if (tmp_td->version < newtd->version &&
+                               tmp_td->node->ofs >= newtd->node->ofs &&
+                               tmp_td->node->ofs + tmp_td->node->size <= end_ofs) {
+                               /* New node entirely overlaps 'this' */
+                               if (chfs_check_td_node(chmp, newtd)) {
+                                       dbg("calling kill td 2\n");
+                                       chfs_kill_td(chmp, newtd);
+                                       return 0;
+                               }
+                               /* ... and is good. Kill 'this' and any subsequent nodes which are also overlapped */
+                               while (tmp_td && tmp_td->node->ofs + tmp_td->node->size <= end_ofs) {
+                                       struct rb_node *next = rb_tree_iterate(&rii->tdi_root, this, RB_DIR_RIGHT);
+                                       struct chfs_tmp_dnode_info *next_tdi = (struct chfs_tmp_dnode_info *)next;
+                                       struct chfs_tmp_dnode *next_td = NULL;
+                                       if (tmp_td->next) {
+                                               next_td = tmp_td->next;
+                                       } else if (next_tdi) {
+                                               next_td = next_tdi->tmpnode;
+                                       }
+                                       if (tmp_td->version < newtd->version) {
+                                               chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                                               chfs_kill_td(chmp, tmp_td);
+                                               if (!this->tmpnode) {
+                                                       rb_tree_remove_node(&rii->tdi_root, this);
+                                                       chfs_kill_tdi(chmp, this);
+                                                       this = next_tdi;
+                                               }
+                                       }
+                                       tmp_td = next_td;
+                               }
+                               continue;
+                       }
+                       if (tmp_td->version > newtd->version &&
+                               tmp_td->node->ofs <= newtd->node->ofs &&
+                               tmp_td->node->ofs + tmp_td->node->size >= end_ofs) {
+                               /* New node entirely overlapped by 'this' */
+                               if (!chfs_check_td_node(chmp, tmp_td)) {
+                                       dbg("this version: %llu\n",
+                                               (unsigned long long)tmp_td->version);
+                                       dbg("this ofs: %llu, size: %u\n",
+                                               (unsigned long long)tmp_td->node->ofs,
+                                               tmp_td->node->size);
+                                       dbg("calling kill td 4\n");
+                                       chfs_kill_td(chmp, newtd);
+                                       return 0;
+                               }
+                               /* ... but 'this' was bad. Replace it... */
+                               chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                               chfs_kill_td(chmp, tmp_td);
+                               if (!this->tmpnode) {
+                                       rb_tree_remove_node(&rii->tdi_root, this);
+                                       chfs_kill_tdi(chmp, this);
+                               }
+                               dbg("calling kill td 5\n");
+                               chfs_kill_td(chmp, newtd);
+                               break;
+                       }
+                       tmp_td = tmp_td->next;
+               }
+               node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+       }
+
+       newtdi = chfs_alloc_tmp_dnode_info();
+       chfs_add_tmp_dnode_to_tdi(newtdi, newtd);
+       /* We neither completely obsoleted nor were completely
+          obsoleted by an earlier node. Insert into the tree */
+       struct chfs_tmp_dnode_info *tmp_tdi = rb_tree_insert_node(&rii->tdi_root, newtdi);
+       if (tmp_tdi != newtdi) {
+               chfs_add_tmp_dnode_to_tdi(tmp_tdi, newtd);
+               newtdi->tmpnode = NULL;
+               chfs_kill_tdi(chmp, newtdi);
+       }
+
+       /* If there's anything behind that overlaps us, note it */
+       node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+       if (node) {
+               while (1) {
+                       this = (struct chfs_tmp_dnode_info *)node;
+                       if (this->tmpnode->node->ofs + this->tmpnode->node->size > newtd->node->ofs) {
+                               newtd->overlapped = 1;
+                       }
+                       if (!this->tmpnode->overlapped)
+                               break;
+
+                       prev_node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_LEFT);
+                       if (!prev_node) {
+                               this->tmpnode->overlapped = 0;
+                               break;
+                       }
+                       node = prev_node;
+               }
+       }
+
+       /* If the new node overlaps anything ahead, note it */
+       node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+       this = (struct chfs_tmp_dnode_info *)node;
+       while (this && this->tmpnode->node->ofs < end_ofs) {
+               this->tmpnode->overlapped = 1;
+               node = rb_tree_iterate(&rii->tdi_root, node, RB_DIR_RIGHT);
+               this = (struct chfs_tmp_dnode_info *)node;
+       }
+       return 0;
+}
+
+
+/*
+ * --------------------
+ * frag node operations
+ * --------------------
+ */
+struct chfs_node_frag *
+new_fragment(struct chfs_full_dnode *fdn, uint32_t ofs, uint32_t size)
+{
+       struct chfs_node_frag *newfrag;
+       newfrag = chfs_alloc_node_frag();
+       if (newfrag) {
+               newfrag->ofs = ofs;
+               newfrag->size = size;
+               newfrag->node = fdn;
+       } else {
+               chfs_err("cannot allocate a chfs_node_frag object\n");
+       }
+       return newfrag;
+}
+
+int
+no_overlapping_node(struct rb_tree *fragtree,
+    struct chfs_node_frag *newfrag,
+    struct chfs_node_frag *this, uint32_t lastend)
+{
+       if (lastend < newfrag->node->ofs) {
+               struct chfs_node_frag *holefrag;
+
+               holefrag = new_fragment(NULL, lastend, newfrag->node->ofs - lastend);
+               if (!holefrag) {
+                       chfs_free_node_frag(newfrag);
+                       return ENOMEM;
+               }
+
+               rb_tree_insert_node(fragtree, holefrag);
+               this = holefrag;
+       }
+
+       rb_tree_insert_node(fragtree, newfrag);
+
+       return 0;
+}
+
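+/*
+ * chfs_add_frag_to_fragtree - inserts a fragment into an inode's fragtree,
+ * trimming or splitting any fragments it overlaps. For example (a sketch,
+ * offsets made up): inserting [32..96) over an existing [0..128) splits it
+ * into [0..32), the new [32..96), and a tail [96..128) backed by the old
+ * node.
+ */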
+int
+chfs_add_frag_to_fragtree(struct chfs_mount *chmp,
+    struct rb_tree *fragtree,
+    struct chfs_node_frag *newfrag)
+{
+       struct chfs_node_frag *this;
+       uint32_t lastend;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       this = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &newfrag->ofs);
+
+       if (this) {
+               lastend = this->ofs + this->size;
+       } else {
+               lastend = 0;
+       }
+
+       if (lastend <= newfrag->ofs) {
+               //dbg("no overlapping node\n");
+               if (lastend && (lastend - 1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) {
+                       if (this->node)
+                               CHFS_MARK_REF_NORMAL(this->node->nref);
+                       CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+               }
+               return no_overlapping_node(fragtree, newfrag, this, lastend);
+       }
+
+       if (newfrag->ofs > this->ofs) {
+
+               CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+               if (this->node)
+                       CHFS_MARK_REF_NORMAL(this->node->nref);
+
+               if (this->ofs + this->size > newfrag->ofs + newfrag->size) {
+                       /* newfrag is inside of this */
+                       //dbg("newfrag is inside of this\n");
+                       struct chfs_node_frag *newfrag2;
+
+                       newfrag2 = new_fragment(this->node, newfrag->ofs + newfrag->size,
+                           this->ofs + this->size - newfrag->ofs - newfrag->size);
+                       if (!newfrag2) {
+                               chfs_free_node_frag(newfrag);
+                               return ENOMEM;
+                       }
+                       if (this->node)
+                               this->node->frags++;
+
+                       this->size = newfrag->ofs - this->ofs;
+
+                       rb_tree_insert_node(fragtree, newfrag);
+                       rb_tree_insert_node(fragtree, newfrag2);
+
+                       return 0;
+               }
+               /* newfrag is bottom of this */
+               //dbg("newfrag is bottom of this\n");
+               this->size = newfrag->ofs - this->ofs;
+               rb_tree_insert_node(fragtree, newfrag);
+       } else {
+               /* newfrag start at same point */
+               //dbg("newfrag start at same point\n");
+               //TODO replace instead of remove and insert
+               rb_tree_remove_node(fragtree, this);
+               rb_tree_insert_node(fragtree, newfrag);
+
+               if (newfrag->ofs + newfrag->size >= this->ofs+this->size) {
+                       chfs_obsolete_node_frag(chmp, this);
+               } else {
+                       this->ofs += newfrag->size;
+                       this->size -= newfrag->size;
+
+                       rb_tree_insert_node(fragtree, this);
+                       return 0;
+               }
+       }
+       /* OK, now we have newfrag added in the correct place in the tree, but
+          frag_next(newfrag) may be a fragment which is overlapped by it
+       */
+       while ((this = frag_next(fragtree, newfrag)) && newfrag->ofs + newfrag->size >= this->ofs + this->size) {
+               rb_tree_remove_node(fragtree, this);
+               chfs_obsolete_node_frag(chmp, this);
+       }
+
+       if (!this || newfrag->ofs + newfrag->size == this->ofs)
+               return 0;
+
+       this->size = (this->ofs + this->size) - (newfrag->ofs + newfrag->size);
+       this->ofs = newfrag->ofs + newfrag->size;
+
+       if (this->node)
+               CHFS_MARK_REF_NORMAL(this->node->nref);
+       CHFS_MARK_REF_NORMAL(newfrag->node->nref);
+
+       return 0;
+}
+
+void
+chfs_kill_fragtree(struct rb_tree *fragtree)
+{
+       struct chfs_node_frag *this, *next;
+       //dbg("start\n");
+
+       this = (struct chfs_node_frag *)RB_TREE_MIN(fragtree);
+       while (this) {
+               //for (this = (struct chfs_node_frag *)RB_TREE_MIN(&fragtree); this != NULL; this = (struct chfs_node_frag *)rb_tree_iterate(&fragtree, &this->rb_node, RB_DIR_RIGHT)) {
+               next = frag_next(fragtree, this);
+               rb_tree_remove_node(fragtree, this);
+               chfs_free_node_frag(this);
+               //dbg("one frag killed\n");
+               this = next;
+       }
+       //dbg("end\n");
+}
+
+uint32_t
+chfs_truncate_fragtree(struct chfs_mount *chmp,
+       struct rb_tree *fragtree, uint32_t size)
+{
+       struct chfs_node_frag *frag;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       dbg("truncate to size: %u\n", size);
+
+       frag = (struct chfs_node_frag *)rb_tree_find_node_leq(fragtree, &size);
+
+       /* Find the last frag before size and set its new size. */
+       if (frag && frag->ofs != size) {
+               if (frag->ofs + frag->size > size) {
+                       frag->size = size - frag->ofs;
+               }
+               frag = frag_next(fragtree, frag);
+       }
+
+       /* Delete frags after new size. */
+       while (frag && frag->ofs >= size) {
+               struct chfs_node_frag *next = frag_next(fragtree, frag);
+
+               rb_tree_remove_node(fragtree, frag);
+               chfs_obsolete_node_frag(chmp, frag);
+               frag = next;
+       }
+
+       if (size == 0) {
+               return 0;
+       }
+
+       frag = frag_last(fragtree);
+
+       if (!frag) {
+               return 0;
+       }
+       
+       if (frag->ofs + frag->size < size) {
+               return frag->ofs + frag->size;
+       }
+
+       /* FIXME Should we check the position of the last node? (PAGE_CACHE size, etc.) */
+       if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) {
+               frag->node->nref->nref_offset = CHFS_GET_OFS(frag->node->nref->nref_offset) | CHFS_PRISTINE_NODE_MASK;
+       }
+
+       return size;
+}
+
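+/*
+ * chfs_obsolete_node_frag - drops a fragment's reference on its backing
+ * full dnode. When the last fragment goes away, the data node itself is
+ * marked obsolete and unlinked from the vnode cache's dnode list before
+ * the in-memory dnode is freed.
+ */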
+void
+chfs_obsolete_node_frag(struct chfs_mount *chmp,
+    struct chfs_node_frag *this)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       if (this->node) {
+               this->node->frags--;
+               if (!this->node->frags) {
+                       struct chfs_vnode_cache *vc = chfs_nref_to_vc(this->node->nref);
+                       chfs_mark_node_obsolete(chmp, this->node->nref);
+                       
+                       if (vc->dnode == this->node->nref) {
+                               vc->dnode = this->node->nref->nref_next;
+                       } else {
+                               struct chfs_node_ref *tmp = vc->dnode;
+                               while (tmp->nref_next != (struct chfs_node_ref*) vc 
+                                               && tmp->nref_next != this->node->nref) {
+                                       tmp = tmp->nref_next;
+                               }
+                               if (tmp->nref_next == this->node->nref) {
+                                       tmp->nref_next = this->node->nref->nref_next;
+                               }
+                               /* FIXME should we free this->node->nref here? */
+                       }
+                       
+                       chfs_free_full_dnode(this->node);
+               } else {
+                       CHFS_MARK_REF_NORMAL(this->node->nref);
+               }
+       }
+       chfs_free_node_frag(this);
+}
+
+int
+chfs_add_full_dnode_to_inode(struct chfs_mount *chmp,
+    struct chfs_inode *ip,
+    struct chfs_full_dnode *fd)
+{
+       int ret;
+       struct chfs_node_frag *newfrag;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       if (unlikely(!fd->size))
+               return 0;
+
+       newfrag = new_fragment(fd, fd->ofs, fd->size);
+       if (unlikely(!newfrag))
+               return ENOMEM;
+
+       newfrag->node->frags = 1;
+
+       ret = chfs_add_frag_to_fragtree(chmp, &ip->fragtree, newfrag);
+       if (ret)
+               return ret;
+
+       if (newfrag->ofs & (PAGE_SIZE - 1)) {
+               struct chfs_node_frag *prev = frag_prev(&ip->fragtree, newfrag);
+
+               CHFS_MARK_REF_NORMAL(fd->nref);
+               if (prev->node)
+                       CHFS_MARK_REF_NORMAL(prev->node->nref);
+       }
+
+       if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE - 1)) {
+               struct chfs_node_frag *next = frag_next(&ip->fragtree, newfrag);
+
+               if (next) {
+                       CHFS_MARK_REF_NORMAL(fd->nref);
+                       if (next->node)
+                               CHFS_MARK_REF_NORMAL(next->node->nref);
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+ * -----------------------
+ * general node operations
+ * -----------------------
+ */
+/* get tmp nodes of an inode */
+int
+chfs_get_data_nodes(struct chfs_mount *chmp,
+    struct chfs_inode *ip,
+    struct chfs_readinode_info *rii)
+{
+       uint32_t crc;
+       int err;
+       size_t len, retlen;
+       struct chfs_node_ref *nref;
+       struct chfs_flash_data_node *dnode;
+       struct chfs_tmp_dnode *td;
+       char* buf;
+
+       len = sizeof(struct chfs_flash_data_node);
+       buf = kmem_alloc(len, KM_SLEEP);
+
+       nref = chfs_first_valid_data_ref(ip->chvc->dnode);
+
+       rii->highest_version = ip->chvc->highest_version;
+
+       while (nref && (struct chfs_vnode_cache *)nref != ip->chvc) {
+               err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), len, &retlen);
+               if (err || len != retlen)
+                       goto out;
+               dnode = (struct chfs_flash_data_node*)buf;
+
+               //check header crc
+               crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4);
+               if (crc != le32toh(dnode->hdr_crc)) {
+                       chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc));
+                       goto cont;
+               }
+               //check header magic bitmask
+               if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) {
+                       chfs_err("Wrong magic bitmask.\n");
+                       goto cont;
+               }
+               //check node crc
+               crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4);
+               if (crc != le32toh(dnode->node_crc)) {
+                       chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc));
+                       goto cont;
+               }
+               td = chfs_alloc_tmp_dnode();
+               if (!td) {
+                       chfs_err("Can't allocate tmp dnode info.\n");
+                       err = ENOMEM;
+                       goto out;
+               }
+               /* We don't check data crc here, just add nodes to tmp frag tree, because
+                * we don't want to check nodes which have been overlapped by a new node
+                * with a higher version number.
+                */
+               td->node = chfs_alloc_full_dnode();
+               if (!td->node) {
+                       chfs_err("Can't allocate full dnode info.\n");
+                       err = ENOMEM;
+                       goto out_tmp_dnode;
+               }
+               td->version = le64toh(dnode->version);
+               td->node->ofs = le64toh(dnode->offset);
+               td->data_crc = le32toh(dnode->data_crc);
+               td->node->nref = nref;
+               td->node->size = le32toh(dnode->data_length);
+               td->overlapped = 0;
+
+               if (td->version > rii->highest_version) {
+                       rii->highest_version = td->version;
+               }
+
+               err = chfs_add_tmp_dnode_to_tree(chmp, rii, td);
+               if (err)
+                       goto out_full_dnode;
+
+cont:
+               nref = chfs_first_valid_data_ref(nref->nref_next);
+       }
+
+       ip->chvc->highest_version = rii->highest_version;
+       kmem_free(buf, len);
+       return 0;
+
+/* Exit points */
+out_full_dnode:
+       chfs_free_full_dnode(td->node);
+out_tmp_dnode:
+       chfs_free_tmp_dnode(td);
+out:
+       kmem_free(buf, len);
+       return err;
+}
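+
+/*
+ * Minimal userland sketch of the CRC layout checked above: both the
+ * header CRC and the node CRC cover their region up to, but not
+ * including, the trailing 4-byte CRC field itself (hence the "- 4").
+ * The struct is a toy stand-in for chfs_flash_data_node and a
+ * zlib-style crc32() is assumed; illustrative only.
+ */
+#if 0
+#include <stddef.h>
+#include <stdint.h>
+#include <zlib.h>
+
+struct toy_node {
+       uint16_t magic;
+       uint16_t type;
+       uint32_t length;
+       uint32_t hdr_crc;       /* covers all header bytes before it */
+       uint32_t payload;
+       uint32_t node_crc;      /* covers all node bytes before it */
+};
+
+static int
+toy_check(const struct toy_node *n)
+{
+       if (crc32(0, (const uint8_t *)n,
+           offsetof(struct toy_node, hdr_crc)) != n->hdr_crc)
+               return -1;      /* bad header CRC */
+       if (crc32(0, (const uint8_t *)n, sizeof(*n) - 4) != n->node_crc)
+               return -1;      /* bad node CRC */
+       return 0;
+}
+#endif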
+
+
+/* Build final normal fragtree from tdi tree. */
+int
+chfs_build_fragtree(struct chfs_mount *chmp, struct chfs_inode *ip,
+    struct chfs_readinode_info *rii)
+{
+       struct chfs_tmp_dnode_info *pen, *last, *this;
+       struct rb_tree ver_tree;    /* version tree */
+       uint64_t high_ver = 0;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       rb_tree_init(&ver_tree, &tmp_node_rbtree_ops);
+
+       if (rii->mdata_tn) {
+               high_ver = rii->mdata_tn->tmpnode->version;
+               rii->latest_ref = rii->mdata_tn->tmpnode->node->nref;
+       }
+
+       pen = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&rii->tdi_root);
+
+       while ((last = pen)) {
+               pen = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&rii->tdi_root, last, RB_DIR_LEFT);
+
+               rb_tree_remove_node(&rii->tdi_root, last);
+               rb_tree_insert_node(&ver_tree, last);
+
+               if (last->tmpnode->overlapped) {
+                       if (pen)
+                               continue;
+
+                       last->tmpnode->overlapped = 0;
+               }
+               
+               this = (struct chfs_tmp_dnode_info *)RB_TREE_MAX(&ver_tree);
+
+               while (this) {
+                       struct chfs_tmp_dnode_info *vers_next;
+                       int ret;
+
+                       vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT);
+                       rb_tree_remove_node(&ver_tree, this);
+
+                       struct chfs_tmp_dnode *tmp_td = this->tmpnode;
+                       while (tmp_td) {
+                               struct chfs_tmp_dnode *next_td = tmp_td->next;
+                               
+                               if (chfs_check_td_node(chmp, tmp_td)) {
+                                       if (next_td) {
+                                               chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                                       } else {
+                                               break;
+                                       }
+                               } else {
+                                       if (tmp_td->version > high_ver) {
+                                               high_ver = tmp_td->version;
+                                               dbg("highver: %llu\n", (unsigned long long)high_ver);
+                                               rii->latest_ref = tmp_td->node->nref;
+                                       }
+
+                                       ret = chfs_add_full_dnode_to_inode(chmp, ip, tmp_td->node);
+                                       if (ret) {
+                                               while (1) {
+                                                       vers_next = (struct chfs_tmp_dnode_info *)rb_tree_iterate(&ver_tree, this, RB_DIR_LEFT);
+                                                       while (tmp_td) {
+                                                               next_td = tmp_td->next;
+                                                               if (chfs_check_td_node(chmp, tmp_td) > 1) {
+                                                                       chfs_mark_node_obsolete(chmp,
+                                                                               tmp_td->node->nref);
+                                                               }
+                                                               chfs_free_full_dnode(tmp_td->node);
+                                                               chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                                                               chfs_free_tmp_dnode(tmp_td);
+                                                               tmp_td = next_td;
+                                                       }
+                                                       chfs_free_tmp_dnode_info(this);
+                                                       this = vers_next;
+                                                       if (!this)
+                                                               break;
+                                                       rb_tree_remove_node(&ver_tree, vers_next);
+                                               }
+                                               return ret;
+                                       }
+
+                                       chfs_remove_tmp_dnode_from_tdi(this, tmp_td);
+                                       chfs_free_tmp_dnode(tmp_td);
+                               }
+                               tmp_td = next_td;
+                       }
+                       chfs_kill_tdi(chmp, this);
+                       this = vers_next;
+               }
+       }
+
+       return 0;
+}
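+
+/*
+ * Reduced model of the rule the function above enforces: where data
+ * nodes overlap, the node with the higher version wins.  An array and
+ * qsort(3) stand in for the rb-trees; replaying nodes newest-first
+ * and never overwriting an already-claimed byte gives the same
+ * result.  Illustrative only, every name below is made up.
+ */
+#if 0
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct toy_dnode { uint64_t version; size_t ofs, size; };
+
+static int
+toy_newest_first(const void *a, const void *b)
+{
+       const struct toy_dnode *x = a, *y = b;
+
+       return (x->version < y->version) - (x->version > y->version);
+}
+
+static void
+toy_build(struct toy_dnode *nodes, size_t n, uint8_t *owner, size_t len)
+{
+       memset(owner, 0, len);
+       qsort(nodes, n, sizeof(*nodes), toy_newest_first);
+       for (size_t i = 0; i < n; i++)
+               for (size_t b = nodes[i].ofs;
+                   b < nodes[i].ofs + nodes[i].size && b < len; b++)
+                       if (owner[b] == 0)
+                               owner[b] = (uint8_t)(i + 1);
+}
+#endif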
+
+int
+chfs_read_inode(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+       struct chfs_vnode_cache *vc = ip->chvc;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+retry:
+       /* XXX locking */
+       //mutex_enter(&chmp->chm_lock_vnocache);
+       switch (vc->state) {
+       case VNO_STATE_UNCHECKED:
+       case VNO_STATE_CHECKEDABSENT:
+//             chfs_vnode_cache_set_state(chmp, vc, VNO_STATE_READING);
+               vc->state = VNO_STATE_READING;
+               break;
+       case VNO_STATE_CHECKING:
+       case VNO_STATE_GC:
+               //sleep_on_spinunlock(&chmp->chm_lock_vnocache);
+               //KASSERT(!mutex_owned(&chmp->chm_lock_vnocache));
+               goto retry;
+               break;
+       case VNO_STATE_PRESENT:
+       case VNO_STATE_READING:
+               chfs_err("Reading inode #%llu in state %d!\n",
+                       (unsigned long long)vc->vno, vc->state);
+               chfs_err("wants to read a nonexistent ino %llu\n",
+                       (unsigned long long)vc->vno);
+               return ENOENT;
+       default:
+               panic("BUG() Bad vno cache state.");
+       }
+       //mutex_exit(&chmp->chm_lock_vnocache);
+
+       return chfs_read_inode_internal(chmp, ip);
+}
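+
+/*
+ * The state dance above, summarized: UNCHECKED and CHECKEDABSENT may
+ * be read (moving to READING, later PRESENT); CHECKING and GC mean
+ * another thread owns the cache entry, so the reader retries;
+ * PRESENT or READING on entry is a caller bug.  A compact sketch of
+ * the same decision, with toy state names standing in for the
+ * VNO_STATE_* constants:
+ */
+#if 0
+enum toy_vno_state {
+       T_UNCHECKED, T_CHECKEDABSENT, T_CHECKING, T_GC,
+       T_READING, T_PRESENT
+};
+
+static int                     /* 0: proceed, 1: retry, -1: error */
+toy_read_decision(enum toy_vno_state s)
+{
+       switch (s) {
+       case T_UNCHECKED:
+       case T_CHECKEDABSENT:
+               return 0;
+       case T_CHECKING:
+       case T_GC:
+               return 1;
+       default:                /* T_READING, T_PRESENT */
+               return -1;
+       }
+}
+#endif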
+
+/*
+ * Read inode frags:
+ * first collect the tmp nodes, then build the fragtree from them.
+ */
+int
+chfs_read_inode_internal(struct chfs_mount *chmp, struct chfs_inode *ip)
+{
+       int err;
+       size_t len, retlen;
+       char* buf;
+       struct chfs_readinode_info rii;
+       struct chfs_flash_vnode *fvnode;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       len = sizeof(*fvnode);
+
+       memset(&rii, 0, sizeof(rii));
+
+       rb_tree_init(&rii.tdi_root, &tmp_node_rbtree_ops);
+
+       /* build up a temp node frag tree */
+       err = chfs_get_data_nodes(chmp, ip, &rii);
+       if (err) {
+               if (ip->chvc->state == VNO_STATE_READING)
+                       ip->chvc->state = VNO_STATE_CHECKEDABSENT;
+               /* FIXME Should we kill fragtree or something here? */
+               return err;
+       }
+
+       rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+       /*
+        * build fragtree from temp nodes
+        */
+       err = chfs_build_fragtree(chmp, ip, &rii);
+       if (err) {
+               if (ip->chvc->state == VNO_STATE_READING)
+                       ip->chvc->state = VNO_STATE_CHECKEDABSENT;
+               /* FIXME Should we kill fragtree or something here? */
+               return err;
+       }
+
+       if (!rii.latest_ref) {
+               return 0;
+       }
+
+       buf = kmem_alloc(len, KM_SLEEP);
+       if (!buf)
+               return ENOMEM;
+
+       /*
+        * set inode size from chvc->v
+        */
+       err = chfs_read_leb(chmp, ip->chvc->v->nref_lnr, buf, CHFS_GET_OFS(ip->chvc->v->nref_offset), len, &retlen);
+       if (err || retlen != len) {
+               kmem_free(buf, len);
+               return err ? err : EIO;
+       }
+
+       fvnode = (struct chfs_flash_vnode*)buf;
+
+       dbg("set size from v: %u\n", fvnode->dn_size);
+       chfs_set_vnode_size(ITOV(ip), fvnode->dn_size);
+       uint32_t retsize = chfs_truncate_fragtree(chmp, &ip->fragtree, fvnode->dn_size);
+       if (retsize != fvnode->dn_size) {
+               dbg("Truncating failed. It is %u instead of %u\n", retsize, fvnode->dn_size);
+       }
+
+       kmem_free(buf, len);
+
+       if (ip->chvc->state == VNO_STATE_READING) {
+               ip->chvc->state = VNO_STATE_PRESENT;
+       }
+
+       return 0;
+}
+
+int
+chfs_read_data(struct chfs_mount* chmp, struct vnode *vp,
+    struct buf *bp)
+{
+       off_t ofs;
+       struct chfs_node_frag *frag;
+       char * buf;
+       int err = 0;
+       size_t size, retlen;
+       uint32_t crc;
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_flash_data_node *dnode;
+       struct chfs_node_ref *nref;
+
+       memset(bp->b_data, 0, bp->b_bcount);
+
+       ofs = bp->b_blkno * PAGE_SIZE;
+       frag = (struct chfs_node_frag *)rb_tree_find_node_leq(&ip->fragtree, &ofs);
+
+       if (!frag || frag->ofs > ofs || frag->ofs + frag->size <= ofs) {
+               dbg("not found in frag tree\n");
+               return 0;
+       }
+
+       if (!frag->node) {
+               dbg("no node in frag\n");
+               return 0;
+       }
+
+       nref = frag->node->nref;
+
+       size = sizeof(*dnode) + frag->size;
+
+       buf = kmem_alloc(size, KM_SLEEP);
+
+       dbg("reading from lnr: %u, offset: %u, size: %zu\n", nref->nref_lnr, CHFS_GET_OFS(nref->nref_offset), size);
+       err = chfs_read_leb(chmp, nref->nref_lnr, buf, CHFS_GET_OFS(nref->nref_offset), size, &retlen);
+       if (err) {
+               chfs_err("error after reading: %d\n", err);
+               goto out;
+       }
+       if (retlen != size) {
+               chfs_err("retlen: %zu != size: %zu\n", retlen, size);
+               err = EIO;
+               goto out;
+       }
+
+       dnode = (struct chfs_flash_data_node *)buf;
+       crc = crc32(0, (uint8_t *)dnode, CHFS_NODE_HDR_SIZE - 4);
+       if (crc != le32toh(dnode->hdr_crc)) {
+               chfs_err("CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->hdr_crc));
+               err = EIO;
+               goto out;
+       }
+       //check header magic bitmask
+       if (le16toh(dnode->magic) != CHFS_FS_MAGIC_BITMASK) {
+               chfs_err("Wrong magic bitmask.\n");
+               err = EIO;
+               goto out;
+       }
+       //check node crc
+       crc = crc32(0, (uint8_t *)dnode, sizeof(*dnode) - 4);
+       if (crc != le32toh(dnode->node_crc)) {
+               chfs_err("Node CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->node_crc));
+               err = EIO;
+               goto out;
+       }
+       crc = crc32(0, (uint8_t *)dnode->data, dnode->data_length);
+       if (crc != le32toh(dnode->data_crc)) {
+               chfs_err("Data CRC check failed. calc: 0x%x orig: 0x%x\n", crc, le32toh(dnode->data_crc));
+               err = EIO;
+               goto out;
+       }
+
+       memcpy(bp->b_data, dnode->data, dnode->data_length);
+       bp->b_resid = 0;
+
+out:
+       kmem_free(buf, size);
+       return err;
+}
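+
+/*
+ * Sketch of the lookup pattern used above: rb_tree_find_node_leq()
+ * returns the node with the greatest key <= ofs, after which the
+ * caller still has to verify that the fragment actually covers ofs.
+ * Binary search over an array sorted by ofs plays the same role
+ * here; illustrative only.
+ */
+#if 0
+#include <stddef.h>
+#include <stdint.h>
+
+struct toy_frag { uint64_t ofs, size; };
+
+static struct toy_frag *
+toy_find_leq(struct toy_frag *frags, size_t n, uint64_t ofs)
+{
+       struct toy_frag *best = NULL;
+       size_t lo = 0, hi = n;
+
+       while (lo < hi) {
+               size_t mid = lo + (hi - lo) / 2;
+
+               if (frags[mid].ofs <= ofs) {
+                       best = &frags[mid];     /* candidate; look right */
+                       lo = mid + 1;
+               } else {
+                       hi = mid;
+               }
+       }
+       /* caller still checks best && best->ofs + best->size > ofs */
+       return best;
+}
+#endif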
diff --git a/sys/ufs/chfs/chfs_scan.c b/sys/ufs/chfs/chfs_scan.c
new file mode 100644 (file)
index 0000000..a35ce72
--- /dev/null
@@ -0,0 +1,740 @@
+/*     $NetBSD: chfs_scan.c,v 1.2 2011/11/24 21:09:37 agc Exp $        */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (c) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_scan.c
+ *
+ *  Created on: 2009.11.05.
+ *      Author: dtengeri
+ */
+
+#include "chfs.h"
+
+/**
+ * chfs_scan_make_vnode_cache - makes a new vnode cache during scan
+ * @chmp: CHFS main descriptor structure
+ * @vno: vnode identifier
+ * This function returns a vnode cache belonging to @vno.
+ */
+struct chfs_vnode_cache *
+chfs_scan_make_vnode_cache(struct chfs_mount *chmp, ino_t vno)
+{
+       struct chfs_vnode_cache *vc;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+       vc = chfs_vnode_cache_get(chmp, vno);
+       if (vc) {
+               return vc;
+       }
+
+       if (vno > chmp->chm_max_vno) {
+               chmp->chm_max_vno = vno;
+       }
+
+       vc = chfs_vnode_cache_alloc(vno);
+
+       //mutex_enter(&chmp->chm_lock_vnocache);
+
+       chfs_vnode_cache_add(chmp, vc);
+
+       //mutex_exit(&chmp->chm_lock_vnocache);
+
+       if (vno == CHFS_ROOTINO) {
+               vc->nlink = 2;
+               vc->pvno = CHFS_ROOTINO;
+               chfs_vnode_cache_set_state(chmp,
+                   vc, VNO_STATE_CHECKEDABSENT);
+       }
+
+       return vc;
+}
+
+/**
+ * chfs_scan_check_node_hdr - checks node magic and crc
+ * @nhdr: node header to check
+ * Returns 0 if everything is OK, error code otherwise.
+ */
+int
+chfs_scan_check_node_hdr(struct chfs_flash_node_hdr *nhdr)
+{
+       uint16_t magic;
+       uint32_t crc, hdr_crc;
+
+       magic = le16toh(nhdr->magic);
+
+       if (magic != CHFS_FS_MAGIC_BITMASK) {
+               dbg("bad magic\n");
+               return CHFS_NODE_BADMAGIC;
+       }
+
+       hdr_crc = le32toh(nhdr->hdr_crc);
+       crc = crc32(0, (uint8_t *)nhdr, CHFS_NODE_HDR_SIZE - 4);
+
+       if (crc != hdr_crc) {
+               dbg("bad crc\n");
+               return CHFS_NODE_BADCRC;
+       }
+
+       return CHFS_NODE_OK;
+}
+
+/**
+ * chfs_scan_check_vnode - check vnode crc and add to vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock information
+ * @buf: vnode to check
+ * @ofs: offset in eraseblock where vnode starts
+ */
+int
+chfs_scan_check_vnode(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       struct chfs_vnode_cache *vc;
+       struct chfs_flash_vnode *vnode = buf;
+       struct chfs_node_ref *nref;
+       int err;
+       uint32_t crc;
+       ino_t vno;
+
+       crc = crc32(0, (uint8_t *)vnode,
+           sizeof(struct chfs_flash_vnode) - 4);
+
+       if (crc != le32toh(vnode->node_crc)) {
+               err = chfs_update_eb_dirty(chmp,
+                   cheb, le32toh(vnode->length));
+               if (err) {
+                       return err;
+               }
+
+               return CHFS_NODE_BADCRC;
+       }
+
+       vno = le64toh(vnode->vno);
+
+       mutex_enter(&chmp->chm_lock_vnocache);
+       vc = chfs_vnode_cache_get(chmp, vno);
+       if (!vc) {
+               vc = chfs_scan_make_vnode_cache(chmp, vno);
+               if (!vc) {
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       return ENOMEM;
+               }
+       }
+       mutex_exit(&chmp->chm_lock_vnocache);
+
+       nref = chfs_alloc_node_ref(cheb);
+
+       nref->nref_offset = ofs;
+
+       KASSERT(nref->nref_lnr == cheb->lnr);
+
+       /* Check version of vnode. */
+       if ((struct chfs_vnode_cache *)vc->v != vc) {
+               if (le64toh(vnode->version) > *vc->vno_version) {
+                       //err = chfs_update_eb_dirty(chmp, &chmp->chm_blocks[vc->v->lnr],
+                       //              sizeof(struct chfs_flash_vnode));
+                       *vc->vno_version = le64toh(vnode->version);
+                       chfs_add_vnode_ref_to_vc(chmp, vc, nref);
+               } else {
+                       err = chfs_update_eb_dirty(chmp, cheb,
+                           sizeof(struct chfs_flash_vnode));
+                       if (err)
+                               return err;
+                       return CHFS_NODE_OK;
+               }
+       } else {
+               vc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+               if (!vc->vno_version)
+                       return ENOMEM;
+               *vc->vno_version = le64toh(vnode->version);
+               chfs_add_vnode_ref_to_vc(chmp, vc, nref);
+       }
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       //dbg("B:lnr: %d |free_size: %d node's size: %d\n", cheb->lnr, cheb->free_size, le32toh(vnode->length));
+       chfs_change_size_free(chmp, cheb, -le32toh(vnode->length));
+       chfs_change_size_used(chmp, cheb, le32toh(vnode->length));
+       mutex_exit(&chmp->chm_lock_sizes);
+
+       KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+
+       KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+       //dbg(" A: free_size: %d\n", cheb->free_size);
+
+       /*dbg("vnode dump:\n");
+         dbg(" ->magic:    0x%x\n", le16toh(vnode->magic));
+         dbg(" ->type:     %d\n", le16toh(vnode->type));
+         dbg(" ->length:   %d\n", le32toh(vnode->length));
+         dbg(" ->hdr_crc:  0x%x\n", le32toh(vnode->hdr_crc));
+         dbg(" ->vno:      %d\n", le64toh(vnode->vno));
+         dbg(" ->version:  %ld\n", le64toh(vnode->version));
+         dbg(" ->uid:      %d\n", le16toh(vnode->uid));
+         dbg(" ->gid:      %d\n", le16toh(vnode->gid));
+         dbg(" ->mode:     %d\n", le32toh(vnode->mode));
+         dbg(" ->dn_size:  %d\n", le32toh(vnode->dn_size));
+         dbg(" ->atime:    %d\n", le32toh(vnode->atime));
+         dbg(" ->mtime:    %d\n", le32toh(vnode->mtime));
+         dbg(" ->ctime:    %d\n", le32toh(vnode->ctime));
+         dbg(" ->dsize:    %d\n", le32toh(vnode->dsize));
+         dbg(" ->node_crc: 0x%x\n", le32toh(vnode->node_crc));*/
+
+       return CHFS_NODE_OK;
+}
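+
+/*
+ * The version handling above keeps only the newest vnode node: an
+ * incoming node replaces the cached one iff its version is higher,
+ * otherwise its flash space is accounted as dirty.  A one-function
+ * model of that rule (names are made up):
+ */
+#if 0
+#include <stdint.h>
+
+static int
+toy_keep_newer(uint64_t *cached_version, uint64_t incoming)
+{
+       if (incoming > *cached_version) {
+               *cached_version = incoming;
+               return 1;       /* adopt the incoming node */
+       }
+       return 0;               /* obsolete: account space as dirty */
+}
+#endif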
+
+int
+chfs_scan_mark_dirent_obsolete(struct chfs_mount *chmp,
+    struct chfs_vnode_cache *vc, struct chfs_dirent *fd)
+{
+       //int size;
+       struct chfs_eraseblock *cheb;
+       struct chfs_node_ref *prev, *nref;
+
+       nref = fd->nref;
+       cheb = &chmp->chm_blocks[fd->nref->nref_lnr];
+
+       /* Remove dirent's node ref from vnode cache */
+       prev = vc->dirents;
+       if (prev && prev == nref) {
+               vc->dirents = prev->nref_next;
+       } else if (prev && prev != (void *)vc) {
+               while (prev->nref_next && prev->nref_next !=
+                   (void *)vc && prev->nref_next != nref) {
+                       prev = prev->nref_next;
+               }
+
+               if (prev->nref_next == nref) {
+                       prev->nref_next = nref->nref_next;
+               }
+       }
+       /*dbg("XXX - start\n");
+       //nref = vc->dirents;
+       struct chfs_dirent *tmp;
+       tmp = vc->scan_dirents;
+       while (tmp) {
+       dbg(" ->tmp->name:    %s\n", tmp->name);
+       dbg(" ->tmp->version: %ld\n", tmp->version);
+       dbg(" ->tmp->vno: %d\n", tmp->vno);
+       tmp = tmp->next;
+       }
+       dbg("XXX - end\n");*/
+       //size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize);
+
+       KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size +
+           cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+       return 0;
+}
+
+void
+chfs_add_fd_to_list(struct chfs_mount *chmp,
+    struct chfs_dirent *new, struct chfs_vnode_cache *pvc)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       int size;
+       struct chfs_eraseblock *cheb, *oldcheb;
+//     struct chfs_dirent **prev;
+       struct chfs_dirent *fd, *tmpfd;
+
+       dbg("adding fd to list: %s\n", new->name);
+
+       if (new->version > pvc->highest_version)
+               pvc->highest_version = new->version;
+
+       size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) +
+           new->nsize);
+       cheb = &chmp->chm_blocks[new->nref->nref_lnr];
+
+       mutex_enter(&chmp->chm_lock_sizes);     
+       TAILQ_FOREACH_SAFE(fd, &pvc->scan_dirents, fds, tmpfd) {
+               if (fd->nhash > new->nhash) {
+                       /* insert new before fd */
+                       TAILQ_INSERT_BEFORE(fd, new, fds);
+                       goto out;
+               } else if (fd->nhash == new->nhash &&
+                   !strcmp(fd->name, new->name)) {
+                       if (new->version > fd->version) {
+//                             new->next = fd->next;
+                               /* replace fd with new */
+                               TAILQ_INSERT_BEFORE(fd, new, fds);
+                               chfs_change_size_free(chmp, cheb, -size);
+                               chfs_change_size_used(chmp, cheb, size);
+
+                               TAILQ_REMOVE(&pvc->scan_dirents, fd, fds);
+                               if (fd->nref) {
+                                       size = CHFS_PAD(sizeof(struct chfs_flash_dirent_node) + fd->nsize);
+                                       chfs_scan_mark_dirent_obsolete(chmp, pvc, fd);
+                                       oldcheb = &chmp->chm_blocks[fd->nref->nref_lnr];
+                                       chfs_change_size_used(chmp, oldcheb, -size);
+                                       chfs_change_size_dirty(chmp, oldcheb, size);
+                               }
+                               chfs_free_dirent(fd);
+//                             *prev = new;//XXX
+                       } else {
+                               chfs_scan_mark_dirent_obsolete(chmp, pvc, new);
+                               chfs_change_size_free(chmp, cheb, -size);
+                               chfs_change_size_dirty(chmp, cheb, size);
+                               chfs_free_dirent(new);
+                       }
+                       /*dbg("START\n");
+                         fd = pvc->scan_dirents;
+                         while (fd) {
+                         dbg("dirent dump:\n");
+                         dbg(" ->vno:     %d\n", fd->vno);
+                         dbg(" ->version: %ld\n", fd->version);
+                         dbg(" ->nhash:   0x%x\n", fd->nhash);
+                         dbg(" ->nsize:   %d\n", fd->nsize);
+                         dbg(" ->name:    %s\n", fd->name);
+                         dbg(" ->type:    %d\n", fd->type);
+                         fd = fd->next;
+                         }
+                         dbg("END\n");*/
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       return;
+               }
+       }
+       /* if it didn't fit anywhere else, add it to the end */
+       TAILQ_INSERT_TAIL(&pvc->scan_dirents, new, fds);
+
+out:
+       //dbg("B:lnr: %d |free_size: %d size: %d\n", cheb->lnr, cheb->free_size, size);
+       chfs_change_size_free(chmp, cheb, -size);
+       chfs_change_size_used(chmp, cheb, size);
+       mutex_exit(&chmp->chm_lock_sizes);
+
+       KASSERT(cheb->used_size <= chmp->chm_ebh->eb_size);
+       //dbg(" A: free_size: %d\n", cheb->free_size);
+
+       KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size + cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+
+//     fd = pvc->scan_dirents;
+       /*dbg("START\n");
+         while (fd) {
+         dbg("dirent dump:\n");
+         dbg(" ->vno:     %d\n", fd->vno);
+         dbg(" ->version: %ld\n", fd->version);
+         dbg(" ->nhash:   0x%x\n", fd->nhash);
+         dbg(" ->nsize:   %d\n", fd->nsize);
+         dbg(" ->name:    %s\n", fd->name);
+         dbg(" ->type:    %d\n", fd->type);
+         fd = fd->next;
+         }
+         dbg("END\n");*/
+}
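+
+/*
+ * Reduced model of the insertion policy above: dirents stay sorted
+ * by name hash, and on a hash+name match only the entry with the
+ * higher version survives.  A singly linked list stands in for the
+ * TAILQ; size accounting and obsoletion are omitted.  Illustrative
+ * only.
+ */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+struct toy_fd {
+       struct toy_fd *next;
+       uint32_t nhash;
+       uint64_t version;
+       char name[16];
+};
+
+static void
+toy_add_fd(struct toy_fd **head, struct toy_fd *new)
+{
+       struct toy_fd **pp = head;
+
+       while (*pp && (*pp)->nhash <= new->nhash) {
+               struct toy_fd *fd = *pp;
+
+               if (fd->nhash == new->nhash &&
+                   strcmp(fd->name, new->name) == 0) {
+                       if (new->version > fd->version) {
+                               new->next = fd->next;   /* replace fd */
+                               *pp = new;
+                       }
+                       return;                 /* older new: dropped */
+               }
+               pp = &fd->next;
+       }
+       new->next = *pp;        /* insert before the first larger hash */
+       *pp = new;
+}
+#endif
+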
+/**
+ * chfs_scan_check_dirent_node - check dirent node crc and add to parent's vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock information
+ * @buf: directory entry to check
+ * @ofs: offset in eraseblock where dirent starts
+ */
+int
+chfs_scan_check_dirent_node(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+       int err, namelen;
+       uint32_t crc;
+       struct chfs_dirent *fd;
+       struct chfs_vnode_cache *vc;
+       struct chfs_flash_dirent_node *dirent = buf;
+
+       //struct chfs_node_ref *tmp;
+
+       crc = crc32(0, (uint8_t *)dirent, sizeof(*dirent) - 4);
+       if (crc != le32toh(dirent->node_crc)) {
+               err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length));
+               if (err)
+                       return err;
+               return CHFS_NODE_BADCRC;
+       }
+       namelen = dirent->nsize;
+
+       fd = chfs_alloc_dirent(namelen + 1);
+       if (!fd)
+               return ENOMEM;
+
+       fd->nref = chfs_alloc_node_ref(cheb);
+       if (!fd->nref) {
+               chfs_free_dirent(fd);
+               return ENOMEM;
+       }
+
+       KASSERT(fd->nref->nref_lnr == cheb->lnr);
+
+       memcpy(&fd->name, dirent->name, namelen);
+       fd->nsize = namelen;
+       fd->name[namelen] = 0;
+       crc = crc32(0, fd->name, dirent->nsize);
+       if (crc != le32toh(dirent->name_crc)) {
+               chfs_err("Directory entry's name has bad crc: read: 0x%x, "
+                   "calculated: 0x%x\n", le32toh(dirent->name_crc), crc);
+               chfs_free_dirent(fd);
+               err = chfs_update_eb_dirty(chmp, cheb, le32toh(dirent->length));
+               if (err)
+                       return err;
+               return CHFS_NODE_BADNAMECRC;
+       }
+
+       /* Check vnode_cache of parent node */
+       mutex_enter(&chmp->chm_lock_vnocache);
+       vc = chfs_scan_make_vnode_cache(chmp, le64toh(dirent->pvno));
+       mutex_exit(&chmp->chm_lock_vnocache);
+       if (!vc) {
+               chfs_free_dirent(fd);
+               return ENOMEM;
+       }
+
+       fd->nref->nref_offset = ofs;
+
+       dbg("add dirent to #%llu\n", (unsigned long long)vc->vno);
+       chfs_add_node_to_list(chmp, vc, fd->nref, &vc->dirents);
+       /*tmp = vc->dirents;
+         dbg("START|vno: %d dirents dump\n", vc->vno);
+         while (tmp) {
+         dbg(" ->nref->nref_lnr:    %d\n", tmp->lnr);
+         dbg(" ->nref->nref_offset: %d\n", tmp->offset);
+         tmp = tmp->next;
+         }
+         dbg("  END|vno: %d dirents dump\n", vc->vno);*/
+
+//     fd->next = NULL;
+       fd->vno = le64toh(dirent->vno);
+       fd->version = le64toh(dirent->version);
+       fd->nhash = hash32_buf(fd->name, namelen, HASH32_BUF_INIT);
+       fd->type = dirent->dtype;
+
+       /*dbg("dirent dump:\n");
+         dbg(" ->vno:     %d\n", fd->vno);
+         dbg(" ->version: %ld\n", fd->version);
+         dbg(" ->nhash:   0x%x\n", fd->nhash);
+         dbg(" ->nsize:   %d\n", fd->nsize);
+         dbg(" ->name:    %s\n", fd->name);
+         dbg(" ->type:    %d\n", fd->type);*/
+
+       chfs_add_fd_to_list(chmp, fd, vc);
+
+       /*struct chfs_node_ref *tmp;
+         tmp = vc->dirents;
+         dbg("START|vno: %d dirents dump\n", vc->vno);
+         while (tmp) {
+         dbg(" ->nref->nref_lnr:    %d\n", tmp->lnr);
+         dbg(" ->nref->nref_offset: %d\n", tmp->offset);
+         tmp = tmp->next;
+         }
+         dbg("  END|vno: %d dirents dump\n", vc->vno);*/
+
+       /*dbg("dirent dump:\n");
+         dbg(" ->magic:    0x%x\n", le16toh(dirent->magic));
+         dbg(" ->type:     %d\n", le16toh(dirent->type));
+         dbg(" ->length:   %d\n", le32toh(dirent->length));
+         dbg(" ->hdr_crc:  0x%x\n", le32toh(dirent->hdr_crc));
+         dbg(" ->vno:      %d\n", le64toh(dirent->vno));
+         dbg(" ->pvno:     %d\n", le64toh(dirent->pvno));
+         dbg(" ->version:  %ld\n", le64toh(dirent->version));
+         dbg(" ->mctime:   %d\n", le32toh(dirent->mctime));
+         dbg(" ->nsize:    %d\n", dirent->nsize);
+         dbg(" ->dtype:    %d\n", dirent->dtype);
+         dbg(" ->name_crc: 0x%x\n", le32toh(dirent->name_crc));
+         dbg(" ->node_crc: 0x%x\n", le32toh(dirent->node_crc));
+         dbg(" ->name:     %s\n", dirent->name);*/
+
+       return CHFS_NODE_OK;
+}
+
+/**
+ * chfs_scan_check_data_node - check data node crc and add to vnode cache
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock informations
+ * @buf: data node to check
+ * @ofs: offset in eraseblock where data node starts
+ */
+int
+chfs_scan_check_data_node(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb, void *buf, off_t ofs)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       int err;
+       uint32_t crc, vno;
+       struct chfs_node_ref *nref;
+       struct chfs_vnode_cache *vc;
+       struct chfs_flash_data_node *dnode = buf;
+
+       crc = crc32(0, (uint8_t *)dnode, sizeof(struct chfs_flash_data_node) - 4);
+       if (crc != le32toh(dnode->node_crc)) {
+               err = chfs_update_eb_dirty(chmp, cheb, le32toh(dnode->length));
+               if (err)
+                       return err;
+               return CHFS_NODE_BADCRC;
+       }
+       /**
+        * Don't check data nodes crc and version here, it will be done in
+        * the background GC thread.
+        */
+       nref = chfs_alloc_node_ref(cheb);
+       if (!nref)
+               return ENOMEM;
+
+       nref->nref_offset = ofs | CHFS_UNCHECKED_NODE_MASK;
+
+       KASSERT(nref->nref_lnr == cheb->lnr);
+
+       vno = le64toh(dnode->vno);
+       mutex_enter(&chmp->chm_lock_vnocache);
+       vc = chfs_vnode_cache_get(chmp, vno);
+       if (!vc) {
+               vc = chfs_scan_make_vnode_cache(chmp, vno);
+               if (!vc) {
+                       mutex_exit(&chmp->chm_lock_vnocache);
+                       return ENOMEM;
+               }
+       }
+       mutex_exit(&chmp->chm_lock_vnocache);
+       chfs_add_node_to_list(chmp, vc, nref, &vc->dnode);
+
+       dbg("chmpfree: %u, chebfree: %u, dnode: %u\n", chmp->chm_free_size, cheb->free_size, dnode->length);
+
+       mutex_enter(&chmp->chm_lock_sizes);
+       chfs_change_size_free(chmp, cheb, -dnode->length);
+       chfs_change_size_unchecked(chmp, cheb, dnode->length);
+       mutex_exit(&chmp->chm_lock_sizes);
+       return CHFS_NODE_OK;
+}
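+
+/*
+ * Node state such as "unchecked" is packed into flag bits of
+ * nref_offset (see CHFS_UNCHECKED_NODE_MASK above), with
+ * CHFS_GET_OFS() recovering the plain offset.  Generic sketch of the
+ * pack/unpack idiom; the mask values here are placeholders, not the
+ * real chfs layout.
+ */
+#if 0
+#include <stdint.h>
+
+#define TOY_OFS_MASK   0xfffffffcU     /* placeholder layout */
+#define TOY_UNCHECKED  0x00000001U
+
+static uint32_t
+toy_get_ofs(uint32_t v)
+{
+       return v & TOY_OFS_MASK;
+}
+
+static uint32_t
+toy_mark_unchecked(uint32_t v)
+{
+       return toy_get_ofs(v) | TOY_UNCHECKED;
+}
+#endif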
+
+/**
+ * chfs_scan_classify_cheb - determine eraseblock's state
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock to classify
+ */
+int
+chfs_scan_classify_cheb(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb)
+{
+       if (cheb->free_size == chmp->chm_ebh->eb_size)
+               return CHFS_BLK_STATE_FREE;
+       else if (cheb->dirty_size < MAX_DIRTY_TO_CLEAN)
+               return CHFS_BLK_STATE_CLEAN;
+       else if (cheb->used_size || cheb->unchecked_size)
+               return CHFS_BLK_STATE_PARTDIRTY;
+       else
+               return CHFS_BLK_STATE_ALLDIRTY;
+}
+
+
+/**
+ * chfs_scan_eraseblock - scans an eraseblock looking for nodes
+ * @chmp: CHFS main descriptor structure
+ * @cheb: eraseblock to scan
+ *
+ * This function scans a whole eraseblock, checks the nodes on it and adds
+ * them to the vnode cache.
+ * Returns the eraseblock state on success, or an error code on failure.
+ */
+int
+chfs_scan_eraseblock(struct chfs_mount *chmp,
+    struct chfs_eraseblock *cheb)
+{
+
+       int err;
+       size_t len, retlen;
+       off_t ofs = 0;
+       int lnr = cheb->lnr;
+       u_char *buf;
+       struct chfs_flash_node_hdr *nhdr;
+       int read_free = 0;
+       struct chfs_node_ref *nref;
+
+
+       dbg("scanning eraseblock content: %d free_size: %d\n", cheb->lnr, cheb->free_size);
+       dbg("scanned physical block: %d\n", chmp->chm_ebh->lmap[lnr]);
+       buf = kmem_alloc(CHFS_MAX_NODE_SIZE, KM_SLEEP);
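+       /* XXX buf is never freed: every return path below leaks it */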
+
+       while ((ofs + CHFS_NODE_HDR_SIZE) < chmp->chm_ebh->eb_size) {
+               memset(buf, 0 , CHFS_MAX_NODE_SIZE);
+               err = chfs_read_leb(chmp,
+                   lnr, buf, ofs, CHFS_NODE_HDR_SIZE, &retlen);
+               if (err) {
+                       return err;
+               }
+
+               if (retlen != CHFS_NODE_HDR_SIZE) {
+                       chfs_err("Error reading node header: "
+                           "read: %zu instead of: %zu\n",
+                           retlen, CHFS_NODE_HDR_SIZE);
+                       return EIO;
+               }
+
+               /*
+                * First check whether the buffer we read is full of 0xff; if
+                * so, the block's remaining area may be free.  We increase
+                * read_free and stop reading the block once it reaches
+                * MAX_READ_FREE.
+                */
+               if (check_pattern(buf, 0xff, 0, CHFS_NODE_HDR_SIZE)) {
+                       read_free += CHFS_NODE_HDR_SIZE;
+                       if (read_free >= MAX_READ_FREE(chmp)) {
+                               dbg("rest of the block is free. Size: %d\n", cheb->free_size);
+                               return chfs_scan_classify_cheb(chmp, cheb);
+                       }
+                       ofs += CHFS_NODE_HDR_SIZE;
+                       continue;
+               } else {
+                       chfs_update_eb_dirty(chmp, cheb, read_free);
+                       read_free = 0;
+               }
+
+               nhdr = (struct chfs_flash_node_hdr *)buf;
+
+               err = chfs_scan_check_node_hdr(nhdr);
+               if (err) {
+                       dbg("node hdr error\n");
+                       err = chfs_update_eb_dirty(chmp, cheb, 4);
+                       if (err) {
+                               return err;
+                       }
+
+                       ofs += 4;
+                       continue;
+               }
+               ofs += CHFS_NODE_HDR_SIZE;
+               if (ofs > chmp->chm_ebh->eb_size) {
+                       chfs_err("Second part of node is on the next eraseblock.\n");
+                       return EIO;
+               }
+               switch (le16toh(nhdr->type)) {
+               case CHFS_NODETYPE_VNODE:
+                       /* Read up the node */
+                       //dbg("nodetype vnode\n");
+                       len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+                       err = chfs_read_leb(chmp,
+                           lnr, buf + CHFS_NODE_HDR_SIZE,
+                           ofs, len,  &retlen);
+                       if (err) {
+                               return err;
+                       }
+
+                       if (retlen != len) {
+                               chfs_err("Error reading vnode: read: %zu instead of: %zu\n",
+                                   retlen, len);
+                               return EIO;
+                       }
+                       KASSERT(lnr == cheb->lnr);
+                       err = chfs_scan_check_vnode(chmp,
+                           cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+                       if (err) {
+                               return err;
+                       }
+
+                       //dbg("XXX5end\n");
+                       break;
+               case CHFS_NODETYPE_DIRENT:
+                       /* Read up the node */
+                       //dbg("nodetype dirent\n");
+                       len = le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+
+                       err = chfs_read_leb(chmp,
+                           lnr, buf + CHFS_NODE_HDR_SIZE,
+                           ofs, len, &retlen);
+                       if (err) {
+                               return err;
+                       }
+
+                       if (retlen != len) {
+                               chfs_err("Error reading dirent node: read: %zu "
+                                   "instead of: %zu\n", len, retlen);
+                               return EIO;
+                       }
+
+                       KASSERT(lnr == cheb->lnr);
+
+                       err = chfs_scan_check_dirent_node(chmp,
+                           cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+                       if (err) {
+                               return err;
+                       }
+
+                       //dbg("XXX6end\n");
+                       break;
+               case CHFS_NODETYPE_DATA:
+                       //dbg("nodetype data\n");
+                       len = sizeof(struct chfs_flash_data_node) -
+                           CHFS_NODE_HDR_SIZE;
+                       err = chfs_read_leb(chmp,
+                           lnr, buf + CHFS_NODE_HDR_SIZE,
+                           ofs, len, &retlen);
+                       if (err) {
+                               return err;
+                       }
+
+                       if (retlen != len) {
+                               chfs_err("Error reading data node: read: %zu "
+                                   "instead of: %zu\n", len, retlen);
+                               return EIO;
+                       }
+                       KASSERT(lnr == cheb->lnr);
+                       err = chfs_scan_check_data_node(chmp,
+                           cheb, buf, ofs - CHFS_NODE_HDR_SIZE);
+                       if (err)
+                               return err;
+
+                       //dbg("XXX7end\n");
+                       break;
+               case CHFS_NODETYPE_PADDING:
+                       //dbg("nodetype padding\n");
+                       //dbg("padding len: %d\n", le32toh(nhdr->length));
+                       //dbg("BEF: cheb->free_size: %d\n", cheb->free_size);
+                       nref = chfs_alloc_node_ref(cheb);
+                       nref->nref_offset = ofs - CHFS_NODE_HDR_SIZE;
+                       nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+                           CHFS_OBSOLETE_NODE_MASK;
+
+                       err = chfs_update_eb_dirty(chmp, cheb,
+                           le32toh(nhdr->length));
+                       //dbg("AFT: cheb->free_size: %d\n", cheb->free_size);
+                       if (err)
+                               return err;
+
+                       //dbg("XXX8end\n");
+                       break;
+               default:
+                       //dbg("nodetype ? (default)\n");
+                       /* Unknown node type, update dirty and skip */
+                       err = chfs_update_eb_dirty(chmp, cheb,
+                           le32toh(nhdr->length));
+                       if (err)
+                               return err;
+
+                       //dbg("XXX9end\n");
+                       break;
+               }
+               ofs += le32toh(nhdr->length) - CHFS_NODE_HDR_SIZE;
+       }
+
+       KASSERT(cheb->used_size + cheb->free_size + cheb->dirty_size +
+           cheb->unchecked_size + cheb->wasted_size == chmp->chm_ebh->eb_size);
+
+       //dbg("XXX10\n");
+       return chfs_scan_classify_cheb(chmp, cheb);
+}
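+
+/*
+ * Sketch of the check_pattern() test used in the scan loop above:
+ * freshly erased flash reads back as all 0xff, so a header-sized
+ * window of 0xff bytes suggests the rest of the block is free.  The
+ * helper below is an illustrative stand-in, not the chfs one.
+ */
+#if 0
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+static bool
+toy_check_pattern(const uint8_t *buf, uint8_t pat, size_t ofs, size_t len)
+{
+       for (size_t i = ofs; i < ofs + len; i++)
+               if (buf[i] != pat)
+                       return false;
+       return true;
+}
+#endif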
diff --git a/sys/ufs/chfs/chfs_subr.c b/sys/ufs/chfs/chfs_subr.c
new file mode 100644 (file)
index 0000000..00cd82f
--- /dev/null
@@ -0,0 +1,540 @@
+/*     $NetBSD: chfs_subr.c,v 1.2 2011/11/24 21:09:37 agc Exp $        */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Efficient memory file system supporting functions.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/kmem.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/systm.h>
+#include <sys/swap.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/proc.h>
+#include <sys/atomic.h>
+
+#include <uvm/uvm.h>
+
+#include <miscfs/specfs/specdev.h>
+#include "chfs.h"
+//#include <fs/chfs/chfs_vnops.h>
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Returns information about the number of available memory pages,
+ * including physical and virtual ones.
+ *
+ * If 'total' is true, the value returned is the total amount of memory
+ * pages configured for the system (either in use or free).
+ * If it is false, the value returned is the amount of free memory pages.
+ *
+ * Remember to remove DUMMYFS_PAGES_RESERVED from the returned value to avoid
+ * excessive memory usage.
+ *
+ */
+size_t
+chfs_mem_info(bool total)
+{
+       size_t size;
+
+       size = 0;
+       size += uvmexp.swpgavail;
+       if (!total) {
+               size -= uvmexp.swpgonly;
+       }
+       size += uvmexp.free;
+       size += uvmexp.filepages;
+       if (size > uvmexp.wired) {
+               size -= uvmexp.wired;
+       } else {
+               size = 0;
+       }
+
+       return size;
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Looks for a directory entry in the directory represented by node.
+ * 'cnp' describes the name of the entry to look for.  Note that the .
+ * and .. components are not allowed as they do not physically exist
+ * within directories.
+ *
+ * Returns a pointer to the entry when found, otherwise NULL.
+ */
+struct chfs_dirent *
+chfs_dir_lookup(struct chfs_inode *ip, struct componentname *cnp)
+{
+       bool found;
+       struct chfs_dirent *fd;
+       dbg("dir_lookup()\n");
+
+       KASSERT(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
+       KASSERT(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
+                   cnp->cn_nameptr[1] == '.')));
+       //CHFS_VALIDATE_DIR(node);
+
+       //node->chn_status |= CHFS_NODE_ACCESSED;
+
+       found = false;
+//     fd = ip->dents;
+//     while(fd) {
+       TAILQ_FOREACH(fd, &ip->dents, fds) {
+               KASSERT(cnp->cn_namelen < 0xffff);
+               if (fd->vno == 0)
+                       continue;
+               /*dbg("dirent dump:\n");
+                 dbg(" ->vno:     %d\n", fd->vno);
+                 dbg(" ->version: %ld\n", fd->version);
+                 dbg(" ->nhash:   0x%x\n", fd->nhash);
+                 dbg(" ->nsize:   %d\n", fd->nsize);
+                 dbg(" ->name:    %s\n", fd->name);
+                 dbg(" ->type:    %d\n", fd->type);*/
+               if (fd->nsize == (uint16_t)cnp->cn_namelen &&
+                   memcmp(fd->name, cnp->cn_nameptr, fd->nsize) == 0) {
+                       found = true;
+                       break;
+               }
+//             fd = fd->next;
+       }
+
+       return found ? fd : NULL;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_filldir(struct uio* uio, ino_t ino, const char *name,
+    int namelen, enum vtype type)
+{
+       struct dirent dent;
+       int error;
+
+       memset(&dent, 0, sizeof(dent));
+
+       dent.d_fileno = ino;
+       switch (type) {
+       case VBLK:
+               dent.d_type = DT_BLK;
+               break;
+
+       case VCHR:
+               dent.d_type = DT_CHR;
+               break;
+
+       case VDIR:
+               dent.d_type = DT_DIR;
+               break;
+
+       case VFIFO:
+               dent.d_type = DT_FIFO;
+               break;
+
+       case VLNK:
+               dent.d_type = DT_LNK;
+               break;
+
+       case VREG:
+               dent.d_type = DT_REG;
+               break;
+
+       case VSOCK:
+               dent.d_type = DT_SOCK;
+               break;
+
+       default:
+               KASSERT(0);
+       }
+       dent.d_namlen = namelen;
+       (void)memcpy(dent.d_name, name, dent.d_namlen);
+       dent.d_reclen = _DIRENT_SIZE(&dent);
+
+       if (dent.d_reclen > uio->uio_resid) {
+               error = -1;
+       } else {
+               error = uiomove(&dent, dent.d_reclen, uio);
+       }
+
+       return error;
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change size of the given vnode.
+ * Caller should execute chfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+chfs_chsize(struct vnode *vp, u_quad_t size, kauth_cred_t cred)
+{
+       struct chfs_mount *chmp;
+       struct chfs_inode *ip;
+       struct buf *bp;
+       int blknum, append;
+       int error = 0;
+       char *buf = NULL;
+       struct chfs_full_dnode *fd;
+
+       ip = VTOI(vp);
+       chmp = ip->chmp;
+
+       dbg("chfs_chsize\n");
+
+       switch (vp->v_type) {
+       case VDIR:
+               return EISDIR;
+       case VLNK:
+       case VREG:
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return EROFS;
+               break;
+       case VBLK:
+       case VCHR:
+       case VFIFO:
+               return 0;
+       default:
+               return EOPNOTSUPP; /* XXX why not ENODEV? */
+       }
+
+       vflushbuf(vp, 0);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+       chfs_flush_pending_wbuf(chmp);
+
+       /* handle truncate to zero as a special case */
+       if (size == 0) {
+               dbg("truncate to zero");
+               chfs_truncate_fragtree(ip->chmp,
+                   &ip->fragtree, size);
+               chfs_set_vnode_size(vp, size);
+
+               mutex_exit(&chmp->chm_lock_mountfields);
+
+               return 0;
+       }
+
+
+       /* allocate zeros for the new data */
+       buf = kmem_zalloc(size, KM_SLEEP);
+       bp = getiobuf(vp, true);
+
+       if (ip->size != 0) {
+               /* read the whole data */
+               bp->b_blkno = 0;
+               bp->b_bufsize = bp->b_resid = bp->b_bcount = ip->size;
+               bp->b_data = kmem_alloc(ip->size, KM_SLEEP);
+
+               error = chfs_read_data(chmp, vp, bp);
+               if (error) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       putiobuf(bp);
+
+                       return error;
+               }
+
+               /* create the new data */
+               dbg("create new data vap%llu ip%llu\n",
+                       (unsigned long long)size, (unsigned long long)ip->size);
+               append = size - ip->size;
+               if (append > 0) {
+                       memcpy(buf, bp->b_data, ip->size);
+               } else {
+                       memcpy(buf, bp->b_data, size);
+                       chfs_truncate_fragtree(ip->chmp,
+                               &ip->fragtree, size);
+               }
+
+               kmem_free(bp->b_data, ip->size);
+
+               struct chfs_node_frag *lastfrag = frag_last(&ip->fragtree);
+               fd = lastfrag->node;
+               chfs_mark_node_obsolete(chmp, fd->nref);
+
+               blknum = lastfrag->ofs / PAGE_SIZE;
+               lastfrag->size = append > PAGE_SIZE ? PAGE_SIZE : size % PAGE_SIZE;
+       } else {
+               fd = chfs_alloc_full_dnode();
+               blknum = 0;
+       }
+
+       chfs_set_vnode_size(vp, size);
+
+       // write the new data
+       for (bp->b_blkno = blknum; bp->b_blkno * PAGE_SIZE < size; bp->b_blkno++) {
+               uint64_t writesize = MIN(size - bp->b_blkno * PAGE_SIZE, PAGE_SIZE);
+
+               bp->b_bufsize = bp->b_resid = bp->b_bcount = writesize;
+               bp->b_data = kmem_alloc(writesize, KM_SLEEP);
+
+               memcpy(bp->b_data, buf + (bp->b_blkno * PAGE_SIZE), writesize);
+
+               if (bp->b_blkno != blknum) {
+                       fd = chfs_alloc_full_dnode();
+               }
+
+               error = chfs_write_flash_dnode(chmp, vp, bp, fd);
+               if (error) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       kmem_free(bp->b_data, writesize);
+                       putiobuf(bp);
+
+                       return error;
+               }
+               if (bp->b_blkno != blknum) {
+                       chfs_add_full_dnode_to_inode(chmp, ip, fd);
+               }
+               kmem_free(bp->b_data, writesize);
+       }
+
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       kmem_free(buf, size);
+       putiobuf(bp);
+
+       return 0;
+}
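+
+/*
+ * The write-back loop above pushes the new contents out one page at
+ * a time.  Minimal standalone sketch of the chunking arithmetic,
+ * with an assumed 4096-byte page and example size:
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+       const uint64_t page = 4096, size = 10000;       /* example values */
+
+       for (uint64_t blkno = 0; blkno * page < size; blkno++) {
+               uint64_t writesize = size - blkno * page;
+
+               if (writesize > page)
+                       writesize = page;       /* full page except the tail */
+               printf("blk %llu: %llu bytes\n",
+                   (unsigned long long)blkno,
+                   (unsigned long long)writesize);
+       }
+       return 0;
+}
+#endif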
+#if 0
+       int error;
+       struct chfs_node *node;
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       node = VP_TO_CHFS_NODE(vp);
+
+       // Decide whether this is a valid operation based on the file type.
+       error = 0;
+       switch (vp->v_type) {
+       case VDIR:
+               return EISDIR;
+
+       case VREG:
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return EROFS;
+               break;
+
+       case VBLK:
+       case VCHR:
+       case VFIFO:
+               // Allow modifications of special files even if in the file
+               // system is mounted read-only (we are not modifying the
+               // files themselves, but the objects they represent).
+               return 0;
+
+       default:
+               return ENODEV;
+       }
+
+       // Immutable or append-only files cannot be modified, either.
+       if (node->chn_flags & (IMMUTABLE | APPEND))
+               return EPERM;
+
+       error = chfs_truncate(vp, size);
+       // chfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
+       // for us, as will update dn_status; no need to do that here.
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       return error;
+#endif
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change flags of the given vnode.
+ * Caller should execute chfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+chfs_chflags(struct vnode *vp, int flags, kauth_cred_t cred)
+{
+       struct chfs_mount *chmp;
+       struct chfs_inode *ip;
+       int error = 0;
+
+       ip = VTOI(vp);
+       chmp = ip->chmp;
+
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+               return EROFS;
+
+       if (kauth_cred_geteuid(cred) != ip->uid &&
+           (error = kauth_authorize_generic(cred,
+               KAUTH_GENERIC_ISSUSER, NULL)))
+               return error;
+
+       if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+               NULL) == 0) {
+               if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) &&
+                   kauth_authorize_system(curlwp->l_cred,
+                       KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL))
+                       return EPERM;
+
+               if ((flags & SF_SNAPSHOT) !=
+                   (ip->flags & SF_SNAPSHOT))
+                       return EPERM;
+
+               ip->flags = flags;
+       } else {
+               if ((ip->flags & (SF_IMMUTABLE | SF_APPEND)) ||
+                   (flags & UF_SETTABLE) != flags)
+                       return EPERM;
+
+               if ((ip->flags & SF_SETTABLE) !=
+                   (flags & SF_SETTABLE))
+                       return EPERM;
+
+               ip->flags &= SF_SETTABLE;
+               ip->flags |= (flags & UF_SETTABLE);
+       }
+       ip->iflag |= IN_CHANGE;
+       error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+       if (error)
+               return error;
+
+       if (flags & (IMMUTABLE | APPEND))
+               return 0;
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+void
+chfs_itimes(struct chfs_inode *ip, const struct timespec *acc,
+    const struct timespec *mod, const struct timespec *cre)
+{
+       //dbg("itimes\n");
+       struct timespec now;
+
+       if (!(ip->iflag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+               return;
+       }
+
+       vfs_timestamp(&now);
+       if (ip->iflag & IN_ACCESS) {
+               if (acc == NULL)
+                       acc = &now;
+               ip->atime = acc->tv_sec;
+       }
+       if (ip->iflag & (IN_UPDATE | IN_MODIFY)) {
+               if (mod == NULL)
+                       mod = &now;
+               ip->mtime = mod->tv_sec;
+               //ip->i_modrev++;
+       }
+       if (ip->iflag & (IN_CHANGE | IN_MODIFY)) {
+               if (cre == NULL)
+                       cre = &now;
+               ip->ctime = cre->tv_sec;
+       }
+       if (ip->iflag & (IN_ACCESS | IN_MODIFY))
+               ip->iflag |= IN_ACCESSED;
+       if (ip->iflag & (IN_UPDATE | IN_CHANGE))
+               ip->iflag |= IN_MODIFIED;
+       ip->iflag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_update(struct vnode *vp, const struct timespec *acc,
+    const struct timespec *mod, int flags)
+{
+
+       struct chfs_inode *ip;
+
+       /* XXX ufs_reclaim calls this function unlocked! */
+//     KASSERT(VOP_ISLOCKED(vp));
+
+#if 0
+       if (flags & UPDATE_CLOSE)
+               ; /* XXX Need to do anything special? */
+#endif
+
+       ip = VTOI(vp);
+       chfs_itimes(ip, acc, mod, NULL);
+
+//     KASSERT(VOP_ISLOCKED(vp));
+       return (0);
+}
+
+/* --------------------------------------------------------------------- */
+/*
+  int
+  chfs_truncate(struct vnode *vp, off_t length)
+  {
+          bool extended;
+          int error;
+          struct chfs_node *node;
+
+          printf("CHFS: truncate()\n");
+
+          node = VP_TO_CHFS_NODE(vp);
+          extended = length > node->chn_size;
+
+          if (length < 0) {
+                  error = EINVAL;
+                  goto out;
+          }
+
+          if (node->chn_size == length) {
+                  error = 0;
+                  goto out;
+          }
+
+          error = chfs_reg_resize(vp, length);
+          if (error == 0)
+                  node->chn_status |= CHFS_NODE_CHANGED | CHFS_NODE_MODIFIED;
+
+  out:
+          chfs_update(vp, NULL, NULL, 0);
+
+          return error;
+  }*/
+
+
diff --git a/sys/ufs/chfs/chfs_vfsops.c b/sys/ufs/chfs/chfs_vfsops.c
new file mode 100644 (file)
index 0000000..a08d15f
--- /dev/null
@@ -0,0 +1,847 @@
+/*     $NetBSD: chfs_vfsops.c,v 1.2 2011/11/24 21:09:37 agc Exp $      */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/module.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+//XXX needed just for debugging
+#include <sys/fstrans.h>
+#include <sys/sleepq.h>
+#include <sys/lockdebug.h>
+#include <sys/ktrace.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pager.h>
+#include <ufs/ufs/dir.h>
+//#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+#include <miscfs/specfs/specdev.h>
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+//#include </root/xipffs/netbsd.chfs/chfs_args.h>
+#include "chfs.h"
+#include "chfs_args.h"
+
+MODULE(MODULE_CLASS_VFS, chfs, "flash");
+
+/* --------------------------------------------------------------------- */
+/* functions */
+
+static int chfs_mount(struct mount *, const char *, void *, size_t *);
+static int chfs_unmount(struct mount *, int);
+static int chfs_root(struct mount *, struct vnode **);
+static int chfs_vget(struct mount *, ino_t, struct vnode **);
+static int chfs_fhtovp(struct mount *, struct fid *, struct vnode **);
+static int chfs_vptofh(struct vnode *, struct fid *, size_t *);
+static int chfs_start(struct mount *, int);
+static int chfs_statvfs(struct mount *, struct statvfs *);
+static int chfs_sync(struct mount *, int, kauth_cred_t);
+static void chfs_init(void);
+static void chfs_reinit(void);
+static void chfs_done(void);
+static int chfs_snapshot(struct mount *, struct vnode *,
+    struct timespec *);
+
+/* --------------------------------------------------------------------- */
+/* structures */
+
+int
+chfs_gop_alloc(struct vnode *vp, off_t off, off_t len,  int flags,
+    kauth_cred_t cred)
+{
+       return (0);
+}
+
+const struct genfs_ops chfs_genfsops = {
+       .gop_size = genfs_size,
+       .gop_alloc = chfs_gop_alloc,
+       .gop_write = genfs_gop_write,
+       .gop_markupdate = ufs_gop_markupdate,
+};
+
+/*
+static const struct ufs_ops chfs_ufsops = {
+       .uo_itimes = chfs_itimes,
+       .uo_update = chfs_update,
+};
+*/
+
+struct pool chfs_inode_pool;
+
+/* for looking up the major for flash */
+extern const struct cdevsw flash_cdevsw;
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_mount(struct mount *mp,
+    const char *path, void *data, size_t *data_len)
+{
+       struct lwp *l = curlwp;
+       struct nameidata nd;
+       struct pathbuf *pb;
+       struct vnode *devvp = NULL;
+       struct ufs_args *args = data;
+       struct ufsmount *ump = NULL;
+       struct chfs_mount *chmp;
+       int err = 0;
+       int xflags;
+
+       dbg("mount()\n");
+
+       if (*data_len < sizeof *args)
+               return EINVAL;
+
+       if (mp->mnt_flag & MNT_GETARGS) {
+               ump = VFSTOUFS(mp);
+               if (ump == NULL)
+                       return EIO;
+               memset(args, 0, sizeof *args);
+               args->fspec = NULL;
+               *data_len = sizeof *args;
+               return 0;
+       }
+
+       if (mp->mnt_flag & MNT_UPDATE) {
+               /* XXX: There is no support yet to update file system
+                * settings.  Should be added. */
+
+               return ENODEV;
+       }
+
+       /* A device path is required; devvp would otherwise stay NULL. */
+       if (args->fspec == NULL)
+               return EINVAL;
+
+       err = pathbuf_copyin(args->fspec, &pb);
+       if (err) {
+               return err;
+       }
+       /*
+        * Look up the name and verify that it's sane.
+        */
+       NDINIT(&nd, LOOKUP, FOLLOW, pb);
+       if ((err = namei(&nd)) != 0) {
+               pathbuf_destroy(pb);
+               return (err);
+       }
+       pathbuf_destroy(pb);
+       devvp = nd.ni_vp;
+
+       /*
+        * Be sure this is a valid block device
+        */
+       if (devvp->v_type != VBLK)
+               err = ENOTBLK;
+       else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+               err = ENXIO;
+
+       if (err) {
+               vrele(devvp);
+               return (err);
+       }
+
+       if (mp->mnt_flag & MNT_RDONLY)
+               xflags = FREAD;
+       else
+               xflags = FREAD|FWRITE;
+
+       err = VOP_OPEN(devvp, xflags, FSCRED);
+       if (err)
+               goto fail;
+
+
+       err = chfs_mountfs(devvp, mp);
+       if (err) {
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               (void)VOP_CLOSE(devvp, xflags, NOCRED);
+               VOP_UNLOCK(devvp);
+               goto fail;
+       }
+       ump = VFSTOUFS(mp);
+       chmp = ump->um_chfs;
+
+       vfs_getnewfsid(mp);
+       chmp->chm_fsmp = mp;
+
+       return set_statvfs_info(path,
+           UIO_USERSPACE, args->fspec,
+           UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+
+fail:
+       vrele(devvp);
+       return (err);
+}
+
+
+int
+chfs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+       struct lwp *l = curlwp;
+       struct proc *p;
+       kauth_cred_t cred;
+       devmajor_t flash_major;
+       dev_t dev;
+       struct ufsmount* ump = NULL;
+       struct chfs_mount* chmp;
+       struct vnode *vp;
+       int err = 0;
+
+       dbg("mountfs()\n");
+
+       dev = devvp->v_rdev;
+       p = l ? l->l_proc : NULL;
+       cred = l ? l->l_cred : NOCRED;
+
+       /* Flush out any old buffers remaining from a previous use. */
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       err = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (err)
+               return (err);
+
+       flash_major = cdevsw_lookup_major(&flash_cdevsw);
+
+       if (devvp->v_type != VBLK)
+               err = ENOTBLK;
+       else if (bdevsw_lookup(dev) == NULL)
+               err = ENXIO;
+       else if (major(dev) != flash_major) {
+               dbg("major(dev): %d, flash_major: %d\n",
+                   major(dev), flash_major);
+               err = ENODEV;
+       }
+       if (err) {
+               vrele(devvp);
+               return (err);
+       }
+
+       ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK);
+       memset(ump, 0, sizeof(*ump));
+       ump->um_fstype = UFS1;
+       //ump->um_ops = &chfs_ufsops;
+       ump->um_chfs = malloc(sizeof(struct chfs_mount),
+           M_UFSMNT, M_WAITOK);
+       memset(ump->um_chfs, 0, sizeof(struct chfs_mount));
+
+       mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
+
+       /* Get superblock and set flash device number */
+       chmp = ump->um_chfs;
+       if (!chmp)
+               return ENOMEM;
+
+       chmp->chm_ebh = kmem_alloc(sizeof(struct chfs_ebh), KM_SLEEP);
+
+       dbg("[]opening flash: %u\n", (unsigned int)devvp->v_rdev);
+       err = ebh_open(chmp->chm_ebh, devvp->v_rdev);
+       if (err) {
+               dbg("error while opening flash\n");
+               kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+               free(chmp, M_UFSMNT);
+               return err;
+       }
+
+       //TODO check flash sizes
+
+       chmp->chm_gbl_version = 0;
+       chmp->chm_vnocache_hash = chfs_vnocache_hash_init();
+
+       chmp->chm_blocks = kmem_zalloc(chmp->chm_ebh->peb_nr *
+           sizeof(struct chfs_eraseblock), KM_SLEEP);
+
+       if (!chmp->chm_blocks) {
+               ebh_close(chmp->chm_ebh);
+               kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+               free(chmp, M_UFSMNT);
+               return ENOMEM;
+       }
+
+       mutex_init(&chmp->chm_lock_mountfields, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&chmp->chm_lock_sizes, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&chmp->chm_lock_vnocache, MUTEX_DEFAULT, IPL_NONE);
+
+       //XXX
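+       /*
+        * Hardcoded for a 4096-byte block and 2048-byte fragment size:
+        * bmask = ~(4096-1) = -4096, bshift = log2(4096) = 12,
+        * qbmask = 4096-1 = 4095, and fmask/qfmask likewise for 2048.
+        */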
+       chmp->chm_fs_bmask = -4096;
+       chmp->chm_fs_bsize = 4096;
+       chmp->chm_fs_qbmask = 4095;
+       chmp->chm_fs_bshift = 12;
+       chmp->chm_fs_fmask = -2048;
+       chmp->chm_fs_qfmask = 2047;
+
+       chmp->chm_wbuf_pagesize = chmp->chm_ebh->flash_if->page_size;
+       dbg("wbuf size: %zu\n", chmp->chm_wbuf_pagesize);
+       chmp->chm_wbuf = kmem_alloc(chmp->chm_wbuf_pagesize, KM_SLEEP);
+       rw_init(&chmp->chm_lock_wbuf);
+
+       //init queues
+       TAILQ_INIT(&chmp->chm_free_queue);
+       TAILQ_INIT(&chmp->chm_clean_queue);
+       TAILQ_INIT(&chmp->chm_dirty_queue);
+       TAILQ_INIT(&chmp->chm_very_dirty_queue);
+       TAILQ_INIT(&chmp->chm_erasable_pending_wbuf_queue);
+       TAILQ_INIT(&chmp->chm_erase_pending_queue);
+
+       chfs_calc_trigger_levels(chmp);
+
+       chmp->chm_nr_free_blocks = 0;
+       chmp->chm_nr_erasable_blocks = 0;
+       chmp->chm_max_vno = 2;
+       chmp->chm_checked_vno = 2;
+       chmp->chm_unchecked_size = 0;
+       chmp->chm_used_size = 0;
+       chmp->chm_dirty_size = 0;
+       chmp->chm_wasted_size = 0;
+       chmp->chm_free_size = chmp->chm_ebh->eb_size * chmp->chm_ebh->peb_nr;
+       err = chfs_build_filesystem(chmp);
+
+       if (err) {
+               chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash);
+               kmem_free(chmp->chm_blocks, chmp->chm_ebh->peb_nr *
+                   sizeof(struct chfs_eraseblock));
+               ebh_close(chmp->chm_ebh);
+               kmem_free(chmp->chm_ebh, sizeof(struct chfs_ebh));
+               free(chmp, M_UFSMNT);
+               return EIO;
+       }
+
+       mp->mnt_data = ump;
+       mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+       mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CHFS);
+       mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+       mp->mnt_stat.f_namemax = MAXNAMLEN;
+       mp->mnt_flag |= MNT_LOCAL;
+       mp->mnt_fs_bshift = PAGE_SHIFT;
+       mp->mnt_dev_bshift = DEV_BSHIFT;
+       mp->mnt_iflag |= IMNT_MPSAFE;
+       ump->um_flags = 0;
+       ump->um_mountp = mp;
+       ump->um_dev = dev;
+       ump->um_devvp = devvp;
+       ump->um_maxfilesize = 1048512 * 1024;
+       /*TODO fill these fields
+         ump->um_nindir =
+         ump->um_lognindir =
+         ump->um_bptrtodb =
+         ump->um_seqinc =
+         ump->um_maxsymlinklen =
+         ump->um_dirblksiz =
+         ump->um_maxfilesize =
+       */
+
+       /*
+        * Allocate the root vnode.
+        */
+       err = VFS_VGET(mp, CHFS_ROOTINO, &vp);
+       if (err) {
+               dbg("error: %d while allocating root node\n", err);
+               return err;
+       }
+       vput(vp);
+
+       chfs_gc_thread_start(chmp);
+       mutex_enter(&chmp->chm_lock_mountfields);
+       chfs_gc_trigger(chmp);
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       devvp->v_specmountpoint = mp;
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED2 */
+static int
+chfs_unmount(struct mount *mp, int mntflags)
+{
+       int flags = 0, i = 0;
+       struct ufsmount *ump;
+       struct chfs_mount *chmp;
+//     struct chfs_vnode_cache *vc, *next;
+
+       if (mntflags & MNT_FORCE)
+               flags |= FORCECLOSE;
+
+       dbg("[START]\n");
+
+       ump = VFSTOUFS(mp);
+       chmp = ump->um_chfs;
+
+       chfs_gc_thread_stop(chmp);
+
+       (void)vflush(mp, NULLVP, flags);
+
+       if (chmp->chm_wbuf_len) {
+               mutex_enter(&chmp->chm_lock_mountfields);
+               chfs_flush_pending_wbuf(chmp);
+               mutex_exit(&chmp->chm_lock_mountfields);
+       }
+
+       for (i = 0; i < chmp->chm_ebh->peb_nr; i++) {
+               chfs_free_node_refs(&chmp->chm_blocks[i]);
+       }
+
+       chfs_vnocache_hash_destroy(chmp->chm_vnocache_hash);
+
+       ebh_close(chmp->chm_ebh);
+
+       rw_destroy(&chmp->chm_lock_wbuf);
+       mutex_destroy(&chmp->chm_lock_vnocache);
+       mutex_destroy(&chmp->chm_lock_sizes);
+       mutex_destroy(&chmp->chm_lock_mountfields);
+
+       if (ump->um_devvp->v_type != VBAD) {
+               ump->um_devvp->v_specmountpoint = NULL;
+       }
+       vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+       (void)VOP_CLOSE(ump->um_devvp, FREAD|FWRITE, NOCRED);
+       vput(ump->um_devvp);
+
+       mutex_destroy(&ump->um_lock);
+
+       //free(ump->um_chfs, M_UFSMNT);
+       free(ump, M_UFSMNT);
+       mp->mnt_data = NULL;
+       mp->mnt_flag &= ~MNT_LOCAL;
+       dbg("[END]\n");
+       return (0);
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_root(struct mount *mp, struct vnode **vpp)
+{
+       struct vnode *vp;
+       int error;
+
+       if ((error = VFS_VGET(mp, (ino_t)CHFS_ROOTINO, &vp)) != 0)
+               return error;
+       *vpp = vp;
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+extern rb_tree_ops_t frag_rbtree_ops;
+
+static int
+chfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+       struct chfs_mount *chmp;
+       struct chfs_inode *ip;
+       struct ufsmount *ump;
+       struct vnode *vp;
+       dev_t dev;
+       int error;
+       struct chfs_vnode_cache* chvc = NULL;
+       struct chfs_node_ref* nref = NULL;
+       struct buf *bp;
+
+       dbg("vget() | ino: %llu\n", (unsigned long long)ino);
+
+       ump = VFSTOUFS(mp);
+       dev = ump->um_dev;
+retry:
+       if (!vpp) {
+               vpp = kmem_alloc(sizeof(struct vnode*), KM_SLEEP);
+       }
+
+       if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) {
+               return 0;
+       }
+
+       /* Allocate a new vnode/inode. */
+       if ((error = getnewvnode(VT_CHFS,
+                   mp, chfs_vnodeop_p, NULL, &vp)) != 0) {
+               *vpp = NULL;
+               return (error);
+       }
+       ip = pool_get(&chfs_inode_pool, PR_WAITOK);
+
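+       /*
+        * Another thread may have inserted a vnode for this ino while we
+        * were allocating; re-check the hash under chfs_hashlock and back
+        * out the fresh vnode if so.
+        */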
+       mutex_enter(&chfs_hashlock);
+       if ((*vpp = chfs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL) {
+               mutex_exit(&chfs_hashlock);
+               ungetnewvnode(vp);
+               pool_put(&chfs_inode_pool, ip);
+               goto retry;
+       }
+
+       vp->v_vflag |= VV_LOCKSWORK;
+
+       memset(ip, 0, sizeof(*ip));
+       vp->v_data = ip;
+       ip->vp = vp;
+       ip->ump = ump;
+       ip->chmp = chmp = ump->um_chfs;
+       ip->dev = dev;
+       ip->ino = ino;
+       vp->v_mount = mp;
+       genfs_node_init(vp, &chfs_genfsops);
+
+       rb_tree_init(&ip->fragtree, &frag_rbtree_ops);
+       //mutex_init(&ip->inode_lock, MUTEX_DEFAULT, IPL_NONE);
+
+       chfs_ihashins(ip);
+       mutex_exit(&chfs_hashlock);
+
+       // set root inode
+       if (ino == CHFS_ROOTINO) {
+               dbg("SETROOT\n");
+               vp->v_vflag |= VV_ROOT;
+               vp->v_type = VDIR;
+               ip->mode = IFMT | IEXEC | IWRITE | IREAD;
+               ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE);
+               chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+//             ip->dents = NULL; XXXTAILQ
+               TAILQ_INIT(&ip->dents);
+               chfs_set_vnode_size(vp, 512);
+       }
+
+       // set vnode cache
+       mutex_enter(&chmp->chm_lock_vnocache);
+       chvc = chfs_vnode_cache_get(chmp, ino);
+       mutex_exit(&chmp->chm_lock_vnocache);
+       if (!chvc) {
+               dbg("!chvc\n");
+               /* XXX, we can't allocate under a lock, refactor this! */
+               chvc = chfs_vnode_cache_alloc(ino);
+               mutex_enter(&chmp->chm_lock_vnocache);
+               if (ino == CHFS_ROOTINO) {
+                       chvc->nlink = 2;
+                       chvc->pvno = CHFS_ROOTINO;
+                       chfs_vnode_cache_set_state(chmp,
+                           chvc, VNO_STATE_CHECKEDABSENT);
+               }
+               chfs_vnode_cache_add(chmp, chvc);
+               mutex_exit(&chmp->chm_lock_vnocache);
+
+               ip->chvc = chvc;
+               TAILQ_INIT(&ip->dents);
+       } else {
+               dbg("chvc\n");
+               ip->chvc = chvc;
+               // if we have a vnode cache, the node is already on flash, so read it
+               if (ino == CHFS_ROOTINO) {
+                       chvc->pvno = CHFS_ROOTINO;
+                       TAILQ_INIT(&chvc->scan_dirents);
+               } else {
+                       chfs_readvnode(mp, ino, &vp);
+               }
+
+               mutex_enter(&chmp->chm_lock_mountfields);
+               // init type specific things
+               switch (vp->v_type) {
+               case VDIR:
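+                       /*
+                        * The dirent nref chain is terminated by a pointer
+                        * back to the vnode cache itself rather than by
+                        * NULL, hence the cast in the loop condition.
+                        */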
+                       nref = chvc->dirents;
+                       while (nref &&
+                           (struct chfs_vnode_cache *)nref != chvc) {
+                               chfs_readdirent(mp, nref, ip);
+                               nref = nref->nref_next;
+                       }
+                       chfs_set_vnode_size(vp, 512);
+                       break;
+               case VREG:
+               case VSOCK:
+                       //build the fragtree of the vnode
+                       dbg("read_inode_internal | ino: %llu\n",
+                               (unsigned long long)ip->ino);
+                       error = chfs_read_inode(chmp, ip);
+                       if (error) {
+                               vput(vp);
+                               *vpp = NULL;
+                               mutex_exit(&chmp->chm_lock_mountfields);
+                               return (error);
+                       }
+                       break;
+               case VLNK:
+                       //build the fragtree of the vnode
+                       dbg("read_inode_internal | ino: %llu\n",
+                               (unsigned long long)ip->ino);
+                       error = chfs_read_inode_internal(chmp, ip);
+                       if (error) {
+                               vput(vp);
+                               *vpp = NULL;
+                               mutex_exit(&chmp->chm_lock_mountfields);
+                               return (error);
+                       }
+
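+                       /*
+                        * The symlink target is stored as ordinary node
+                        * data: read it into a temporary buffer, then
+                        * keep a copy in ip->target.
+                        */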
+                       dbg("size: %llu\n", (unsigned long long)ip->size);
+                       bp = getiobuf(vp, true);
+                       bp->b_blkno = 0;
+                       bp->b_bufsize = bp->b_resid =
+                           bp->b_bcount = ip->size;
+                       bp->b_data = kmem_alloc(ip->size, KM_SLEEP);
+                       chfs_read_data(chmp, vp, bp);
+                       if (!ip->target)
+                               ip->target = kmem_alloc(ip->size,
+                                   KM_SLEEP);
+                       memcpy(ip->target, bp->b_data, ip->size);
+                       kmem_free(bp->b_data, ip->size);
+                       putiobuf(bp);
+
+                       break;
+               case VCHR:
+               case VBLK:
+               case VFIFO:
+                       //build the fragtree of the vnode
+                       dbg("read_inode_internal | ino: %llu\n",
+                               (unsigned long long)ip->ino);
+                       error = chfs_read_inode_internal(chmp, ip);
+                       if (error) {
+                               vput(vp);
+                               *vpp = NULL;
+                               mutex_exit(&chmp->chm_lock_mountfields);
+                               return (error);
+                       }
+
+                       bp = getiobuf(vp, true);
+                       bp->b_blkno = 0;
+                       bp->b_bufsize = bp->b_resid =
+                           bp->b_bcount = sizeof(dev_t);
+                       bp->b_data = kmem_alloc(sizeof(dev_t), KM_SLEEP);
+                       chfs_read_data(chmp, vp, bp);
+                       memcpy(&ip->rdev,
+                           bp->b_data, sizeof(dev_t));
+                       kmem_free(bp->b_data, sizeof(dev_t));
+                       putiobuf(bp);
+                       if (vp->v_type == VFIFO)
+                               vp->v_op = chfs_fifoop_p;
+                       else {
+                               vp->v_op = chfs_specop_p;
+                               spec_node_init(vp, ip->rdev);
+                       }
+
+                       break;
+               case VNON:
+               case VBAD:
+                       break;
+               }
+               mutex_exit(&chmp->chm_lock_mountfields);
+
+       }
+
+       /* finish inode initialization */
+       ip->devvp = ump->um_devvp;
+       vref(ip->devvp);
+
+       uvm_vnp_setsize(vp, ip->size);
+       *vpp = vp;
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+       return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+       return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_start(struct mount *mp, int flags)
+{
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED2 */
+static int
+chfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+       struct chfs_mount *chmp;
+       struct ufsmount *ump;
+       dbg("statvfs\n");
+
+       ump = VFSTOUFS(mp);
+       chmp = ump->um_chfs;
+
+       sbp->f_flag   = mp->mnt_flag;
+       sbp->f_bsize  = chmp->chm_ebh->eb_size;
+       sbp->f_frsize = chmp->chm_ebh->eb_size;
+       sbp->f_iosize = chmp->chm_ebh->eb_size;
+
+       sbp->f_blocks = chmp->chm_ebh->peb_nr;
+       sbp->f_files  = 0;
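+       /* Blocks held in reserve for writes are not reported as available. */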
+       sbp->f_bavail = chmp->chm_nr_free_blocks - chmp->chm_resv_blocks_write;
+#if 0
+       printf("chmp->chm_nr_free_blocks: %jd\n",
+           (intmax_t )chmp->chm_nr_free_blocks);
+       printf("chmp->chm_resv_blocks_write: %jd\n",
+           (intmax_t) chmp->chm_resv_blocks_write);
+       printf("chmp->chm_ebh->peb_nr: %jd\n",
+           (intmax_t) chmp->chm_ebh->peb_nr);
+#endif
+
+       sbp->f_bfree = chmp->chm_nr_free_blocks;
+       sbp->f_bresvd = chmp->chm_resv_blocks_write;
+
+       /* FFS specific */
+       sbp->f_ffree  = 0;
+       sbp->f_favail = 0;
+       sbp->f_fresvd = 0;
+
+       copy_statvfs_info(sbp, mp);
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* ARGSUSED0 */
+static int
+chfs_sync(struct mount *mp, int waitfor,
+    kauth_cred_t uc)
+{
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_init(void)
+{
+       chfs_alloc_pool_caches();
+       chfs_ihashinit();
+       pool_init(&chfs_inode_pool, sizeof(struct chfs_inode), 0, 0, 0,
+           "chfsinopl", &pool_allocator_nointr, IPL_NONE);
+       ufs_init();
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_reinit(void)
+{
+       chfs_ihashreinit();
+       ufs_reinit();
+}
+
+/* --------------------------------------------------------------------- */
+
+static void
+chfs_done(void)
+{
+       ufs_done();
+       chfs_ihashdone();
+       pool_destroy(&chfs_inode_pool);
+       chfs_destroy_pool_caches();
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+chfs_snapshot(struct mount *mp, struct vnode *vp,
+    struct timespec *ctime)
+{
+       return ENODEV;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * chfs vfs operations.
+ */
+
+extern const struct vnodeopv_desc chfs_fifoop_opv_desc;
+extern const struct vnodeopv_desc chfs_specop_opv_desc;
+extern const struct vnodeopv_desc chfs_vnodeop_opv_desc;
+
+const struct vnodeopv_desc * const chfs_vnodeopv_descs[] = {
+       &chfs_fifoop_opv_desc,
+       &chfs_specop_opv_desc,
+       &chfs_vnodeop_opv_desc,
+       NULL,
+};
+
+struct vfsops chfs_vfsops = {
+       MOUNT_CHFS,                     /* vfs_name */
+       sizeof (struct chfs_args),
+       chfs_mount,                     /* vfs_mount */
+       chfs_start,                     /* vfs_start */
+       chfs_unmount,           /* vfs_unmount */
+       chfs_root,                      /* vfs_root */
+       ufs_quotactl,                   /* vfs_quotactl */
+       chfs_statvfs,           /* vfs_statvfs */
+       chfs_sync,                      /* vfs_sync */
+       chfs_vget,                      /* vfs_vget */
+       chfs_fhtovp,            /* vfs_fhtovp */
+       chfs_vptofh,            /* vfs_vptofh */
+       chfs_init,                      /* vfs_init */
+       chfs_reinit,            /* vfs_reinit */
+       chfs_done,                      /* vfs_done */
+       NULL,                           /* vfs_mountroot */
+       chfs_snapshot,          /* vfs_snapshot */
+       vfs_stdextattrctl,              /* vfs_extattrctl */
+       (void *)eopnotsupp,             /* vfs_suspendctl */
+       genfs_renamelock_enter,
+       genfs_renamelock_exit,
+       (void *)eopnotsupp,
+       chfs_vnodeopv_descs,
+       0,                              /* vfs_refcount */
+       { NULL, NULL },
+};
+
+static int
+chfs_modcmd(modcmd_t cmd, void *arg)
+{
+       switch (cmd) {
+       case MODULE_CMD_INIT:
+               return vfs_attach(&chfs_vfsops);
+       case MODULE_CMD_FINI:
+               return vfs_detach(&chfs_vfsops);
+       default:
+               return ENOTTY;
+       }
+}
diff --git a/sys/ufs/chfs/chfs_vnode.c b/sys/ufs/chfs/chfs_vnode.c
new file mode 100644 (file)
index 0000000..2e1b386
--- /dev/null
@@ -0,0 +1,393 @@
+/*     $NetBSD: chfs_vnode.c,v 1.2 2011/11/24 21:09:37 agc Exp $       */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include "chfs_inode.h"
+#include <sys/malloc.h>
+#include <sys/kauth.h>
+#include <sys/namei.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+
+struct vnode *
+chfs_vnode_lookup(struct chfs_mount *chmp, ino_t vno)
+{
+       struct vnode *vp;
+       struct chfs_inode *ip;
+
+       TAILQ_FOREACH(vp, &chmp->chm_fsmp->mnt_vnodelist, v_mntvnodes) {
+               ip = VTOI(vp);
+               if (ip && ip->ino == vno)
+                       return vp;
+       }
+       return NULL;
+}
+
+int
+chfs_readvnode(struct mount* mp, ino_t ino, struct vnode** vpp)
+{
+       struct ufsmount* ump = VFSTOUFS(mp);
+       struct chfs_mount *chmp = ump->um_chfs;
+       struct chfs_vnode_cache *chvc;
+       struct chfs_flash_vnode *chfvn;
+       struct chfs_inode *ip;
+       int err;
+       char* buf;
+       size_t retlen, len;
+       struct vnode* vp = NULL;
+       dbg("readvnode | ino: %llu\n", (unsigned long long)ino);
+
+       len = sizeof(struct chfs_flash_vnode);
+
+       KASSERT(vpp != NULL);
+
+       if (vpp != NULL) {
+               vp = *vpp;
+       }
+
+       ip = VTOI(vp);
+       chvc = ip->chvc;
+
+       if (chvc && ino != CHFS_ROOTINO) {
+               /* debug... */
+               printf("readvnode; offset: %" PRIu32 ", lnr: %d\n",
+                   CHFS_GET_OFS(chvc->v->nref_offset), chvc->v->nref_lnr);
+
+               KASSERT((void *)chvc != (void *)chvc->v);
+
+               buf = kmem_alloc(len, KM_SLEEP);
+               err = chfs_read_leb(chmp, chvc->v->nref_lnr, buf,
+                   CHFS_GET_OFS(chvc->v->nref_offset), len, &retlen);
+               if (err) {
+                       kmem_free(buf, len);
+                       return err;
+               }
+               if (retlen != len) {
+                       chfs_err("Error reading vnode: read: %zu instead of: %zu\n",
+                           retlen, len);
+                       kmem_free(buf, len);
+                       return EIO;
+               }
+               chfvn = (struct chfs_flash_vnode*)buf;
+               chfs_set_vnode_size(vp, chfvn->dn_size);
+               ip->mode = chfvn->mode;
+               vp->v_type = IFTOVT(ip->mode);
+               ip->version = chfvn->version;
+               //ip->chvc->highest_version = ip->version;
+               ip->uid = chfvn->uid;
+               ip->gid = chfvn->gid;
+               ip->atime = chfvn->atime;
+               ip->mtime = chfvn->mtime;
+               ip->ctime = chfvn->ctime;
+               kmem_free(buf, len);
+       }
+
+
+       *vpp = vp;
+       return 0;
+}
+
+int
+chfs_readdirent(struct mount *mp, struct chfs_node_ref *chnr, struct chfs_inode *pdir)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct chfs_mount *chmp = ump->um_chfs;
+       struct chfs_flash_dirent_node chfdn;
+       struct chfs_dirent *fd;//, *pdents;
+       size_t len = sizeof(struct chfs_flash_dirent_node);
+//     struct chfs_vnode_cache* parent;
+       size_t retlen;
+       int err = 0;
+
+//     parent = chfs_get_vnode_cache(chmp, pdir->ino);
+
+       //read flash_dirent_node
+       err = chfs_read_leb(chmp, chnr->nref_lnr, (char *)&chfdn,
+           CHFS_GET_OFS(chnr->nref_offset), len, &retlen);
+       if (err) {
+               return err;
+       }
+       if (retlen != len) {
+               chfs_err("Error reading vnode: read: %zu insted of: %zu\n",
+                   retlen, len);
+               return EIO;
+       }
+
+       //set fields of dirent
+       fd = chfs_alloc_dirent(chfdn.nsize + 1);
+       fd->version = chfdn.version;
+       fd->vno = chfdn.vno;
+       fd->type = chfdn.dtype;
+       fd->nsize = chfdn.nsize;
+//     fd->next = NULL;
+
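+       /*
+        * The name bytes are stored on flash immediately after the
+        * fixed-size dirent node, at nref_offset + len.
+        */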
+       err = chfs_read_leb(chmp, chnr->nref_lnr, fd->name,
+           CHFS_GET_OFS(chnr->nref_offset) + len, chfdn.nsize, &retlen);
+       if (err) {
+               chfs_free_dirent(fd);
+               return err;
+       }
+
+       if (retlen != chfdn.nsize) {
+               chfs_err("Error reading dirent name: read: %zu instead of: %zu\n",
+                   retlen, (size_t)chfdn.nsize);
+               chfs_free_dirent(fd);
+               return EIO;
+       }
+
+       fd->name[fd->nsize] = 0;
+       fd->nref = chnr;
+
+       chfs_add_fd_to_inode(chmp, pdir, fd);
+/*
+  pdents = pdir->i_chfs_ext.dents;
+  if (!pdents)
+          pdir->i_chfs_ext.dents = fd;
+  else {
+          while (pdents->next != NULL) {
+                  pdents = pdents->next;
+          }
+          pdents->next = fd;
+  }
+       return 0;
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+chfs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
+    struct componentname *cnp, int type)
+{
+       struct chfs_inode *ip, *pdir;
+       struct vnode *vp;
+       struct ufsmount* ump = VFSTOUFS(dvp->v_mount);
+       struct chfs_mount* chmp = ump->um_chfs;
+       struct chfs_vnode_cache* chvc;
+       int error, ismember = 0;
+       ino_t vno;
+       struct chfs_dirent *nfd;//, *fd;
+
+       dbg("makeinode\n");
+       pdir = VTOI(dvp);
+
+       *vpp = NULL;
+
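+       /*
+        * Take a fresh vnode number, get a vnode for it, set up its
+        * vnode cache entry and inode fields, then write the new inode,
+        * the updated parent inode and finally the new dirent to flash.
+        */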
+       vno = ++(chmp->chm_max_vno);
+
+       error = VFS_VGET(dvp->v_mount, vno, &vp);
+       if (error)
+               return (error);
+
+       mutex_enter(&chmp->chm_lock_vnocache);
+       chvc = chfs_vnode_cache_get(chmp, vno);
+       mutex_exit(&chmp->chm_lock_vnocache);
+
+       chvc->pvno = pdir->ino;
+       chvc->vno_version = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+       *(chvc->vno_version) = 1;
+       if (type != VDIR)
+               chvc->nlink = 1;
+       else
+               chvc->nlink = 2;
+//     chfs_vnode_cache_set_state(chmp, chvc, VNO_STATE_CHECKEDABSENT);
+       chvc->state = VNO_STATE_CHECKEDABSENT;
+
+       ip = VTOI(vp);
+       ip->ino = vno;
+
+       if (type == VDIR)
+               chfs_set_vnode_size(vp, 512);
+       else
+               chfs_set_vnode_size(vp, 0);
+
+       ip->uid = kauth_cred_geteuid(cnp->cn_cred);
+       ip->gid = kauth_cred_getegid(cnp->cn_cred);
+       ip->version = 1;
+       ip->iflag |= (IN_ACCESS | IN_CHANGE | IN_UPDATE);
+
+       ip->chvc = chvc;
+       //ip->chvc->highest_version = 1;
+       ip->target = NULL;
+
+       ip->mode = mode;
+       vp->v_type = type;      /* Rest init'd in getnewvnode(). */
+       if ((ip->mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+               ip->gid, &ismember) != 0 || !ismember) &&
+           kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL))
+               ip->mode &= ~ISGID;
+
+       chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+
+       //write inode to flash
+       error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+       if (error) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               vput(vp);
+               vput(dvp);
+               return error;
+       }
+       //update parent directory and write it to the flash
+       pdir->iflag |= (IN_ACCESS | IN_CHANGE | IN_MODIFY | IN_UPDATE);
+       chfs_update(dvp, NULL, NULL, UPDATE_WAIT);
+
+       error = chfs_write_flash_vnode(chmp, pdir, ALLOC_NORMAL);
+       if (error) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               vput(vp);
+               vput(dvp);
+               return error;
+       }
+       vput(dvp);
+
+       //set up node's full dirent
+       nfd = chfs_alloc_dirent(cnp->cn_namelen + 1);
+       nfd->vno = ip->ino;
+       nfd->version = (++pdir->chvc->highest_version);
+       nfd->type = type;
+//     nfd->next = NULL;
+       nfd->nsize = cnp->cn_namelen;
+       memcpy(&(nfd->name), cnp->cn_nameptr, cnp->cn_namelen);
+       nfd->name[nfd->nsize] = 0;
+       nfd->nhash = hash32_buf(nfd->name, cnp->cn_namelen, HASH32_BUF_INIT);
+
+       // write out direntry
+       error = chfs_write_flash_dirent(chmp, pdir, ip, nfd, ip->ino, ALLOC_NORMAL);
+       if (error) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               vput(vp);
+               return error;
+       }
+
+       //TODO set parent's dir times
+
+       chfs_add_fd_to_inode(chmp, pdir, nfd);
+/*
+  fd = pdir->i_chfs_ext.dents;
+  if (!fd)
+          pdir->i_chfs_ext.dents = nfd;
+  else {
+          while (fd->next != NULL) {
+                  fd = fd->next;
+          }
+          fd->next = nfd;
+  }
+       //pdir->i_nlink++;
+       pdir->chvc->nlink++;
+
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       *vpp = vp;
+       return (0);
+}
+
+void
+chfs_set_vnode_size(struct vnode *vp, size_t size)
+{
+       struct chfs_inode *ip;
+
+       KASSERT(vp != NULL);
+
+       ip = VTOI(vp);
+       KASSERT(ip != NULL);
+
+       ip->size = size;
+       vp->v_size = vp->v_writesize = size;
+       return;
+}
+
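+/*
+ * The chfs_change_size_* helpers below adjust one of the global space
+ * counters together with the matching per-eraseblock counter; the two
+ * must move in lockstep, which the KASSERTs enforce (callers hold
+ * chm_lock_sizes).
+ */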
+void
+chfs_change_size_free(struct chfs_mount *chmp,
+       struct chfs_eraseblock *cheb, int change)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT((int)(chmp->chm_free_size + change) >= 0);
+       KASSERT((int)(cheb->free_size + change) >= 0);
+       KASSERT((int)(cheb->free_size + change) <= chmp->chm_ebh->eb_size);
+       chmp->chm_free_size += change;
+       cheb->free_size += change;
+       return;
+}
+
+void
+chfs_change_size_dirty(struct chfs_mount *chmp,
+       struct chfs_eraseblock *cheb, int change)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT((int)(chmp->chm_dirty_size + change) >= 0);
+       KASSERT((int)(cheb->dirty_size + change) >= 0);
+       KASSERT((int)(cheb->dirty_size + change) <= chmp->chm_ebh->eb_size);
+       chmp->chm_dirty_size += change;
+       cheb->dirty_size += change;
+       return;
+}
+
+void
+chfs_change_size_unchecked(struct chfs_mount *chmp,
+       struct chfs_eraseblock *cheb, int change)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT((int)(chmp->chm_unchecked_size + change) >= 0);
+       KASSERT((int)(cheb->unchecked_size + change) >= 0);
+       KASSERT((int)(cheb->unchecked_size + change) <= chmp->chm_ebh->eb_size);
+       chmp->chm_unchecked_size += change;
+       cheb->unchecked_size += change;
+       return;
+}
+
+void
+chfs_change_size_used(struct chfs_mount *chmp,
+       struct chfs_eraseblock *cheb, int change)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT((int)(chmp->chm_used_size + change) >= 0);
+       KASSERT((int)(cheb->used_size + change) >= 0);
+       KASSERT((int)(cheb->used_size + change) <= chmp->chm_ebh->eb_size);
+       chmp->chm_used_size += change;
+       cheb->used_size += change;
+       return;
+}
+
+void
+chfs_change_size_wasted(struct chfs_mount *chmp,
+       struct chfs_eraseblock *cheb, int change)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT((int)(chmp->chm_wasted_size + change) >= 0);
+       KASSERT((int)(cheb->wasted_size + change) >= 0);
+       KASSERT((int)(cheb->wasted_size + change) <= chmp->chm_ebh->eb_size);
+       chmp->chm_wasted_size += change;
+       cheb->wasted_size += change;
+       return;
+}
+
diff --git a/sys/ufs/chfs/chfs_vnode_cache.c b/sys/ufs/chfs/chfs_vnode_cache.c
new file mode 100644 (file)
index 0000000..101b494
--- /dev/null
@@ -0,0 +1,165 @@
+/*     $NetBSD: chfs_vnode_cache.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $       */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "chfs.h"
+#include <sys/pool.h>
+
+struct chfs_vnode_cache **
+chfs_vnocache_hash_init(void)
+{
+       return kmem_zalloc(VNODECACHE_SIZE *
+           sizeof(struct chfs_vnode_cache *), KM_SLEEP);
+}
+
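+/*
+ * Each hash bucket is a singly linked list of vnode caches kept sorted
+ * by vnode number, so lookups and removals can stop early once they
+ * pass the slot where the vno would be.
+ */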
+/**
+ * chfs_vnode_cache_set_state - set the state of a vnode_cache
+ * @chmp: fs super block info
+ * @vc: vnode_cache
+ * @state: new state
+ */
+void
+chfs_vnode_cache_set_state(struct chfs_mount *chmp,
+    struct chfs_vnode_cache* vc, int state)
+{
+       /* XXX do we really need locking here? */
+       KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+       vc->state = state;
+}
+
+/**
+ * chfs_vnode_cache_get - get a vnode_cache from the vnocache_hash
+ * @chmp: fs super block info
+ * @vno: vnode number to search for
+ * Returns the vnode_cache, or NULL if it is not found.
+ */
+struct chfs_vnode_cache *
+chfs_vnode_cache_get(struct chfs_mount *chmp, ino_t vno)
+{
+       struct chfs_vnode_cache* ret;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+       ret = chmp->chm_vnocache_hash[vno % VNODECACHE_SIZE];
+
+       if (ret == NULL) {
+               return NULL;
+       }
+
+       while (ret && ret->vno < vno) {
+               ret = ret->next;
+       }
+
+       if (ret && ret->vno != vno) {
+               ret = NULL;
+       }
+
+       return ret;
+}
+
+/**
+ * chfs_vnode_cache_add - add a vnode_cache to the vnocache_hash
+ * @chmp: fs super block info
+ * @new: new vnode_cache
+ */
+void
+chfs_vnode_cache_add(struct chfs_mount *chmp,
+    struct chfs_vnode_cache* new)
+{
+       struct chfs_vnode_cache** prev;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+       if (!new->vno) {
+               new->vno = ++chmp->chm_max_vno;
+       }
+
+       prev = &chmp->chm_vnocache_hash[new->vno % VNODECACHE_SIZE];
+
+       while ((*prev) && (*prev)->vno < new->vno) {
+               prev = &((*prev)->next);
+       }
+       new->next = *prev;
+       *prev = new;
+}
+
+/**
+ * chfs_vnode_cache_remove - remove a vnode_cache from the vnocache_hash
+ * @chmp: fs super block info
+ * @old: old vnode_cache
+ */
+void
+chfs_vnode_cache_remove(struct chfs_mount *chmp,
+    struct chfs_vnode_cache* old)
+{
+       struct chfs_vnode_cache** prev;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_vnocache));
+
+       prev = &chmp->chm_vnocache_hash[old->vno % VNODECACHE_SIZE];
+       while ((*prev) && (*prev)->vno < old->vno) {
+               prev = &(*prev)->next;
+       }
+
+       if ((*prev) == old) {
+               *prev = old->next;
+       }
+
+       if (old->state != VNO_STATE_READING &&
+           old->state != VNO_STATE_CLEARING) {
+               chfs_vnode_cache_free(old);
+       }
+}
+
+/**
+ * chfs_vnocache_hash_destroy - free all entries of the vnocache_hash
+ * @hash: the hash table to empty
+ */
+void
+chfs_vnocache_hash_destroy(struct chfs_vnode_cache **hash)
+{
+       struct chfs_vnode_cache *this, *next;
+       int i;
+
+       for (i = 0; i < VNODECACHE_SIZE; i++) {
+               this = hash[i];
+               while (this) {
+                       next = this->next;
+                       chfs_vnode_cache_free(this);
+                       this = next;
+               }
+               hash[i] = NULL;
+       }
+}
+
+
diff --git a/sys/ufs/chfs/chfs_vnops.c b/sys/ufs/chfs/chfs_vnops.c
new file mode 100644 (file)
index 0000000..f6a11d9
--- /dev/null
@@ -0,0 +1,1765 @@
+/*     $NetBSD: chfs_vnops.c,v 1.2 2011/11/24 21:09:37 agc Exp $       */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <uvm/uvm.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/buf.h>
+#include <sys/fstrans.h>
+#include <sys/vnode.h>
+
+#include "chfs.h"
+
+#define READ_S  "chfs_read"
+
+int
+chfs_lookup(void *v)
+{
+       struct vnode *dvp = ((struct vop_lookup_args *) v)->a_dvp;
+       struct vnode **vpp = ((struct vop_lookup_args *) v)->a_vpp;
+       struct componentname *cnp = ((struct vop_lookup_args *) v)->a_cnp;
+
+       int error;
+       struct chfs_inode* ip;
+       struct ufsmount* ump;
+       struct chfs_mount* chmp;
+       struct chfs_vnode_cache* chvc;
+       struct chfs_dirent* fd;
+
+       dbg("lookup(): %s\n", cnp->cn_nameptr);
+
+       KASSERT(VOP_ISLOCKED(dvp));
+
+       *vpp = NULL;
+
+       // Check accessibility of requested node as a first step.
+       error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred);
+       if (error != 0) {
+               goto out;
+       }
+
+       // If requesting the last path component on a read-only file system
+       // with a write operation, deny it.
+       if ((cnp->cn_flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY)
+           && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+               error = EROFS;
+               goto out;
+       }
+
+       // Avoid doing a linear scan of the directory if the requested
+       // directory/name couple is already in the cache.
+       error = cache_lookup(dvp, vpp, cnp);
+       if (error >= 0) {
+               goto out;
+       }
+
+       ip = VTOI(dvp);
+       ump = VFSTOUFS(dvp->v_mount);
+       chmp = ump->um_chfs;
+       if (ip->ino == 0) {
+               ip->ino = ++chmp->chm_max_vno;
+       }
+       mutex_enter(&chmp->chm_lock_vnocache);
+       chvc = chfs_vnode_cache_get(chmp, ip->ino);
+       mutex_exit(&chmp->chm_lock_vnocache);
+
+       // We cannot be requesting the parent directory of the root node.
+       KASSERT(IMPLIES(dvp->v_type == VDIR && chvc->pvno == chvc->vno,
+               !(cnp->cn_flags & ISDOTDOT)));
+
+       if (cnp->cn_flags & ISDOTDOT) {
+               VOP_UNLOCK(dvp);
+               error = VFS_VGET(dvp->v_mount, ip->chvc->pvno, vpp);
+               vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+       } else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
+               vref(dvp);
+               *vpp = dvp;
+               error = 0;
+       } else {
+               fd = chfs_dir_lookup(ip, cnp);
+
+               if (fd == NULL) {
+                       dbg("fd null\n");
+                       // The entry was not found in the directory.
+                       // This is OK if we are creating or renaming an
+                       // entry and are working on the last component of
+                       // the path name.
+                       if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == CREATE
+                               || cnp->cn_nameiop == RENAME)) {
+                               error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
+                               if (error) {
+                                       dbg("after the entry was not found in dir\n");
+                                       goto out;
+                               }
+
+                               dbg("return EJUSTRETURN\n");
+                               error = EJUSTRETURN;
+                       } else {
+                               error = ENOENT;
+                       }
+               } else {
+                       // If we are not at the last path component and
+                       // found a non-directory or non-link entry (which
+                       // may itself be pointing to a directory), raise
+                       // an error.
+                       if ((fd->type != VDIR && fd->type != VLNK) && !(cnp->cn_flags
+                               & ISLASTCN)) {
+                               error = ENOTDIR;
+                               goto out;
+                       }
+
+                       dbg("vno@allocating new vnode: %llu\n",
+                               (unsigned long long)fd->vno);
+                       error = VFS_VGET(dvp->v_mount, fd->vno, vpp);
+               }
+       }
+       // Store the result of this lookup in the cache.  Avoid this if the
+       // request was for creation, as it does not improve timings on
+       // empirical tests.
+       if ((cnp->cn_flags & MAKEENTRY) && cnp->cn_nameiop != CREATE
+           && (cnp->cn_flags & ISDOTDOT) == 0)
+               cache_enter(dvp, *vpp, cnp);
+
+out:
+       // If there were no errors, *vpp cannot be null and it must be
+       // locked.
+       KASSERT(IFF(error == 0, *vpp != NULL && VOP_ISLOCKED(*vpp)));
+
+       // dvp must always be locked.
+       KASSERT(VOP_ISLOCKED(dvp));
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_create(void *v)
+{
+       struct vop_create_args /* {
+                                 struct vnode *a_dvp;
+                                 struct vnode **a_vpp;
+                                 struct componentname *a_cnp;
+                                 struct vattr *a_vap;
+                                 } */*ap = v;
+       int error, mode;
+       dbg("create()\n");
+
+       mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+
+       if ((mode & IFMT) == 0) {
+               if (ap->a_vap->va_type == VREG)
+                       mode |= IFREG;
+               if (ap->a_vap->va_type == VSOCK)
+                       mode |= IFSOCK;
+       }
+
+       error = chfs_makeinode(mode, ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap->va_type);
+
+       if (error) {
+               dbg("error: %d\n", error);
+               return error;
+       }
+
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       return 0;
+}
+/* --------------------------------------------------------------------- */
+
+int
+chfs_mknod(void *v)
+{
+       struct vnode *dvp = ((struct vop_mknod_args *) v)->a_dvp;
+       struct vnode **vpp = ((struct vop_mknod_args *) v)->a_vpp;
+       struct componentname *cnp = ((struct vop_mknod_args *) v)->a_cnp;
+       struct vattr *vap = ((struct vop_mknod_args *) v)->a_vap;
+       int mode, err = 0;
+       struct chfs_inode *ip;
+       struct vnode *vp;
+
+       struct ufsmount *ump;
+       struct chfs_mount *chmp;
+       ino_t ino;
+
+       struct chfs_full_dnode *fd;
+       struct buf *bp;
+       int len;
+       dbg("mknod()\n");
+
+       ump = VFSTOUFS(dvp->v_mount);
+       chmp = ump->um_chfs;
+
+       if (vap->va_type != VBLK && vap->va_type != VCHR && vap->va_type != VFIFO)
+               return EINVAL;
+
+       vp = *vpp;
+
+       mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+       if ((mode & IFMT) == 0) {
+               switch (vap->va_type) {
+               case VBLK:
+                       mode |= IFBLK;
+                       break;
+               case VCHR:
+                       mode |= IFCHR;
+                       break;
+               case VFIFO:
+                       mode |= IFIFO;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       err = chfs_makeinode(mode, dvp, &vp, cnp, vap->va_type);
+       // Check the result before touching vp: on failure the vnode was
+       // never set up.
+       if (err)
+               return err;
+
+       ip = VTOI(vp);
+       ino = ip->ino;
+       if (vap->va_rdev != VNOVAL)
+               ip->rdev = vap->va_rdev;
+
+       if (vap->va_type == VFIFO)
+               vp->v_op = chfs_fifoop_p;
+       else {
+               vp->v_op = chfs_specop_p;
+               spec_node_init(vp, ip->rdev);
+       }
+
+       len = sizeof(dev_t);
+       chfs_set_vnode_size(vp, len);
+       bp = getiobuf(vp, true);
+       bp->b_bufsize = bp->b_resid = len;
+       bp->b_data = kmem_alloc(len, KM_SLEEP);
+       memcpy(bp->b_data, &ip->rdev, len);
+       bp->b_blkno = 0;
+
+       fd = chfs_alloc_full_dnode();
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+
+       err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+       if (err) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               kmem_free(bp->b_data, len);
+               return err;
+       }
+
+       err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+       if (err) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               kmem_free(bp->b_data, len);
+               return err;
+       }
+
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       *vpp = vp;
+       kmem_free(bp->b_data, len);
+       putiobuf(bp);
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_open(void *v)
+{
+       struct vnode *vp = ((struct vop_open_args *) v)->a_vp;
+       int mode = ((struct vop_open_args *) v)->a_mode;
+       dbg("open()\n");
+
+       int error;
+       struct chfs_inode *ip;
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       ip = VTOI(vp);
+
+       KASSERT(vp->v_size == ip->size);
+       if (ip->chvc->nlink < 1) {
+               error = ENOENT;
+               goto out;
+       }
+
+       // If the file is marked append-only, deny write requests.
+       if (ip->flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE)
+               error = EPERM;
+       else
+               error = 0;
+
+out:
+       KASSERT(VOP_ISLOCKED(vp));
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_close(void *v)
+{
+       struct vnode *vp = ((struct vop_close_args *) v)->a_vp;
+       dbg("close()\n");
+
+       struct chfs_inode *ip;
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       ip = VTOI(vp);
+
+       if (ip->chvc->nlink > 0) {
+               //ip->chvc->nlink = 0;
+               chfs_update(vp, NULL, NULL, UPDATE_CLOSE);
+       }
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_access(void *v)
+{
+       struct vnode *vp = ((struct vop_access_args *) v)->a_vp;
+       int mode = ((struct vop_access_args *) v)->a_mode;
+       kauth_cred_t cred = ((struct vop_access_args *) v)->a_cred;
+
+       dbg("access()\n");
+       struct chfs_inode *ip = VTOI(vp);
+
+       if (mode & VWRITE) {
+               switch (vp->v_type) {
+               case VLNK:
+               case VDIR:
+               case VREG:
+                       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                               return (EROFS);
+                       break;
+               case VBLK:
+               case VCHR:
+               case VSOCK:
+               case VFIFO:
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       if (mode & VWRITE && ip->flags & IMMUTABLE)
+               return (EPERM);
+
+       return genfs_can_access(vp->v_type, ip->mode & ALLPERMS,
+           ip->uid, ip->gid, mode, cred);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_getattr(void *v)
+{
+       struct vnode *vp = ((struct vop_getattr_args *) v)->a_vp;
+       struct vattr *vap = ((struct vop_getattr_args *) v)->a_vap;
+
+       struct chfs_inode *ip = VTOI(vp);
+       dbg("getattr()\n");
+
+       KASSERT(vp->v_size == ip->size);
+
+       vattr_null(vap);
+       CHFS_ITIMES(ip, NULL, NULL, NULL);
+
+       vap->va_type = vp->v_type;
+       vap->va_mode = ip->mode & ALLPERMS;
+       vap->va_nlink = ip->chvc->nlink;
+       vap->va_uid = ip->uid;
+       vap->va_gid = ip->gid;
+       vap->va_fsid = ip->dev;
+       vap->va_fileid = ip->ino;
+       vap->va_size = ip->size;
+       vap->va_blocksize = PAGE_SIZE;
+       vap->va_atime.tv_sec = ip->atime;
+       vap->va_atime.tv_nsec = 0;
+       vap->va_mtime.tv_sec = ip->mtime;
+       vap->va_mtime.tv_nsec = 0;
+       vap->va_ctime.tv_sec = ip->ctime;
+       vap->va_ctime.tv_nsec = 0;
+       vap->va_gen = ip->version;
+       vap->va_flags = ip->flags;
+       vap->va_rdev = ip->rdev;
+       vap->va_bytes = round_page(ip->size);
+       vap->va_filerev = VNOVAL;
+       vap->va_vaflags = 0;
+       vap->va_spare = VNOVAL;
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/* Note: modelled after the corresponding function in tmpfs */
+
+int
+chfs_setattr(void *v)
+{
+       struct vnode *vp = ((struct vop_setattr_args *) v)->a_vp;
+       struct vattr *vap = ((struct vop_setattr_args *) v)->a_vap;
+       kauth_cred_t cred = ((struct vop_setattr_args *) v)->a_cred;
+
+       struct chfs_inode *ip;
+       struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+       struct chfs_mount *chmp = ump->um_chfs;
+       int error = 0;
+
+       dbg("setattr()\n");
+
+       KASSERT(VOP_ISLOCKED(vp));
+       ip = VTOI(vp);
+
+       /* Abort if any unsettable attribute is given. */
+       if (vap->va_type != VNON || vap->va_nlink != VNOVAL ||
+           vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL ||
+           vap->va_blocksize != VNOVAL /*|| GOODTIME(&vap->va_ctime)*/ ||
+           vap->va_gen != VNOVAL || vap->va_rdev != VNOVAL ||
+           vap->va_bytes != VNOVAL) {
+               return EINVAL;
+       }
+
+       if (error == 0 && (vap->va_flags != VNOVAL))
+               error = chfs_chflags(vp, vap->va_flags, cred);
+
+       if (error == 0 && (vap->va_size != VNOVAL))
+               error = chfs_chsize(vp, vap->va_size, cred);
+
+       if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL))
+               error = chfs_chown(vp, vap->va_uid, vap->va_gid, cred);
+
+       if (error == 0 && (vap->va_mode != VNOVAL))
+               error = chfs_chmod(vp, vap->va_mode, cred);
+
+#if 0
+       /* why do we need that? */
+       if (ip->flags & (IMMUTABLE | APPEND))
+               return EPERM;
+#endif
+
+       if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+               error = genfs_can_chtimes(vp, vap->va_vaflags, ip->uid, cred);
+               if (error)
+                       return error;
+               if (vap->va_atime.tv_sec != VNOVAL)
+                       ip->iflag |= IN_ACCESS;
+               if (vap->va_mtime.tv_sec != VNOVAL)
+                       ip->iflag |= IN_CHANGE | IN_UPDATE;
+               error = chfs_update(vp,
+                   &vap->va_atime, &vap->va_mtime, UPDATE_WAIT);
+               if (error)
+                       return error;
+       }
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+       error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       return error;
+}
+
+int
+chfs_chmod(struct vnode *vp, int mode, kauth_cred_t cred)
+{
+       struct chfs_inode *ip = VTOI(vp);
+       int error;
+       dbg("chmod\n");
+
+       error = genfs_can_chmod(vp, cred, ip->uid, ip->gid, mode);
+       if (error)
+               return error;
+       ip->mode &= ~ALLPERMS;
+       ip->mode |= (mode & ALLPERMS);
+       ip->iflag |= IN_CHANGE;
+
+       error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+int
+chfs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred)
+{
+       struct chfs_inode *ip = VTOI(vp);
+       int error;
+       dbg("chown\n");
+
+       if (uid == (uid_t)VNOVAL)
+               uid = ip->uid;
+       if (gid == (gid_t)VNOVAL)
+               gid = ip->gid;
+
+       error = genfs_can_chown(vp, cred, ip->uid, ip->gid, uid, gid);
+       if (error)
+               return error;
+
+       ip->gid = gid;
+       ip->uid = uid;
+       ip->iflag |= IN_CHANGE;
+
+       error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+       if (error)
+               return error;
+
+       return 0;
+}
+
+
+/* --------------------------------------------------------------------- */
+/* calculates ((off_t)blk * chmp->chm_fs_bsize) */
+#define        lblktosize(chmp, blk)                                                 \
+       (((off_t)(blk)) << (chmp)->chm_fs_bshift)
+
+/* calculates (loc % chmp->chm_fs_bsize) */
+#define        blkoff(chmp, loc)                                                             \
+       ((loc) & (chmp)->chm_fs_qbmask)
+
+/* calculates (loc / chmp->chm_fs_bsize) */
+#define        lblkno(chmp, loc)                                                             \
+       ((loc) >> (chmp)->chm_fs_bshift)
+
+/* calculates roundup(size, chmp->chm_fs_fsize) */
+#define        fragroundup(chmp, size)                                               \
+       (((size) + (chmp)->chm_fs_qfmask) & (chmp)->chm_fs_fmask)
+
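+/* calculates the size of logical block lbn of inode ip; the last block
+ * of a short file may be smaller than a full block */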
+#define        blksize(chmp, ip, lbn)                                                \
+       (((lbn) >= NDADDR || (ip)->size >= lblktosize(chmp, (lbn) + 1))       \
+           ? (chmp)->chm_fs_bsize                                            \
+           : (fragroundup(chmp, blkoff(chmp, (ip)->size))))
+
+/* calculates roundup(size, chmp->chm_fs_bsize) */
+#define        blkroundup(chmp, size)                                                \
+       (((size) + (chmp)->chm_fs_qbmask) & (chmp)->chm_fs_bmask)
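+
+/*
+ * A minimal worked example of the macros above, assuming a block size of
+ * 4096 bytes (chm_fs_bshift = 12, chm_fs_qbmask = 4095, chm_fs_bmask =
+ * ~4095) and a fragment size of 512 bytes (chm_fs_qfmask = 511,
+ * chm_fs_fmask = ~511); the values are illustrative, not taken from this
+ * file:
+ *
+ *   lblktosize(chmp, 3)     == 3 << 12               == 12288
+ *   blkoff(chmp, 5000)      == 5000 & 4095           == 904
+ *   lblkno(chmp, 5000)      == 5000 >> 12            == 1
+ *   fragroundup(chmp, 1000) == (1000 + 511) & ~511   == 1024
+ *   blkroundup(chmp, 5000)  == (5000 + 4095) & ~4095 == 8192
+ *
+ * blksize() then yields the full 4096 for every block except the trailing
+ * partial block of a short file, which is rounded up to the fragment size.
+ */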
+
+int
+chfs_read(void *v)
+{
+       struct vop_read_args /* {
+                               struct vnode *a_vp;
+                               struct uio *a_uio;
+                               int a_ioflag;
+                               kauth_cred_t a_cred;
+                               } */ *ap = v;
+       struct vnode *vp;
+       struct chfs_inode *ip;
+       struct uio *uio;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct chfs_mount *chmp;
+       daddr_t lbn, nextlbn;
+       off_t bytesinfile;
+       long size, xfersize, blkoffset;
+       int error, ioflag;
+       vsize_t bytelen;
+       bool usepc = false;
+
+       dbg("chfs_read\n");
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       ump = ip->ump;
+       uio = ap->a_uio;
+       ioflag = ap->a_ioflag;
+       error = 0;
+
+       dbg("ip->size:%llu\n", (unsigned long long)ip->size);
+
+#ifdef DIAGNOSTIC
+       if (uio->uio_rw != UIO_READ)
+               panic("%s: mode", READ_S);
+
+       if (vp->v_type == VLNK) {
+               if (ip->size < ump->um_maxsymlinklen)
+                       panic("%s: short symlink", READ_S);
+       } else if (vp->v_type != VREG && vp->v_type != VDIR)
+               panic("%s: type %d", READ_S, vp->v_type);
+#endif
+       chmp = ip->chmp;
+       if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
+               return (EFBIG);
+       if (uio->uio_resid == 0)
+               return (0);
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+       if (uio->uio_offset >= ip->size)
+               goto out;
+
+       usepc = vp->v_type == VREG;
+       bytelen = 0;
+       if (usepc) {
+               const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
+               while (uio->uio_resid > 0) {
+                       if (ioflag & IO_DIRECT) {
+                               genfs_directio(vp, uio, ioflag);
+                       }
+                       bytelen = MIN(ip->size - uio->uio_offset,
+                           uio->uio_resid);
+                       if (bytelen == 0)
+                               break;
+                       error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+                           UBC_READ | UBC_PARTIALOK |
+                           (UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0));
+                       if (error)
+                               break;
+
+               }
+               goto out;
+       }
+
+
+       dbg("start reading\n");
+       for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+               bytesinfile = ip->size - uio->uio_offset;
+               if (bytesinfile <= 0)
+                       break;
+               lbn = lblkno(chmp, uio->uio_offset);
+               nextlbn = lbn + 1;
+               size = blksize(chmp, ip, lbn);
+               blkoffset = blkoff(chmp, uio->uio_offset);
+               xfersize = MIN(MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid),
+                   bytesinfile);
+
+               if (lblktosize(chmp, nextlbn) >= ip->size) {
+                       error = bread(vp, lbn, size, NOCRED, 0, &bp);
+                       dbg("after bread\n");
+               } else {
+                       int nextsize = blksize(chmp, ip, nextlbn);
+                       dbg("size: %ld\n", size);
+                       error = breadn(vp, lbn,
+                           size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+                       dbg("after breadN\n");
+               }
+               if (error)
+                       break;
+
+               /*
+                * We should only get non-zero b_resid when an I/O error
+                * has occurred, which should cause us to break above.
+                * However, if the short read did not cause an error,
+                * then we want to ensure that we do not uiomove bad
+                * or uninitialized data.
+                */
+               size -= bp->b_resid;
+               if (size < xfersize) {
+                       if (size == 0)
+                               break;
+                       xfersize = size;
+               }
+               dbg("uiomove\n");
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+               if (error)
+                       break;
+               brelse(bp, 0);
+       }
+       if (bp != NULL)
+               brelse(bp, 0);
+
+out:
+       if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+               ip->iflag |= IN_ACCESS;
+               if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
+                       //error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (error) {
+                               fstrans_done(vp->v_mount);
+                               return error;
+                       }
+                       error = chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+                       //UFS_WAPBL_END(vp->v_mount);
+               }
+       }
+
+       dbg("[END]\n");
+       fstrans_done(vp->v_mount);
+       return (error);
+}
+
+
+/* --------------------------------------------------------------------- */
+
+/* from ffs write */
+int
+chfs_write(void *v)
+{
+       struct vop_write_args /* {
+                                struct vnode *a_vp;
+                                struct uio *a_uio;
+                                int a_ioflag;
+                                kauth_cred_t a_cred;
+                                } */ *ap = v;
+       struct vnode *vp;
+       struct uio *uio;
+       struct chfs_inode *ip;
+       struct chfs_mount *chmp;
+       struct lwp *l;
+       kauth_cred_t cred;
+       off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
+       int blkoffset, error, flags, ioflag, resid;
+       int aflag;
+       int extended=0;
+       vsize_t bytelen;
+       bool async;
+       struct ufsmount *ump;
+
+
+       cred = ap->a_cred;
+       ioflag = ap->a_ioflag;
+       uio = ap->a_uio;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       //dbg("file size (vp): %llu\n", (unsigned long long)vp->v_size);
+       //dbg("file size (ip): %llu\n", (unsigned long long)ip->i_size);
+       ump = ip->ump;
+
+       //dbg("uio->resid: %d\n", uio->uio_resid);
+       dbg("write\n");
+
+       KASSERT(vp->v_size == ip->size);
+
+       switch (vp->v_type) {
+       case VREG:
+               if (ioflag & IO_APPEND)
+                       uio->uio_offset = ip->size;
+               if ((ip->flags & APPEND) && uio->uio_offset != ip->size)
+                       return (EPERM);
+               /* FALLTHROUGH */
+       case VLNK:
+               break;
+       case VDIR:
+               if ((ioflag & IO_SYNC) == 0)
+                       panic("chfs_write: nonsync dir write");
+               break;
+       default:
+               panic("chfs_write: type");
+       }
+
+       chmp = ip->chmp;
+       if (uio->uio_offset < 0 ||
+           (u_int64_t)uio->uio_offset +
+           uio->uio_resid > ump->um_maxfilesize) {
+               dbg("uio->uio_offset = %lld | uio->uio_offset + "
+                   "uio->uio_resid (%llu) > ump->um_maxfilesize (%lld)\n",
+                   (long long)uio->uio_offset,
+                   (uint64_t)uio->uio_offset + uio->uio_resid,
+                   (long long)ump->um_maxfilesize);
+               return (EFBIG);
+       }
+       /*
+        * Maybe this should be above the vnode op call, but so long as
+        * file servers have no limits, I don't think it matters.
+        */
+       l = curlwp;
+       if (vp->v_type == VREG && l &&
+           uio->uio_offset + uio->uio_resid >
+           l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
+               mutex_enter(proc_lock);
+               psignal(l->l_proc, SIGXFSZ);
+               mutex_exit(proc_lock);
+               return (EFBIG);
+       }
+       if (uio->uio_resid == 0)
+               return (0);
+
+       //mutex_enter(&ip->inode_lock);
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+       flags = ioflag & IO_SYNC ? B_SYNC : 0;
+       async = vp->v_mount->mnt_flag & MNT_ASYNC;
+       origoff = uio->uio_offset;
+       resid = uio->uio_resid;
+       osize = ip->size;
+       error = 0;
+
+
+       /*if ((ioflag & IO_JOURNALLOCKED) == 0) {
+         error = UFS_WAPBL_BEGIN(vp->v_mount);
+         if (error) {
+         fstrans_done(vp->v_mount);
+         return error;
+         }
+         }*/
+
+       preallocoff = round_page(blkroundup(chmp,
+               MAX(osize, uio->uio_offset)));
+       aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+       nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
+       endallocoff = nsize - blkoff(chmp, nsize);
+
+       /*
+        * if we're increasing the file size, deal with expanding
+        * the fragment if there is one.
+        */
+
+       if (nsize > osize && lblkno(chmp, osize) < NDADDR &&
+           lblkno(chmp, osize) != lblkno(chmp, nsize) &&
+           blkroundup(chmp, osize) != osize) {
+               off_t eob;
+
+               eob = blkroundup(chmp, osize);
+               uvm_vnp_setwritesize(vp, eob);
+               error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
+               if (error)
+                       goto out;
+               if (flags & B_SYNC) {
+                       mutex_enter(vp->v_interlock);
+                       VOP_PUTPAGES(vp,
+                           trunc_page(osize & chmp->chm_fs_bmask),
+                           round_page(eob),
+                           PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+               }
+       }
+
+       while (uio->uio_resid > 0) {
+               int ubc_flags = UBC_WRITE;
+               bool overwrite; /* true if we overwrite a whole block */
+               off_t newoff;
+
+               if (ioflag & IO_DIRECT) {
+                       genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
+               }
+
+               oldoff = uio->uio_offset;
+               blkoffset = blkoff(chmp, uio->uio_offset);
+               bytelen = MIN(chmp->chm_fs_bsize - blkoffset, uio->uio_resid);
+               if (bytelen == 0) {
+                       break;
+               }
+
+               /*
+                * if we're filling in a hole, allocate the blocks now and
+                * initialize the pages first.  if we're extending the file,
+                * we can safely allocate blocks without initializing pages
+                * since the new blocks will be inaccessible until the write
+                * is complete.
+                */
+               overwrite = uio->uio_offset >= preallocoff &&
+                   uio->uio_offset < endallocoff;
+               if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
+                   blkoff(chmp, uio->uio_offset) == 0 &&
+                   (uio->uio_offset & PAGE_MASK) == 0) {
+                       vsize_t len;
+
+                       len = trunc_page(bytelen);
+                       len -= blkoff(chmp, len);
+                       if (len > 0) {
+                               overwrite = true;
+                               bytelen = len;
+                       }
+               }
+
+               newoff = oldoff + bytelen;
+               if (vp->v_size < newoff) {
+                       uvm_vnp_setwritesize(vp, newoff);
+               }
+
+               if (!overwrite) {
+                       error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
+                           cred, aflag);
+                       if (error)
+                               break;
+               } else {
+                       genfs_node_wrlock(vp);
+                       error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
+                           aflag, cred);
+                       genfs_node_unlock(vp);
+                       if (error)
+                               break;
+                       ubc_flags |= UBC_FAULTBUSY;
+               }
+
+               /*
+                * copy the data.
+                */
+
+               ubc_flags |= UBC_WANT_UNMAP(vp) ? UBC_UNMAP : 0;
+               error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+                   IO_ADV_DECODE(ioflag), ubc_flags);
+
+               /*
+                * update UVM's notion of the size now that we've
+                * copied the data into the vnode's pages.
+                *
+                * we should update the size even when uiomove failed.
+                */
+
+               if (vp->v_size < newoff) {
+                       uvm_vnp_setsize(vp, newoff);
+                       extended = 1;
+               }
+
+               if (error)
+                       break;
+
+               /*
+                * flush what we just wrote if necessary.
+                * XXXUBC simplistic async flushing.
+                */
+
+               if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+                       mutex_enter(vp->v_interlock);
+                       error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+                           (uio->uio_offset >> 16) << 16,
+                           PGO_CLEANIT | PGO_JOURNALLOCKED);
+                       if (error)
+                               break;
+               }
+       }
+out:
+       if (error == 0 && ioflag & IO_SYNC) {
+               mutex_enter(vp->v_interlock);
+               error = VOP_PUTPAGES(vp,
+                   trunc_page(origoff & chmp->chm_fs_bmask),
+                   round_page(blkroundup(chmp, uio->uio_offset)),
+                   PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+       }
+       ip->iflag |= IN_CHANGE | IN_UPDATE;
+       if (resid > uio->uio_resid && ap->a_cred &&
+           kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+               ip->mode &= ~(ISUID | ISGID);
+       }
+       if (resid > uio->uio_resid)
+               VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+       if (error) {
+               (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+               uio->uio_offset -= resid - uio->uio_resid;
+               uio->uio_resid = resid;
+       } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+               error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+
+       // XXX hack: added after observing that ip->i_size and vp->v_size can differ here
+       chfs_set_vnode_size(vp, vp->v_size);
+
+
+       //dbg("end file size (vp): %llu\n", (unsigned long long)vp->v_size);
+       //dbg("end file size (ip): %llu\n", (unsigned long long)ip->i_size);
+       KASSERT(vp->v_size == ip->size);
+       fstrans_done(vp->v_mount);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+       error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       //mutex_exit(&ip->inode_lock);
+       //dbg("end\n");
+       return (error);
+}
+
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_fsync(void *v)
+{
+       //dbg("fsync\n");
+       struct vop_fsync_args /* {
+                                struct vnode *a_vp;
+                                kauth_cred_t a_cred;
+                                int a_flags;
+                                off_t a_offlo;
+                                off_t a_offhi;
+                                } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       int wait;
+
+       if (ap->a_flags & FSYNC_CACHE) {
+               return ENODEV;
+       }
+       wait = (ap->a_flags & FSYNC_WAIT) != 0;
+       vflushbuf(vp, wait);
+       //struct chfs_inode *ip = VTOI(vp);
+       //chfs_set_vnode_size(vp, ip->write_size);
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_remove(void *v)
+{
+       struct vnode *dvp = ((struct vop_remove_args *) v)->a_dvp;
+       struct vnode *vp = ((struct vop_remove_args *) v)->a_vp;
+       struct componentname *cnp = (((struct vop_remove_args *) v)->a_cnp);
+       dbg("remove\n");
+
+       KASSERT(VOP_ISLOCKED(dvp));
+       KASSERT(VOP_ISLOCKED(vp));
+
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_inode *parent = VTOI(dvp);
+       int error = 0;
+
+       KASSERT(ip->chvc->vno != ip->chvc->pvno);
+
+       error = chfs_do_unlink(ip,
+           parent, cnp->cn_nameptr, cnp->cn_namelen);
+
+       vput(dvp);
+       vput(vp);
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_link(void *v)
+{
+       struct vnode *dvp = ((struct vop_link_args *) v)->a_dvp;
+       struct vnode *vp = ((struct vop_link_args *) v)->a_vp;
+       struct componentname *cnp = ((struct vop_link_args *) v)->a_cnp;
+
+       struct chfs_inode *ip, *parent;
+       int error = 0;
+
+       if (vp->v_type == VDIR) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EISDIR;
+               goto out;
+       }
+       if (dvp->v_mount != vp->v_mount) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EXDEV;
+               goto out;
+       }
+       if (dvp != vp && (error = vn_lock(vp, LK_EXCLUSIVE))) {
+               VOP_ABORTOP(dvp, cnp);
+               goto out;
+       }
+
+       parent = VTOI(dvp);
+       ip = VTOI(vp);
+
+       error = chfs_do_link(ip,
+           parent, cnp->cn_nameptr, cnp->cn_namelen, vp->v_type);
+
+       if (dvp != vp)
+               VOP_UNLOCK(vp);
+out:
+       vput(dvp);
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_rename(void *v)
+{
+       struct vnode *fdvp = ((struct vop_rename_args *) v)->a_fdvp;
+       struct vnode *fvp = ((struct vop_rename_args *) v)->a_fvp;
+       struct componentname *fcnp = ((struct vop_rename_args *) v)->a_fcnp;
+       struct vnode *tdvp = ((struct vop_rename_args *) v)->a_tdvp;
+       struct vnode *tvp = ((struct vop_rename_args *) v)->a_tvp;
+       struct componentname *tcnp = ((struct vop_rename_args *) v)->a_tcnp;
+
+       struct chfs_inode *oldparent, *old;
+       struct chfs_inode *newparent;
+       struct chfs_dirent *fd;//, *oldfd;
+       struct chfs_inode *ip;
+       int error = 0;
+       dbg("rename\n");
+
+       KASSERT(VOP_ISLOCKED(tdvp));
+       KASSERT(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
+
+       oldparent = VTOI(fdvp);
+       old = VTOI(fvp);
+       newparent = VTOI(tdvp);
+       if (tvp) {
+               dbg("tvp not null\n");
+               ip = VTOI(tvp);
+               if (tvp->v_type == VDIR) {
+                       //TODO: lock
+//                     fd = ip->dents;
+//                     while (fd) {
+                       TAILQ_FOREACH(fd, &ip->dents, fds) {
+                               if (fd->vno) {
+                                       //TODO: unlock
+                                       error = ENOTEMPTY;
+                                       goto out_unlocked;
+                               }
+//                             fd = fd->next;
+                       }
+                       //TODO: unlock
+               }
+               error = chfs_do_unlink(ip,
+                   newparent, tcnp->cn_nameptr, tcnp->cn_namelen);
+               vput(tvp);
+       }
+       VFS_VGET(tdvp->v_mount, old->ino, &tvp);
+       ip = VTOI(tvp);
+
+//     for (oldfd = oldparent->dents;
+//          oldfd->vno != old->ino;
+//          oldfd = oldfd->next);
+
+       error = chfs_do_link(ip,
+           newparent, tcnp->cn_nameptr, tcnp->cn_namelen, tvp->v_type);
+       error = chfs_do_unlink(old,
+           oldparent, fcnp->cn_nameptr, fcnp->cn_namelen);
+
+//out:
+//     if (fchnode != tchnode)
+//     VOP_UNLOCK(fdvp, 0);
+
+out_unlocked:
+       // Release target nodes.
+       if (tdvp == tvp)
+               vrele(tdvp);
+       else
+               vput(tdvp);
+       if (tvp != NULL)
+               vput(tvp);
+
+       // Release source nodes.
+       vrele(fdvp);
+       vrele(fvp);
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_mkdir(void *v)
+{
+       struct vnode *dvp = ((struct vop_mkdir_args *) v)->a_dvp;
+       struct vnode **vpp = ((struct vop_mkdir_args *)v)->a_vpp;
+       struct componentname *cnp = ((struct vop_mkdir_args *) v)->a_cnp;
+       struct vattr *vap = ((struct vop_mkdir_args *) v)->a_vap;
+       dbg("mkdir()\n");
+
+       int mode;
+
+       mode = vap->va_mode & ACCESSPERMS;
+       if ((mode & IFMT) == 0) {
+               mode |= IFDIR;
+       }
+
+       KASSERT(vap->va_type == VDIR);
+
+       return chfs_makeinode(mode, dvp, vpp, cnp, VDIR);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_rmdir(void *v)
+{
+       struct vnode *dvp = ((struct vop_rmdir_args *) v)->a_dvp;
+       struct vnode *vp = ((struct vop_rmdir_args *) v)->a_vp;
+       struct componentname *cnp = ((struct vop_rmdir_args *) v)->a_cnp;
+       dbg("rmdir()\n");
+
+       KASSERT(VOP_ISLOCKED(dvp));
+       KASSERT(VOP_ISLOCKED(vp));
+
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_inode *parent = VTOI(dvp);
+       struct chfs_dirent *fd;
+       int error = 0;
+
+       if (vp->v_type != VDIR) {
+               error = ENOTDIR;
+               goto out;
+       }
+
+       KASSERT(ip->chvc->vno != ip->chvc->pvno);
+
+//     for (fd = ip->dents; fd; fd = fd->next) {
+       TAILQ_FOREACH(fd, &ip->dents, fds) {
+               if (fd->vno) {
+                       error = ENOTEMPTY;
+                       goto out;
+               }
+       }
+
+       error = chfs_do_unlink(ip,
+           parent, cnp->cn_nameptr, cnp->cn_namelen);
+
+out:
+       vput(dvp);
+       vput(vp);
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_symlink(void *v)
+{
+       struct vnode *dvp = ((struct vop_symlink_args *) v)->a_dvp;
+       struct vnode **vpp = ((struct vop_symlink_args *) v)->a_vpp;
+       struct componentname *cnp = ((struct vop_symlink_args *) v)->a_cnp;
+       struct vattr *vap = ((struct vop_symlink_args *) v)->a_vap;
+       char *target = ((struct vop_symlink_args *) v)->a_target;
+
+       struct ufsmount *ump;
+       struct chfs_mount *chmp;
+       struct vnode *vp;
+       struct chfs_inode *ip;
+       int len, err;
+       struct chfs_full_dnode *fd;
+       struct buf *bp;
+       dbg("symlink()\n");
+
+       ump = VFSTOUFS(dvp->v_mount);
+       chmp = ump->um_chfs;
+
+       err = chfs_makeinode(IFLNK | vap->va_mode, dvp, vpp, cnp, VLNK);
+       if (err)
+               return (err);
+       VN_KNOTE(dvp, NOTE_WRITE);
+       vp = *vpp;
+       len = strlen(target);
+       ip = VTOI(vp);
+       /* TODO max symlink len instead of "100" */
+       if (len < 100) {
+               ip->target = kmem_alloc(len, KM_SLEEP);
+               memcpy(ip->target, target, len);
+               chfs_set_vnode_size(vp, len);
+               ip->iflag |= IN_CHANGE | IN_UPDATE;
+
+               bp = getiobuf(vp, true);
+               bp->b_bufsize = bp->b_resid = len;
+               bp->b_data = kmem_alloc(len, KM_SLEEP);
+               memcpy(bp->b_data, target, len);
+               bp->b_blkno = 0;
+
+               fd = chfs_alloc_full_dnode();
+
+               mutex_enter(&chmp->chm_lock_mountfields);
+
+               err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+               if (err) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       goto out;
+               }
+
+               err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+               if (err) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       goto out;
+               }
+
+               mutex_exit(&chmp->chm_lock_mountfields);
+
+               kmem_free(bp->b_data, len);
+               putiobuf(bp);
+
+               uvm_vnp_setsize(vp, len);
+       } else {
+               err = vn_rdwr(UIO_WRITE, vp, target, len, (off_t)0,
+                   UIO_SYSSPACE, IO_NODELOCKED, cnp->cn_cred,
+                   (size_t *)0, NULL);
+       }
+
+out:
+       if (err)
+               vput(vp);
+
+       return (err);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_readdir(void *v)
+{
+       struct vnode *vp = ((struct vop_readdir_args *) v)->a_vp;
+       struct uio *uio = ((struct vop_readdir_args *) v)->a_uio;
+       int *eofflag = ((struct vop_readdir_args *) v)->a_eofflag;
+
+       int error = 0;
+       off_t skip, offset;
+       struct chfs_inode *ip;
+       struct chfs_dirent *fd;
+
+       struct ufsmount *ump;
+       struct chfs_mount *chmp;
+       struct chfs_vnode_cache *chvc;
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       /* This operation only makes sense on directory nodes. */
+       if (vp->v_type != VDIR) {
+               error = ENOTDIR;
+               goto out;
+       }
+
+       ip = VTOI(vp);
+
+       /*
+        * uiomove in chfs_filldir automatically increments the
+        * uio_offset by an arbitrary size, so we discard any change
+        * to uio_offset and set it to our own value on return.
+        */
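+
+       /*
+        * The offsets are virtual: CHFS_OFFSET_DOT and CHFS_OFFSET_DOTDOT
+        * select the two synthetic entries, the n-th real dirent is
+        * reported at CHFS_OFFSET_FIRST + n, and CHFS_OFFSET_EOF marks
+        * the end of the directory; the skip computation below relies on
+        * this layout.
+        */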
+       offset = uio->uio_offset;
+
+       if (offset == CHFS_OFFSET_DOT) {
+               error = chfs_filldir(uio, ip->ino, ".", 1, VDIR);
+               if (error == -1) {
+                       error = 0;
+                       goto outok;
+               } else if (error != 0)
+                       goto outok;
+
+               offset = CHFS_OFFSET_DOTDOT;
+       }
+
+       if (offset == CHFS_OFFSET_DOTDOT) {
+               ump = VFSTOUFS(vp->v_mount);
+               chmp = ump->um_chfs;
+               mutex_enter(&chmp->chm_lock_vnocache);
+               chvc = chfs_vnode_cache_get(chmp, ip->ino);
+               mutex_exit(&chmp->chm_lock_vnocache);
+
+               error = chfs_filldir(uio, chvc->pvno, "..", 2, VDIR);
+               if (error == -1) {
+                       error = 0;
+                       goto outok;
+               } else if (error != 0) {
+                       goto outok;
+               }
+
+               if (TAILQ_EMPTY(&ip->dents)) {
+                       offset = CHFS_OFFSET_EOF;
+               } else {
+                       offset = CHFS_OFFSET_FIRST;
+               }
+       }
+
+       if (offset != CHFS_OFFSET_EOF) {
+               skip = offset - CHFS_OFFSET_FIRST;
+
+               TAILQ_FOREACH(fd, &ip->dents, fds) {
+                       /* seek to offset by skipping items */
+                       /* XXX race conditions by changed dirent? */
+                       if (skip > 0) {
+                               skip--;
+                               continue;
+                       }
+
+                       if (fd->vno != 0) {
+                               error = chfs_filldir(uio, fd->vno,
+                                   fd->name, fd->nsize, fd->type);
+                               if (error == -1) {
+                                       error = 0;
+                                       goto outok;
+                               } else if (error != 0) {
+                                       dbg("err %d\n", error);
+                                       goto outok;
+                               }
+                       }
+                       offset++;
+               }
+       }
+       offset = CHFS_OFFSET_EOF;
+
+outok:
+       uio->uio_offset = offset;
+
+       if (eofflag != NULL) {
+               *eofflag = (error == 0 &&
+                   uio->uio_offset == CHFS_OFFSET_EOF);
+       }
+
+out:
+       KASSERT(VOP_ISLOCKED(vp));
+
+       return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_readlink(void *v)
+{
+
+       struct vnode *vp = ((struct vop_readlink_args *) v)->a_vp;
+       struct uio *uio = ((struct vop_readlink_args *) v)->a_uio;
+       kauth_cred_t cred = ((struct vop_readlink_args *) v)->a_cred;
+
+       struct chfs_inode *ip = VTOI(vp);
+
+       dbg("readlink()\n");
+
+       /* TODO max symlink len instead of "100" */
+       if (ip->size < 100) {
+               uiomove(ip->target, ip->size, uio);
+               return (0);
+       }
+
+       return (VOP_READ(vp, uio, 0, cred));
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_inactive(void *v)
+{
+       struct vnode *vp = ((struct vop_inactive_args *) v)->a_vp;
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_vnode_cache *chvc;
+       dbg("inactive | vno: %llu\n", (unsigned long long)ip->ino);
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       if (ip->ino) {
+               chvc = ip->chvc;
+               if (chvc->nlink)
+                       *((struct vop_inactive_args *) v)->a_recycle = 0;
+       } else {
+               *((struct vop_inactive_args *) v)->a_recycle = 1;
+       }
+
+       VOP_UNLOCK(vp);
+
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_reclaim(void *v)
+{
+       struct vop_reclaim_args *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_mount *chmp = ip->chmp;
+       struct chfs_dirent *fd;
+
+       //dbg("reclaim() | ino: %llu\n", (unsigned long long)ip->ino);
+       //mutex_enter(&ip->inode_lock);
+
+       mutex_enter(&chmp->chm_lock_vnocache);
+       chfs_vnode_cache_set_state(chmp,
+           ip->chvc, VNO_STATE_CHECKEDABSENT);
+       mutex_exit(&chmp->chm_lock_vnocache);
+
+       chfs_update(vp, NULL, NULL, UPDATE_CLOSE);
+
+       if (vp->v_type == VREG || vp->v_type == VLNK || vp->v_type == VCHR ||
+           vp->v_type == VBLK || vp->v_type == VFIFO || vp->v_type == VSOCK)
+               chfs_kill_fragtree(&ip->fragtree);
+
+       fd = TAILQ_FIRST(&ip->dents);
+       while (fd) {
+               TAILQ_REMOVE(&ip->dents, fd, fds);
+               chfs_free_dirent(fd);
+               fd = TAILQ_FIRST(&ip->dents);
+       }
+       //mutex_exit(&ip->inode_lock);
+       //mutex_destroy(&ip->inode_lock);
+
+       cache_purge(vp);
+       if (ip->devvp) {
+               vrele(ip->devvp);
+               ip->devvp = 0;
+       }
+       chfs_ihashrem(ip);
+
+       genfs_node_destroy(vp);
+       pool_put(&chfs_inode_pool, vp->v_data);
+       vp->v_data = NULL;
+       return (0);
+}
+
+/* --------------------------------------------------------------------- */
+
+int
+chfs_advlock(void *v)
+{
+       //struct vnode *vp = ((struct vop_advlock_args *) v)->a_vp;
+       dbg("advlock()\n");
+       /*
+         struct chfs_node *node;
+
+         node = VP_TO_CHFS_NODE(vp);
+
+         return lf_advlock(v, &node->chn_lockf, node->chn_size);
+       */
+       return 0;
+}
+
+/* --------------------------------------------------------------------- */
+int
+chfs_strategy(void *v)
+{
+       struct vop_strategy_args /* {
+                                   const struct vnodeop_desc *a_desc;
+                                   struct vnode *a_vp;
+                                   struct buf *a_bp;
+                                   } */ *ap = v;
+       struct chfs_full_dnode *fd;
+       struct buf *bp = ap->a_bp;
+       struct vnode *vp = ap->a_vp;
+       struct chfs_inode *ip = VTOI(vp);
+       struct chfs_mount *chmp = ip->chmp;
+       int read = (bp->b_flags & B_READ) ? 1 : 0;
+       int err = 0;
+
+/*     dbg("bp dump:\n");
+       dbg("   ->b_bcount: %d\n", bp->b_bcount);
+       dbg("   ->b_resid:  %d\n", bp->b_resid);
+       dbg("   ->b_blkno:  %llu\n", (unsigned long long)bp->b_blkno);
+       dbg("   ->b_error:  %d\n", bp->b_error);*/
+       if (read) {
+               err = chfs_read_data(chmp, vp, bp);
+       } else {
+               fd = chfs_alloc_full_dnode();
+
+               mutex_enter(&chmp->chm_lock_mountfields);
+
+               err = chfs_write_flash_dnode(chmp, vp, bp, fd);
+               if (err) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       goto out;
+               }
+
+               err = chfs_add_full_dnode_to_inode(chmp, ip, fd);
+               /*if (err) {
+                       mutex_exit(&chmp->chm_lock_mountfields);
+                       goto out;
+               }*/
+
+               mutex_exit(&chmp->chm_lock_mountfields);
+       }
+out:
+       biodone(bp);
+       //dbg("end\n");
+       return err;
+}
+
+int
+chfs_bmap(void *v)
+{
+       struct vop_bmap_args /* {
+                               struct vnode *a_vp;
+                               daddr_t  a_bn;
+                               struct vnode **a_vpp;
+                               daddr_t *a_bnp;
+                               int *a_runp;
+                               int *a_runb;
+                               } */ *ap = v;
+       if (ap->a_vpp != NULL)
+               *ap->a_vpp = ap->a_vp;
+       if (ap->a_bnp != NULL)
+               *ap->a_bnp = ap->a_bn;
+       if (ap->a_runp != NULL)
+               *ap->a_runp = 0;
+       return (0);
+}
+
+/*
+ * vnode operations vector used for files stored in a chfs file system.
+ */
+int
+(**chfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc chfs_vnodeop_entries[] =
+       {
+               { &vop_default_desc, vn_default_error },
+               { &vop_lookup_desc, chfs_lookup },
+               { &vop_create_desc, chfs_create },
+               { &vop_mknod_desc, chfs_mknod },
+               { &vop_open_desc, chfs_open },
+               { &vop_close_desc, chfs_close },
+               { &vop_access_desc, chfs_access },
+               { &vop_getattr_desc, chfs_getattr },
+               { &vop_setattr_desc, chfs_setattr },
+               { &vop_read_desc, chfs_read },
+               { &vop_write_desc, chfs_write },
+               { &vop_ioctl_desc, genfs_enoioctl },
+               { &vop_fcntl_desc, genfs_fcntl },
+               { &vop_poll_desc, genfs_poll },
+               { &vop_kqfilter_desc, genfs_kqfilter },
+               { &vop_revoke_desc, genfs_revoke },
+               { &vop_mmap_desc, genfs_mmap },
+               { &vop_fsync_desc, chfs_fsync },
+               { &vop_seek_desc, genfs_seek },
+               { &vop_remove_desc, chfs_remove },
+               { &vop_link_desc, chfs_link },
+               { &vop_rename_desc, chfs_rename },
+               { &vop_mkdir_desc, chfs_mkdir },
+               { &vop_rmdir_desc, chfs_rmdir },
+               { &vop_symlink_desc, chfs_symlink },
+               { &vop_readdir_desc, chfs_readdir },
+               { &vop_readlink_desc, chfs_readlink },
+               { &vop_abortop_desc, genfs_abortop },
+               { &vop_inactive_desc, chfs_inactive },
+               { &vop_reclaim_desc, chfs_reclaim },
+               { &vop_lock_desc, genfs_lock },
+               { &vop_unlock_desc, genfs_unlock },
+               { &vop_bmap_desc, chfs_bmap },
+               { &vop_strategy_desc, chfs_strategy },
+               { &vop_print_desc, ufs_print },
+               { &vop_pathconf_desc, ufs_pathconf },
+               { &vop_islocked_desc, genfs_islocked },
+               { &vop_advlock_desc, chfs_advlock },
+               { &vop_bwrite_desc, vn_bwrite },
+               { &vop_getpages_desc, genfs_getpages },
+               { &vop_putpages_desc, genfs_putpages },
+               { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_vnodeop_opv_desc =
+       { &chfs_vnodeop_p, chfs_vnodeop_entries };
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * vnode operations vector used for special devices stored in a chfs
+ * file system.
+ */
+int
+(**chfs_specop_p)(void *);
+const struct vnodeopv_entry_desc chfs_specop_entries[] =
+       {
+               { &vop_default_desc, vn_default_error },
+               { &vop_lookup_desc, spec_lookup },
+               { &vop_create_desc, spec_create },
+               { &vop_mknod_desc, spec_mknod },
+               { &vop_open_desc, spec_open },
+               { &vop_close_desc, ufsspec_close },
+               { &vop_access_desc, chfs_access },
+               { &vop_getattr_desc, chfs_getattr },
+               { &vop_setattr_desc, chfs_setattr },
+               { &vop_read_desc, chfs_read },
+               { &vop_write_desc, chfs_write },
+               { &vop_ioctl_desc, spec_ioctl },
+               { &vop_fcntl_desc, genfs_fcntl },
+               { &vop_poll_desc, spec_poll },
+               { &vop_kqfilter_desc, spec_kqfilter },
+               { &vop_revoke_desc, spec_revoke },
+               { &vop_mmap_desc, spec_mmap },
+               { &vop_fsync_desc, spec_fsync },
+               { &vop_seek_desc, spec_seek },
+               { &vop_remove_desc, spec_remove },
+               { &vop_link_desc, spec_link },
+               { &vop_rename_desc, spec_rename },
+               { &vop_mkdir_desc, spec_mkdir },
+               { &vop_rmdir_desc, spec_rmdir },
+               { &vop_symlink_desc, spec_symlink },
+               { &vop_readdir_desc, spec_readdir },
+               { &vop_readlink_desc, spec_readlink },
+               { &vop_abortop_desc, spec_abortop },
+               { &vop_inactive_desc, chfs_inactive },
+               { &vop_reclaim_desc, chfs_reclaim },
+               { &vop_lock_desc, genfs_lock },
+               { &vop_unlock_desc, genfs_unlock },
+               { &vop_bmap_desc, spec_bmap },
+               { &vop_strategy_desc, spec_strategy },
+               { &vop_print_desc, ufs_print },
+               { &vop_pathconf_desc, spec_pathconf },
+               { &vop_islocked_desc, genfs_islocked },
+               { &vop_advlock_desc, spec_advlock },
+               { &vop_bwrite_desc, vn_bwrite },
+               { &vop_getpages_desc, spec_getpages },
+               { &vop_putpages_desc, spec_putpages },
+               { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_specop_opv_desc =
+       { &chfs_specop_p, chfs_specop_entries };
+
+/* --------------------------------------------------------------------- */
+/*
+ * vnode operations vector used for fifos stored in a chfs file system.
+ */
+int
+(**chfs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc chfs_fifoop_entries[] =
+       {
+               { &vop_default_desc, vn_default_error },
+               { &vop_lookup_desc, vn_fifo_bypass },
+               { &vop_create_desc, vn_fifo_bypass },
+               { &vop_mknod_desc, vn_fifo_bypass },
+               { &vop_open_desc, vn_fifo_bypass },
+               { &vop_close_desc, ufsfifo_close },
+               { &vop_access_desc, chfs_access },
+               { &vop_getattr_desc, chfs_getattr },
+               { &vop_setattr_desc, chfs_setattr },
+               { &vop_read_desc, ufsfifo_read },
+               { &vop_write_desc, ufsfifo_write },
+               { &vop_ioctl_desc, vn_fifo_bypass },
+               { &vop_fcntl_desc, genfs_fcntl },
+               { &vop_poll_desc, vn_fifo_bypass },
+               { &vop_kqfilter_desc, vn_fifo_bypass },
+               { &vop_revoke_desc, vn_fifo_bypass },
+               { &vop_mmap_desc, vn_fifo_bypass },
+               { &vop_fsync_desc, vn_fifo_bypass },
+               { &vop_seek_desc, vn_fifo_bypass },
+               { &vop_remove_desc, vn_fifo_bypass },
+               { &vop_link_desc, vn_fifo_bypass },
+               { &vop_rename_desc, vn_fifo_bypass },
+               { &vop_mkdir_desc, vn_fifo_bypass },
+               { &vop_rmdir_desc, vn_fifo_bypass },
+               { &vop_symlink_desc, vn_fifo_bypass },
+               { &vop_readdir_desc, vn_fifo_bypass },
+               { &vop_readlink_desc, vn_fifo_bypass },
+               { &vop_abortop_desc, vn_fifo_bypass },
+               { &vop_inactive_desc, chfs_inactive },
+               { &vop_reclaim_desc, chfs_reclaim },
+               { &vop_lock_desc, genfs_lock },
+               { &vop_unlock_desc, genfs_unlock },
+               { &vop_bmap_desc, vn_fifo_bypass },
+               { &vop_strategy_desc, vn_fifo_bypass },
+               { &vop_print_desc, ufs_print },
+               { &vop_pathconf_desc, vn_fifo_bypass },
+               { &vop_islocked_desc, genfs_islocked },
+               { &vop_advlock_desc, vn_fifo_bypass },
+               { &vop_bwrite_desc, genfs_nullop },
+               { &vop_getpages_desc, genfs_badop },
+               { &vop_putpages_desc, vn_fifo_bypass },
+               { NULL, NULL } };
+
+const struct vnodeopv_desc chfs_fifoop_opv_desc =
+       { &chfs_fifoop_p, chfs_fifoop_entries };
diff --git a/sys/ufs/chfs/chfs_wbuf.c b/sys/ufs/chfs/chfs_wbuf.c
new file mode 100644 (file)
index 0000000..c9823a6
--- /dev/null
@@ -0,0 +1,259 @@
+/*     $NetBSD: chfs_wbuf.c,v 1.2 2011/11/24 20:50:33 agc Exp $        */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <dev/flash/flash.h>
+#include <sys/uio.h>
+#include "chfs.h"
+
+#define DBG_WBUF 1
+
+#define PAD(x) (((x)+3)&~3)
+
+#define EB_ADDRESS(x) ( ((unsigned long)(x) / chmp->chm_ebh->eb_size) * chmp->chm_ebh->eb_size )
+
+#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(chmp->chm_wbuf_pagesize)) * (unsigned long)(chmp->chm_wbuf_pagesize) )
+#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(chmp->chm_wbuf_pagesize) )
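+
+/*
+ * Worked example (illustrative values only): with a 64 KiB eraseblock
+ * and a 512-byte write buffer page,
+ *
+ *   PAD(5)            == (5 + 3) & ~3            == 8
+ *   EB_ADDRESS(70000) == (70000 / 65536) * 65536 == 65536
+ *   PAGE_DIV(1300)    == (1300 / 512) * 512      == 1024
+ *   PAGE_MOD(1300)    == 1300 % 512              == 276
+ */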
+
+/*
+// test functions
+int wbuf_test(void);
+void wbuf_test_erase_flash(struct chfs_mount*);
+void wbuf_test_callback(struct erase_instruction*);
+*/
+
+#define NOPAD  0
+#define SETPAD 1
+
+
+/**
+ * chfs_flush_wbuf - write wbuf to the flash
+ * @chmp: super block info
+ * @pad: padding (NOPAD / SETPAD)
+ * Returns zero in case of success.
+ */
+static int
+chfs_flush_wbuf(struct chfs_mount *chmp, int pad)
+{
+       int ret=0;
+       size_t retlen = 0;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT(rw_write_held(&chmp->chm_lock_wbuf));
+
+       if (pad) {
+               chmp->chm_wbuf_len = PAD(chmp->chm_wbuf_len);
+               memset(chmp->chm_wbuf + chmp->chm_wbuf_len, 0, chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len);
+
+               struct chfs_flash_padding_node* padnode = (void*)(chmp->chm_wbuf + chmp->chm_wbuf_len);
+               padnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+               padnode->type = htole16(CHFS_NODETYPE_PADDING);
+               padnode->length = htole32(chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len);
+               padnode->hdr_crc = htole32(crc32(0, (uint8_t *)padnode, sizeof(*padnode)-4));
+
+               struct chfs_node_ref *nref;
+               nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+               nref->nref_offset = chmp->chm_wbuf_ofs + chmp->chm_wbuf_len;
+               nref->nref_offset = CHFS_GET_OFS(nref->nref_offset) |
+                   CHFS_OBSOLETE_NODE_MASK;
+               chmp->chm_wbuf_len = chmp->chm_wbuf_pagesize;
+
+               chfs_change_size_free(chmp, chmp->chm_nextblock, -padnode->length);
+               chfs_change_size_wasted(chmp, chmp->chm_nextblock, padnode->length);
+       }
+
+       ret = chfs_write_leb(chmp, chmp->chm_nextblock->lnr, chmp->chm_wbuf, chmp->chm_wbuf_ofs, chmp->chm_wbuf_len, &retlen);
+       if (ret) {
+               return ret;
+       }
+
+       memset(chmp->chm_wbuf, 0xff, chmp->chm_wbuf_pagesize);
+       chmp->chm_wbuf_ofs += chmp->chm_wbuf_pagesize;
+       chmp->chm_wbuf_len = 0;
+       return 0;
+}
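+
+/*
+ * For example (assuming a 512-byte wbuf page): if chm_wbuf_len is 100
+ * when chfs_flush_wbuf() is called with SETPAD, a padding node is placed
+ * at buffer offset 100 with length 412, chm_wbuf_len grows to the full
+ * 512 bytes, and the whole page is written out.
+ */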
+
+
+/**
+ * chfs_fill_wbuf - write to wbuf
+ * @chmp: super block info
+ * @buf: buffer
+ * @len: buffer length
+ * Returns the number of bytes actually copied into the wbuf.
+ */
+static size_t
+chfs_fill_wbuf(struct chfs_mount *chmp, const u_char *buf, size_t len)
+{
+       if (len && !chmp->chm_wbuf_len && (len >= chmp->chm_wbuf_pagesize)) {
+               return 0;
+       }
+       if (len > (chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len)) {
+               len = chmp->chm_wbuf_pagesize - chmp->chm_wbuf_len;
+       }
+       memcpy(chmp->chm_wbuf + chmp->chm_wbuf_len, buf, len);
+
+       chmp->chm_wbuf_len += (int) len;
+       return len;
+}
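+
+/*
+ * For example (assuming a 512-byte wbuf page): with chm_wbuf_len at 500,
+ * chfs_fill_wbuf(chmp, buf, 100) copies the 12 bytes that still fit and
+ * returns 12, so the caller advances buf and tries again; with an empty
+ * wbuf and len >= 512 it returns 0, telling the caller to write whole
+ * pages directly with chfs_write_leb().
+ */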
+
+/**
+ * chfs_write_wbuf - write to wbuf and then the flash
+ * @chmp: super block info
+ * @invecs: io vectors
+ * @count: num of vectors
+ * @to: offset of target
+ * @retlen: number of bytes written
+ * Returns zero in case of success.
+ */
+int
+chfs_write_wbuf(struct chfs_mount* chmp, const struct iovec *invecs, long count,
+    off_t to, size_t *retlen)
+{
+       int invec, ret = 0;
+       size_t wbuf_retlen, donelen = 0;
+       int outvec_to = to;
+
+       int lnr = chmp->chm_nextblock->lnr;
+
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       KASSERT(mutex_owned(&chmp->chm_lock_sizes));
+       KASSERT(!rw_write_held(&chmp->chm_lock_wbuf));
+
+       rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+
+       //dbg("1. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
+       if (chmp->chm_wbuf_ofs == 0xffffffff) {
+               chmp->chm_wbuf_ofs = PAGE_DIV(to);
+               chmp->chm_wbuf_len = PAGE_MOD(to);
+               memset(chmp->chm_wbuf, 0xff, chmp->chm_wbuf_pagesize);
+       }
+
+       //dbg("2. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
+       if (EB_ADDRESS(to) != EB_ADDRESS(chmp->chm_wbuf_ofs)) {
+               if (chmp->chm_wbuf_len) {
+                       ret = chfs_flush_wbuf(chmp, SETPAD);
+                       if (ret)
+                               goto outerr;
+               }
+               chmp->chm_wbuf_ofs = PAGE_DIV(to);
+               chmp->chm_wbuf_len = PAGE_MOD(to);
+       }
+
+       //dbg("3. wbuf ofs: %zu, len: %zu\n", chmp->chm_wbuf_ofs, chmp->chm_wbuf_len);
+
+       if (to != PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len)) {
+               dbg("to: %llu != %zu\n", (unsigned long long)to,
+                       PAD(chmp->chm_wbuf_ofs + chmp->chm_wbuf_len));
+               dbg("Non-contiguous write\n");
+               panic("BUG\n");
+       }
+
+       /* adjust alignment offset */
+       if (chmp->chm_wbuf_len != PAGE_MOD(to)) {
+               chmp->chm_wbuf_len = PAGE_MOD(to);
+               /* take care of alignment to the next page */
+               if (!chmp->chm_wbuf_len) {
+                       chmp->chm_wbuf_len += chmp->chm_wbuf_pagesize;
+                       ret = chfs_flush_wbuf(chmp, NOPAD);
+                       if (ret)
+                               goto outerr;
+               }
+       }
+
+       for (invec = 0; invec < count; invec++) {
+               int vlen = invecs[invec].iov_len;
+               u_char* v = invecs[invec].iov_base;
+
+               //dbg("invec:%d len:%d\n", invec, vlen);
+
+               wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen);
+               if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) {
+                       ret = chfs_flush_wbuf(chmp, NOPAD);
+                       if (ret) {
+                               goto outerr;
+                       }
+               }
+               vlen -= wbuf_retlen;
+               outvec_to += wbuf_retlen;
+               v += wbuf_retlen;
+               donelen += wbuf_retlen;
+               if (vlen >= chmp->chm_wbuf_pagesize) {
+                       ret = chfs_write_leb(chmp, lnr, v, outvec_to, PAGE_DIV(vlen), &wbuf_retlen);
+                       if (ret)
+                               goto outerr;
+                       //dbg("fd->write: %zu\n", wbuf_retlen);
+                       vlen -= wbuf_retlen;
+                       outvec_to += wbuf_retlen;
+                       chmp->chm_wbuf_ofs = outvec_to;
+                       v += wbuf_retlen;
+                       donelen += wbuf_retlen;
+               }
+               wbuf_retlen = chfs_fill_wbuf(chmp, v, vlen);
+               if (chmp->chm_wbuf_len == chmp->chm_wbuf_pagesize) {
+                       ret = chfs_flush_wbuf(chmp, NOPAD);
+                       if (ret)
+                               goto outerr;
+               }
+
+               // if we write the last vector, we flush with padding
+               /*if (invec == count-1) {
+                 ret = chfs_flush_wbuf(chmp, SETPAD);
+                 if (ret)
+                 goto outerr;
+                 }*/
+               outvec_to += wbuf_retlen;
+               donelen += wbuf_retlen;
+       }
+       *retlen = donelen;
+       rw_exit(&chmp->chm_lock_wbuf);
+       return ret;
+
+outerr:
+       *retlen = 0;
+       rw_exit(&chmp->chm_lock_wbuf);
+       return ret;
+}
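+
+/*
+ * Caller sketch (illustrative, modelled on the callers in chfs_write.c):
+ * a node is written as a gathered I/O of a header vector and a data
+ * vector, and both the error code and retlen are checked.
+ *
+ *     struct iovec vec[2];
+ *     vec[0].iov_base = &hdr;  vec[0].iov_len = sizeof(hdr);
+ *     vec[1].iov_base = data;  vec[1].iov_len = datalen;
+ *     err = chfs_write_wbuf(chmp, vec, 2, ofs, &retlen);
+ *     if (err || retlen != sizeof(hdr) + datalen)
+ *             // treat as a failed write: mark the space dirty and retry
+ */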
+
+int
+chfs_flush_pending_wbuf(struct chfs_mount *chmp)
+{
+       //dbg("flush pending wbuf\n");
+       int err;
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+       mutex_enter(&chmp->chm_lock_sizes);
+       rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+       err = chfs_flush_wbuf(chmp, SETPAD);
+       rw_exit(&chmp->chm_lock_wbuf);
+       mutex_exit(&chmp->chm_lock_sizes);
+       return err;
+}
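+
+/*
+ * Lock ordering sketch, as implied by the assertions and the function
+ * above (illustrative only): chm_lock_mountfields first, then
+ * chm_lock_sizes, then the wbuf rwlock.
+ *
+ *     mutex_enter(&chmp->chm_lock_mountfields);
+ *     mutex_enter(&chmp->chm_lock_sizes);
+ *     rw_enter(&chmp->chm_lock_wbuf, RW_WRITER);
+ *     ... write or flush ...
+ *     rw_exit(&chmp->chm_lock_wbuf);
+ *     mutex_exit(&chmp->chm_lock_sizes);
+ *     mutex_exit(&chmp->chm_lock_mountfields);
+ */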
diff --git a/sys/ufs/chfs/chfs_write.c b/sys/ufs/chfs/chfs_write.c
new file mode 100644 (file)
index 0000000..0838ed9
--- /dev/null
@@ -0,0 +1,545 @@
+/*     $NetBSD: chfs_write.c,v 1.2 2011/11/24 21:09:37 agc Exp $       */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2010 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2010 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * chfs_write.c
+ *
+ *  Created on: 2010.02.17.
+ *      Author: dtengeri
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+
+#include "chfs.h"
+
+int
+chfs_write_flash_vnode(struct chfs_mount *chmp,
+    struct chfs_inode *ip, int prio)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       struct chfs_flash_vnode *fvnode;
+       struct chfs_vnode_cache* chvc;
+       struct chfs_node_ref *nref;
+       struct iovec vec;
+       size_t size, retlen;
+       int err = 0, retries = 0;
+
+       if (ip->ino == CHFS_ROOTINO)
+               return 0;
+
+       fvnode = chfs_alloc_flash_vnode();
+       if (!fvnode)
+               return ENOMEM;
+
+       chvc = ip->chvc;
+
+       /* setting up flash_vnode members */
+       size = sizeof(*fvnode);
+       //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size));
+       fvnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+       fvnode->type = htole16(CHFS_NODETYPE_VNODE);
+       fvnode->length = htole32(CHFS_PAD(size));
+       fvnode->hdr_crc = htole32(crc32(0, (uint8_t *)fvnode,
+               CHFS_NODE_HDR_SIZE - 4));
+       fvnode->vno = htole64(ip->ino);
+       fvnode->version = htole64(++ip->chvc->highest_version);
+       fvnode->mode = htole32(ip->mode);
+       fvnode->dn_size = htole32(ip->size);
+       fvnode->atime = htole32(ip->atime);
+       fvnode->ctime = htole32(ip->ctime);
+       fvnode->mtime = htole32(ip->mtime);
+       fvnode->gid = htole32(ip->gid);
+       fvnode->uid = htole32(ip->uid);
+       fvnode->node_crc = htole32(crc32(0, (uint8_t *)fvnode, size - 4));
+
+       /* write out flash_vnode */
+retry:
+       if (prio == ALLOC_GC) {
+               /* the GC calls this function */
+               err = chfs_reserve_space_gc(chmp, CHFS_PAD(size));
+               if (err)
+                       goto out;
+       } else {
+               chfs_gc_trigger(chmp);
+               if (prio == ALLOC_NORMAL)
+                       err = chfs_reserve_space_normal(chmp,
+                           CHFS_PAD(size), ALLOC_NORMAL);
+               else
+                       err = chfs_reserve_space_normal(chmp,
+                           CHFS_PAD(size), ALLOC_DELETION);
+               if (err)
+                       goto out;
+       }
+
+       nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+       if (!nref) {
+               err = ENOMEM;
+               goto out;
+       }
+
+       mutex_enter(&chmp->chm_lock_sizes);
+
+       nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+       chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size));
+       vec.iov_base = fvnode;
+       vec.iov_len = CHFS_PAD(size);
+       err = chfs_write_wbuf(chmp, &vec, 1, nref->nref_offset, &retlen);
+       if (err || retlen != CHFS_PAD(size)) {
+               chfs_err("error while writing out flash vnode to the media\n");
+               chfs_err("err: %d | size: %zu | retlen : %zu\n",
+                   err, CHFS_PAD(size), retlen);
+               chfs_change_size_dirty(chmp,
+                   chmp->chm_nextblock, CHFS_PAD(size));
+               if (retries) {
+                       err = EIO;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               retries++;
+               mutex_exit(&chmp->chm_lock_sizes);
+               goto retry;
+       }
+       // Everything went well
+       chfs_change_size_used(chmp,
+           &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+       mutex_exit(&chmp->chm_lock_sizes);
+       
+       chfs_add_vnode_ref_to_vc(chmp, chvc, nref);
+       KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+out:
+       chfs_free_flash_vnode(fvnode);
+       return err;
+}
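+
+/*
+ * The write-out above follows a pattern that every node writer in this
+ * file repeats; condensed (illustrative only):
+ *
+ *     reserve space (GC or normal path, depending on prio);
+ *     nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+ *     under chm_lock_sizes: subtract the node's size from free space;
+ *     err = chfs_write_wbuf(...);
+ *     on failure: mark the space dirty, retry once, then fail with EIO;
+ *     on success: account the space as used and publish the nref.
+ */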
+
+int
+chfs_write_flash_dirent(struct chfs_mount *chmp, struct chfs_inode *pdir,
+    struct chfs_inode *ip, struct chfs_dirent *fd,
+    ino_t ino, int prio)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       struct chfs_flash_dirent_node *fdirent;
+       struct chfs_node_ref *nref;
+       struct iovec vec[2];
+       size_t size, retlen;
+       int err = 0, retries = 0;
+       uint8_t *name;
+       size_t namelen;
+
+       KASSERT(fd->vno != CHFS_ROOTINO);
+
+       fdirent = chfs_alloc_flash_dirent();
+       if (!fdirent)
+               return ENOMEM;
+
+       size = sizeof(*fdirent) + fd->nsize;
+       namelen = CHFS_PAD(size) - sizeof(*fdirent);
+
+       name = kmem_zalloc(namelen, KM_SLEEP);
+       memcpy(name, fd->name, fd->nsize);
+       //dbg("namelen: %zu | nsize: %hhu\n", namelen, fd->nsize);
+
+
+       //dbg("size: %zu | PADDED: %zu\n", size, CHFS_PAD(size));
+       fdirent->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+       fdirent->type = htole16(CHFS_NODETYPE_DIRENT);
+       fdirent->length = htole32(CHFS_PAD(size));
+       fdirent->hdr_crc = htole32(crc32(0, (uint8_t *)fdirent,
+               CHFS_NODE_HDR_SIZE - 4));
+       fdirent->vno = htole64(ino);
+       fdirent->pvno = htole64(pdir->ino);
+       fdirent->version = htole64(++pdir->chvc->highest_version);
+       fdirent->mctime = ip ? htole32(ip->ctime) : 0;
+       fdirent->nsize = fd->nsize;
+       fdirent->dtype = fd->type;
+       fdirent->name_crc = htole32(crc32(0, (uint8_t *)fd->name, fd->nsize));
+       fdirent->node_crc = htole32(crc32(0, (uint8_t *)fdirent,
+               sizeof(*fdirent) - 4));
+
+       vec[0].iov_base = fdirent;
+       vec[0].iov_len  = sizeof(*fdirent);
+       vec[1].iov_base = name;
+       vec[1].iov_len  = namelen;
+       
+retry:
+       if (prio == ALLOC_GC) {
+               /* the GC calls this function */
+               err = chfs_reserve_space_gc(chmp, CHFS_PAD(size));
+               if (err)
+                       goto out;
+       } else {
+               chfs_gc_trigger(chmp);
+               if (prio == ALLOC_NORMAL)
+                       err = chfs_reserve_space_normal(chmp,
+                           CHFS_PAD(size), ALLOC_NORMAL);
+               else
+                       err = chfs_reserve_space_normal(chmp,
+                           CHFS_PAD(size), ALLOC_DELETION);
+               if (err)
+                       goto out;
+       }
+
+       nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+       if (!nref) {
+               err = ENOMEM;
+               goto out;
+       }
+
+       mutex_enter(&chmp->chm_lock_sizes);
+
+       nref->nref_offset = chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+       chfs_change_size_free(chmp, chmp->chm_nextblock, -CHFS_PAD(size));
+
+       err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen);
+       if (err || retlen != CHFS_PAD(size)) {
+               chfs_err("error while writing out flash dirent node to the media\n");
+               chfs_err("err: %d | size: %zu | retlen : %zu\n",
+                   err, CHFS_PAD(size), retlen);
+               chfs_change_size_dirty(chmp,
+                   chmp->chm_nextblock, CHFS_PAD(size));
+               if (retries) {
+                       err = EIO;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               retries++;
+               mutex_exit(&chmp->chm_lock_sizes);
+               goto retry;
+       }
+
+
+       // Everything went well
+       chfs_change_size_used(chmp,
+           &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+       mutex_exit(&chmp->chm_lock_sizes);
+       KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+       fd->nref = nref;
+       if (prio != ALLOC_DELETION) {
+               chfs_add_node_to_list(chmp,
+                       pdir->chvc, nref, &pdir->chvc->dirents);
+       }
+out:
+       chfs_free_flash_dirent(fdirent);
+       return err;
+}
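+
+/*
+ * On-media layout written above, sketched for clarity (illustrative only):
+ * a fixed dirent header immediately followed by the name, zero-padded up
+ * to CHFS_PAD(size):
+ *
+ *     vec[0]: struct chfs_flash_dirent_node   sizeof(*fdirent) bytes
+ *     vec[1]: name + zero padding             CHFS_PAD(size) - sizeof(*fdirent)
+ */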
+
+/**
+ * chfs_write_flash_dnode - write out a data node to flash
+ * @chmp: chfs mount structure
+ * @vp: vnode where the data belongs to
+ * @bp: buffer contains data
+ */
+int
+chfs_write_flash_dnode(struct chfs_mount *chmp, struct vnode *vp,
+    struct buf *bp, struct chfs_full_dnode *fd)
+{
+       KASSERT(mutex_owned(&chmp->chm_lock_mountfields));
+
+       int err = 0, retries = 0;
+       size_t size, retlen;
+       off_t ofs;
+       struct chfs_flash_data_node *dnode;
+       struct chfs_node_ref *nref;
+       struct chfs_inode *ip = VTOI(vp);
+       struct iovec vec[2];
+       uint32_t len;
+       void *tmpbuf = NULL;
+
+       KASSERT(ip->ino != CHFS_ROOTINO);
+
+       dnode = chfs_alloc_flash_dnode();
+       if (!dnode)
+               return ENOMEM;
+
+       /* initialize flash data node */
+       ofs = bp->b_blkno * PAGE_SIZE;
+       //dbg("vp->v_size: %ju, bp->b_blkno: %ju, bp-b_data: %p,"
+       //    " bp->b_resid: %ju\n",
+       //    (uintmax_t )vp->v_size, (uintmax_t )bp->b_blkno,
+       //    bp->b_data, (uintmax_t )bp->b_resid);
+       //dbg("[XXX]vp->v_size - ofs: %llu\n", (vp->v_size - ofs));
+       len = MIN((vp->v_size - ofs), bp->b_resid);
+       size = sizeof(*dnode) + len;
+
+       dnode->magic = htole16(CHFS_FS_MAGIC_BITMASK);
+       dnode->type = htole16(CHFS_NODETYPE_DATA);
+       dnode->length = htole32(CHFS_PAD(size));
+       dnode->hdr_crc = htole32(crc32(0, (uint8_t *)dnode,
+               CHFS_NODE_HDR_SIZE - 4));
+       dnode->vno = htole64(ip->ino);
+       dnode->version = htole64(++ip->chvc->highest_version);
+       dnode->offset = htole64(ofs);
+       dnode->data_length = htole32(len);
+       dnode->data_crc = htole32(crc32(0, (uint8_t *)bp->b_data, len));
+       dnode->node_crc = htole32(crc32(0, (uint8_t *)dnode,
+               sizeof(*dnode) - 4));
+
+       dbg("dnode @%llu %ub v%llu\n", (unsigned long long)dnode->offset,
+               dnode->data_length, (unsigned long long)dnode->version);
+
+       if (CHFS_PAD(size) - sizeof(*dnode)) {
+               tmpbuf = kmem_zalloc(CHFS_PAD(size)
+                   - sizeof(*dnode), KM_SLEEP);
+               memcpy(tmpbuf, bp->b_data, len);
+       }
+
+       /* creating iovecs for wbuf */
+       vec[0].iov_base = dnode;
+       vec[0].iov_len = sizeof(*dnode);
+       vec[1].iov_base = tmpbuf;
+       vec[1].iov_len = CHFS_PAD(size) - sizeof(*dnode);
+
+       fd->frags = 0;
+       fd->ofs = ofs;
+       fd->size = len;
+
+retry:
+
+       /* Reserve space for the data node. This will set up the next
+        * eraseblock where we will write.
+        */
+
+       chfs_gc_trigger(chmp);
+       err = chfs_reserve_space_normal(chmp,
+           CHFS_PAD(size), ALLOC_NORMAL);
+       if (err)
+               goto out;
+
+       nref = chfs_alloc_node_ref(chmp->chm_nextblock);
+       if (!nref) {
+               err = ENOMEM;
+               goto out;
+       }
+
+       nref->nref_offset =
+           chmp->chm_ebh->eb_size - chmp->chm_nextblock->free_size;
+
+       KASSERT(nref->nref_offset < chmp->chm_ebh->eb_size);
+       
+       mutex_enter(&chmp->chm_lock_sizes);
+
+       chfs_change_size_free(chmp,
+           chmp->chm_nextblock, -CHFS_PAD(size));
+
+       //dbg("vno: %llu nref lnr: %u offset: %u\n",
+       //    dnode->vno, nref->nref_lnr, nref->nref_offset);
+
+       err = chfs_write_wbuf(chmp, vec, 2, nref->nref_offset, &retlen);
+       if (err || retlen != CHFS_PAD(size)) {
+               chfs_err("error while writing out flash data node to the media\n");
+               chfs_err("err: %d | size: %zu | retlen : %zu\n",
+                   err, size, retlen);
+               chfs_change_size_dirty(chmp,
+                   chmp->chm_nextblock, CHFS_PAD(size));
+               if (retries) {
+                       err = EIO;
+                       mutex_exit(&chmp->chm_lock_sizes);
+                       goto out;
+               }
+
+               retries++;
+               mutex_exit(&chmp->chm_lock_sizes);
+               goto retry;
+       }
+       /* Everything went well */
+       ip->write_size += fd->size;
+       chfs_change_size_used(chmp,
+           &chmp->chm_blocks[nref->nref_lnr], CHFS_PAD(size));
+       mutex_exit(&chmp->chm_lock_sizes);
+
+       KASSERT(chmp->chm_blocks[nref->nref_lnr].used_size <= chmp->chm_ebh->eb_size);
+       fd->nref = nref;
+       chfs_add_node_to_list(chmp, ip->chvc, nref, &ip->chvc->dnode);
+out:
+       chfs_free_flash_dnode(dnode);
+       if (CHFS_PAD(size) - sizeof(*dnode)) {
+               kmem_free(tmpbuf, CHFS_PAD(size) - sizeof(*dnode));
+       }
+
+       return err;
+}
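+
+/*
+ * Worked example of the length computation above (illustrative only,
+ * assuming a 4096-byte PAGE_SIZE): a buffer at b_blkno 2 with b_resid 4096
+ * on a 9000-byte file gives
+ *
+ *     ofs = 2 * 4096 = 8192
+ *     len = MIN(9000 - 8192, 4096) = 808
+ *
+ * so the tail block is trimmed to the vnode size instead of a full page.
+ */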
+
+/**
+ * chfs_do_link - create a new link to a node
+ * @ip: node to link to
+ * @parent: parent directory of the new link
+ * @name: name of the new link
+ * @namelen: length of name
+ * @type: type of the node
+ * This function writes the dirent of the new link to the media.
+ */
+int
+chfs_do_link(struct chfs_inode *ip, struct chfs_inode *parent,
+    const char *name, int namelen, enum vtype type)
+{
+       int error = 0;
+       struct vnode *vp = ITOV(ip);
+       struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+       struct chfs_mount *chmp = ump->um_chfs;
+       struct chfs_dirent *newfd = NULL;
+//     struct chfs_dirent *fd = NULL;
+
+       //dbg("link vno: %llu\n", ip->ino);
+
+       newfd = chfs_alloc_dirent(namelen + 1);
+
+       newfd->vno = ip->ino;
+       newfd->type = type;
+       newfd->nsize = namelen;
+       memcpy(newfd->name, name, namelen);
+       newfd->name[newfd->nsize] = 0;
+//     newfd->next = NULL;
+
+       ip->chvc->nlink++;
+       parent->chvc->nlink++;
+       ip->iflag |= IN_CHANGE;
+       chfs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+
+       error = chfs_write_flash_vnode(chmp, ip, ALLOC_NORMAL);
+       if (error) {
+               mutex_exit(&chmp->chm_lock_mountfields);
+               return error;
+       }
+
+       error = chfs_write_flash_dirent(chmp,
+           parent, ip, newfd, ip->ino, ALLOC_NORMAL);
+       /* TODO: what should we do if error isn't zero? */
+
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       /* add fd to the fd list */
+       TAILQ_INSERT_TAIL(&parent->dents, newfd, fds);
+#if 0
+       fd = parent->dents;
+       if (!fd) {
+               parent->dents = newfd;
+       } else {
+               while (fd->next)
+                       fd = fd->next;
+               fd->next = newfd;
+       }
+#endif
+
+       return error;
+}
+
+
+/**
+ * chfs_do_unlink - delete a node
+ * @ip: node to delete
+ * @parent: parent of the node
+ * @name: name of the node
+ * @namelen: length of name
+ * This function sets the node's nlink to zero and writes a deletion
+ * dirent (vno 0) to the media.
+ */
+int
+chfs_do_unlink(struct chfs_inode *ip,
+    struct chfs_inode *parent, const char *name, int namelen)
+{
+       struct chfs_dirent *fd, *tmpfd;
+       int error = 0;
+       struct vnode *vp = ITOV(ip);
+       struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+       struct chfs_mount *chmp = ump->um_chfs;
+       struct chfs_node_ref *nref;
+
+       //dbg("unlink vno: %llu\n", ip->ino);
+
+       vflushbuf(vp, 0);
+
+       mutex_enter(&chmp->chm_lock_mountfields);
+
+       /* remove the full direntry from the parent dents list */
+       TAILQ_FOREACH_SAFE(fd, &parent->dents, fds, tmpfd) {
+               if (fd->vno == ip->ino &&
+                   fd->nsize == namelen &&
+                   !memcmp(fd->name, name, fd->nsize)) {
+                       if (fd->type == VDIR && ip->chvc->nlink == 2)
+                               ip->chvc->nlink = 0;
+                       else
+                               ip->chvc->nlink--;
+
+                       fd->type = VNON;
+
+                       TAILQ_REMOVE(&parent->dents, fd, fds);
+
+                       /* remove nref from dirents list */
+                       nref = parent->chvc->dirents;
+                       if (nref == fd->nref) {
+                               nref->nref_next = fd->nref->nref_next;
+                       } else {
+                               while (nref->nref_next && nref->nref_next != fd->nref)
+                                       nref = nref->nref_next;
+                               if (nref->nref_next)
+                                       nref->nref_next = fd->nref->nref_next;
+                       }
+
+                       //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n",
+                       //    fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset);
+                       chfs_mark_node_obsolete(chmp, fd->nref);
+
+                       error = chfs_write_flash_dirent(chmp,
+                           parent, ip, fd, 0, ALLOC_DELETION);
+
+                       //dbg("FD->NREF vno: %llu, lnr: %u, ofs: %u\n",
+                       //    fd->vno, fd->nref->nref_lnr, fd->nref->nref_offset);
+                       chfs_mark_node_obsolete(chmp, fd->nref);
+
+                       nref = ip->chvc->dnode;
+                       while (nref != (struct chfs_node_ref *)ip->chvc) {
+                               //dbg("DATA NREF\n");
+                               chfs_mark_node_obsolete(chmp, nref);
+                               nref = nref->nref_next;
+                       }
+                       ip->chvc->dnode = (struct chfs_node_ref *)ip->chvc;
+
+                       nref = ip->chvc->v;
+                       while (nref != (struct chfs_node_ref *)ip->chvc) {
+                               //dbg("V NREF\n");
+                               chfs_mark_node_obsolete(chmp, nref);
+                               nref = nref->nref_next;
+                       }
+                       ip->chvc->v = ip->chvc->v->nref_next;
+
+                       parent->chvc->nlink--;
+                       //TODO: if error
+               }
+       }
+       mutex_exit(&chmp->chm_lock_mountfields);
+
+       return error;
+}
diff --git a/sys/ufs/chfs/debug.c b/sys/ufs/chfs/debug.c
new file mode 100644 (file)
index 0000000..0d1fa5b
--- /dev/null
@@ -0,0 +1,48 @@
+/*     $NetBSD: debug.c,v 1.1 2011/11/24 15:51:32 ahoka Exp $  */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * XipFFS -- Xip Flash File System
+ *
+ * Copyright (C) 2009  Ferenc Havasi <havasi@inf.u-szeged.hu>,
+ *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
+ *                     ...
+ *                     University of Szeged, Hungary
+ *
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
+
+#include "chfs.h"
+//#include </root/xipffs/netbsd.chfs/chfs.h>
+
diff --git a/sys/ufs/chfs/ebh.c b/sys/ufs/chfs/ebh.c
new file mode 100644 (file)
index 0000000..ff0d984
--- /dev/null
@@ -0,0 +1,2141 @@
+/*     $NetBSD: ebh.c,v 1.2 2011/11/25 11:15:24 ahoka Exp $    */
+
+/*-
+ * Copyright (c) 2010 Department of Software Engineering,
+ *                   University of Szeged, Hungary
+ * Copyright (C) 2009 Ferenc Havasi <havasi@inf.u-szeged.hu>
+ * Copyright (C) 2009 Zoltan Sogor <weth@inf.u-szeged.hu>
+ * Copyright (C) 2009 David Tengeri <dtengeri@inf.u-szeged.hu>
+ * Copyright (C) 2009 Tamas Toth <ttoth@inf.u-szeged.hu>
+ * Copyright (C) 2010 Adam Hoka <ahoka@NetBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by the Department of Software Engineering, University of Szeged, Hungary
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "ebh.h"
+
+/*****************************************************************************/
+/* Flash specific operations                                                */
+/*****************************************************************************/
+int nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr);
+int nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr);
+int nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset);
+int nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset);
+int nor_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,struct chfs_eb_hdr *ebhdr);
+int nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf);
+int nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf);
+int nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid);
+int nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr);
+int mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec);
+
+int ltree_entry_cmp(struct chfs_ltree_entry *le1, struct chfs_ltree_entry *le2);
+int peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2);
+int peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2);
+int add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,struct peb_queue *queue);
+struct chfs_peb * find_peb_in_use(struct chfs_ebh *ebh, int pebnr);
+int add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec);
+int add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec);
+void erase_callback(struct flash_erase_instruction *ei);
+int free_peb(struct chfs_ebh *ebh);
+int release_peb(struct chfs_ebh *ebh, int pebnr);
+void erase_thread(void *data);
+static void erase_thread_start(struct chfs_ebh *ebh);
+static void erase_thread_stop(struct chfs_ebh *ebh);
+int scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2);
+int nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status);
+int nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    int pebnr, struct chfs_eb_hdr *ebhdr);
+int nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,struct chfs_eb_hdr *ebhdr, int pebnr);
+int nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    int pebnr, struct chfs_eb_hdr *ebhdr);
+struct chfs_scan_info *chfs_scan(struct chfs_ebh *ebh);
+void scan_info_destroy(struct chfs_scan_info *si);
+int scan_media(struct chfs_ebh *ebh);
+int get_peb(struct chfs_ebh *ebh);
+/**
+ * nor_create_eb_hdr - creates an eraseblock header for NOR flash
+ * @ebhdr: ebhdr to set
+ * @lnr: LEB number
+ */
+int
+nor_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr)
+{
+       ebhdr->u.nor_hdr.lid = htole32(lnr);
+       return 0;
+}
+
+/**
+ * nand_create_eb_hdr - creates an eraseblock header for NAND flash
+ * @ebhdr: ebhdr to set
+ * @lnr: LEB number
+ */
+int
+nand_create_eb_hdr(struct chfs_eb_hdr *ebhdr, int lnr)
+{
+       ebhdr->u.nand_hdr.lid = htole32(lnr);
+       return 0;
+}
+
+/**
+ * nor_calc_data_offs - calculates data offset on NOR flash
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @offset: offset within the eraseblock
+ */
+int
+nor_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset)
+{
+       return pebnr * ebh->flash_if->erasesize + offset +
+           CHFS_EB_EC_HDR_SIZE + CHFS_EB_HDR_NOR_SIZE;
+}
+
+/**
+ * nand_calc_data_offs - calculates data offset on NAND flash
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @offset: offset within the eraseblock
+ */
+int
+nand_calc_data_offs(struct chfs_ebh *ebh, int pebnr, int offset)
+{
+       return pebnr * ebh->flash_if->erasesize + offset +
+           2 * ebh->flash_if->page_size;
+}
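+
+/*
+ * Worked example of the two offset formulas above (illustrative only,
+ * assuming a 128 KiB eraseblock, a 2 KiB NAND page and offset 0):
+ *
+ *     NOR:  pebnr * 131072 + CHFS_EB_EC_HDR_SIZE + CHFS_EB_HDR_NOR_SIZE
+ *     NAND: pebnr * 131072 + 2 * 2048
+ *
+ * i.e. NAND data starts after the two header pages, NOR data right after
+ * the two packed headers.
+ */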
+
+/**
+ * nor_read_eb_hdr - read eraseblock header from NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ebhdr: where to store the data
+ *
+ * Reads the eraseblock header from media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_read_eb_hdr(struct chfs_ebh *ebh,
+    int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+       int ret;
+       size_t retlen;
+       off_t ofs = pebnr * ebh->flash_if->erasesize;
+
+       KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+       ret = flash_read(ebh->flash_dev,
+           ofs, CHFS_EB_EC_HDR_SIZE,
+           &retlen, (unsigned char *) &ebhdr->ec_hdr);
+
+       if (ret || retlen != CHFS_EB_EC_HDR_SIZE)
+               return ret;
+
+       ofs += CHFS_EB_EC_HDR_SIZE;
+       ret = flash_read(ebh->flash_dev,
+           ofs, CHFS_EB_HDR_NOR_SIZE,
+           &retlen, (unsigned char *) &ebhdr->u.nor_hdr);
+
+       if (ret || retlen != CHFS_EB_HDR_NOR_SIZE)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * nand_read_eb_hdr - read eraseblock header from NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ebhdr: where to store the data
+ *
+ * Reads the eraseblock header from the media. It occupies the first two pages.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_read_eb_hdr(struct chfs_ebh *ebh, int pebnr,
+    struct chfs_eb_hdr *ebhdr)
+{
+       int ret;
+       size_t retlen;
+       off_t ofs;
+
+       KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+       /* Read erase counter header from the first page. */
+       ofs = pebnr * ebh->flash_if->erasesize;
+       ret = flash_read(ebh->flash_dev,
+           ofs, CHFS_EB_EC_HDR_SIZE, &retlen,
+           (unsigned char *) &ebhdr->ec_hdr);
+       if (ret || retlen != CHFS_EB_EC_HDR_SIZE)
+               return ret;
+
+       /* Read NAND eraseblock header from the second page */
+       ofs += ebh->flash_if->page_size;
+       ret = flash_read(ebh->flash_dev,
+           ofs, CHFS_EB_HDR_NAND_SIZE, &retlen,
+           (unsigned char *) &ebhdr->u.nand_hdr);
+       if (ret || retlen != CHFS_EB_HDR_NAND_SIZE)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * nor_write_eb_hdr - write eraseblock header to NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number to write to
+ * @ebhdr: header to write
+ *
+ * Writes the eraseblock header to media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_write_eb_hdr(struct chfs_ebh *ebh, int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+       int ret, crc;
+       size_t retlen;
+
+       off_t ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE;
+
+       ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid
+           | htole32(CHFS_LID_NOT_DIRTY_BIT);
+
+       crc = crc32(0, (uint8_t *)&ebhdr->u.nor_hdr + 4,
+           CHFS_EB_HDR_NOR_SIZE - 4);
+       ebhdr->u.nor_hdr.crc = htole32(crc);
+
+       KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+       ret = flash_write(ebh->flash_dev,
+           ofs, CHFS_EB_HDR_NOR_SIZE, &retlen,
+           (unsigned char *) &ebhdr->u.nor_hdr);
+
+       if (ret || retlen != CHFS_EB_HDR_NOR_SIZE)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * nand_write_eb_hdr - write eraseblock header to NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number to write to
+ * @ebhdr: header to write
+ *
+ * Writes the eraseblock header to media.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nand_write_eb_hdr(struct chfs_ebh *ebh, int pebnr,
+    struct chfs_eb_hdr *ebhdr)
+{
+       int ret, crc;
+       size_t retlen;
+       flash_off_t ofs;
+
+       KASSERT(pebnr >= 0 && pebnr < ebh->peb_nr);
+
+       ofs = pebnr * ebh->flash_if->erasesize +
+           ebh->flash_if->page_size;
+
+       ebhdr->u.nand_hdr.serial = htole64(++(*ebh->max_serial));
+
+       crc = crc32(0, (uint8_t *)&ebhdr->u.nand_hdr + 4,
+           CHFS_EB_HDR_NAND_SIZE - 4);
+       ebhdr->u.nand_hdr.crc = htole32(crc);
+
+       ret = flash_write(ebh->flash_dev, ofs,
+           CHFS_EB_HDR_NAND_SIZE, &retlen,
+           (unsigned char *) &ebhdr->u.nand_hdr);
+
+       if (ret || retlen != CHFS_EB_HDR_NAND_SIZE)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * nor_check_eb_hdr - check eraseblock header read from NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @buf: eraseblock header to check
+ *
+ * Returns eraseblock header status.
+ */
+int
+nor_check_eb_hdr(struct chfs_ebh *ebh, void *buf)
+{
+       uint32_t magic, crc, hdr_crc;
+       struct chfs_eb_hdr *ebhdr = buf;
+       le32 lid_save;
+
+       // check whether there is a header
+       if (check_pattern((void *) &ebhdr->ec_hdr,
+               0xFF, 0, CHFS_EB_EC_HDR_SIZE)) {
+               dbg_ebh("no header found\n");
+               return EBHDR_LEB_NO_HDR;
+       }
+
+       // check magic
+       magic = le32toh(ebhdr->ec_hdr.magic);
+       if (magic != CHFS_MAGIC_BITMASK) {
+               dbg_ebh("bad magic bitmask(exp: %x found %x)\n",
+                   CHFS_MAGIC_BITMASK, magic);
+               return EBHDR_LEB_BADMAGIC;
+       }
+
+       // check CRC_EC
+       hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec);
+       crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+       if (hdr_crc != crc) {
+               dbg_ebh("bad crc_ec found\n");
+               return EBHDR_LEB_BADCRC;
+       }
+
+       /* check if the PEB is free: magic, crc_ec and erase_cnt are good
+        * and everything else is 0xFF
+        */
+       if (check_pattern((void *) &ebhdr->u.nor_hdr, 0xFF, 0,
+               CHFS_EB_HDR_NOR_SIZE)) {
+               dbg_ebh("free peb found\n");
+               return EBHDR_LEB_FREE;
+       }
+
+       // check invalidated (CRC == LID == 0)
+       if (ebhdr->u.nor_hdr.crc == 0 && ebhdr->u.nor_hdr.lid == 0) {
+               dbg_ebh("invalidated ebhdr found\n");
+               return EBHDR_LEB_INVALIDATED;
+       }
+
+       // check CRC
+       hdr_crc = le32toh(ebhdr->u.nor_hdr.crc);
+       lid_save = ebhdr->u.nor_hdr.lid;
+
+       // mark lid as not dirty for crc calc
+       ebhdr->u.nor_hdr.lid = ebhdr->u.nor_hdr.lid | htole32(
+               CHFS_LID_NOT_DIRTY_BIT);
+       crc = crc32(0, (uint8_t *) &ebhdr->u.nor_hdr + 4,
+           CHFS_EB_HDR_NOR_SIZE - 4);
+       // restore the original lid value in ebh
+       ebhdr->u.nor_hdr.lid = lid_save;
+
+       if (crc != hdr_crc) {
+               dbg_ebh("bad crc found\n");
+               return EBHDR_LEB_BADCRC;
+       }
+
+       // check dirty
+       if (!(le32toh(lid_save) & CHFS_LID_NOT_DIRTY_BIT)) {
+               dbg_ebh("dirty ebhdr found\n");
+               return EBHDR_LEB_DIRTY;
+       }
+
+       return EBHDR_LEB_OK;
+}
+
+/**
+ * nand_check_eb_hdr - check eraseblock header read from NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @buf: eraseblock header to check
+ *
+ * Returns eraseblock header status.
+ */
+int
+nand_check_eb_hdr(struct chfs_ebh *ebh, void *buf)
+{
+       uint32_t magic, crc, hdr_crc;
+       struct chfs_eb_hdr *ebhdr = buf;
+
+       // check whether there is a header
+       if (check_pattern((void *) &ebhdr->ec_hdr,
+               0xFF, 0, CHFS_EB_EC_HDR_SIZE)) {
+               dbg_ebh("no header found\n");
+               return EBHDR_LEB_NO_HDR;
+       }
+
+       // check magic
+       magic = le32toh(ebhdr->ec_hdr.magic);
+       if (magic != CHFS_MAGIC_BITMASK) {
+               dbg_ebh("bad magic bitmask(exp: %x found %x)\n",
+                   CHFS_MAGIC_BITMASK, magic);
+               return EBHDR_LEB_BADMAGIC;
+       }
+
+       // check CRC_EC
+       hdr_crc = le32toh(ebhdr->ec_hdr.crc_ec);
+       crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+       if (hdr_crc != crc) {
+               dbg_ebh("bad crc_ec found\n");
+               return EBHDR_LEB_BADCRC;
+       }
+
+       /* check if the PEB is free: magic, crc_ec and erase_cnt are good
+        * and everything else is 0xFF
+        */
+       if (check_pattern((void *) &ebhdr->u.nand_hdr, 0xFF, 0,
+               CHFS_EB_HDR_NAND_SIZE)) {
+               dbg_ebh("free peb found\n");
+               return EBHDR_LEB_FREE;
+       }
+
+       // check CRC
+       hdr_crc = le32toh(ebhdr->u.nand_hdr.crc);
+
+       crc = crc32(0, (uint8_t *) &ebhdr->u.nand_hdr + 4,
+           CHFS_EB_HDR_NAND_SIZE - 4);
+
+       if (crc != hdr_crc) {
+               dbg_ebh("bad crc found\n");
+               return EBHDR_LEB_BADCRC;
+       }
+
+       return EBHDR_LEB_OK;
+}
+
+/**
+ * nor_mark_eb_hdr_dirty_flash - mark eraseblock header dirty on NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @lid: leb id (it's bit number 31 will be set to 0)
+ *
+ * It clears the CHFS_LID_NOT_DIRTY_BIT on the flash.
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_mark_eb_hdr_dirty_flash(struct chfs_ebh *ebh, int pebnr, int lid)
+{
+       int ret;
+       size_t retlen;
+       off_t ofs;
+
+       /* mark leb id dirty */
+       lid = htole32(lid & CHFS_LID_DIRTY_BIT_MASK);
+
+       /* calculate position */
+       ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE
+           + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr , lid);
+
+       ret = flash_write(ebh->flash_dev, ofs, sizeof(lid), &retlen,
+           (unsigned char *) &lid);
+       if (ret || retlen != sizeof(lid)) {
+               chfs_err("can't mark peb dirty");
+               return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * nor_invalidate_eb_hdr - invalidate eraseblock header on NOR flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ *
+ * Sets the crc and lid fields to zero.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+nor_invalidate_eb_hdr(struct chfs_ebh *ebh, int pebnr)
+{
+       int ret;
+       size_t retlen;
+       off_t ofs;
+       char zero_buf[CHFS_INVALIDATE_SIZE];
+
+       /* fill with zero */
+       memset(zero_buf, 0x0, CHFS_INVALIDATE_SIZE);
+
+       /* calculate position (!!! lid is directly behind crc !!!) */
+       ofs = pebnr * ebh->flash_if->erasesize + CHFS_EB_EC_HDR_SIZE
+           + CHFS_GET_MEMBER_POS(struct chfs_nor_eb_hdr, crc);
+
+       ret = flash_write(ebh->flash_dev,
+           ofs, CHFS_INVALIDATE_SIZE, &retlen,
+           (unsigned char *) &zero_buf);
+       if (ret || retlen != CHFS_INVALIDATE_SIZE) {
+               chfs_err("can't invalidate peb");
+               return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * mark_eb_hdr_free - free eraseblock header on NOR or NAND flash
+ *
+ * @ebh: chfs eraseblock handler
+ * @pebnr: eraseblock number
+ * @ec: erase counter of PEB
+ *
+ * Write out the magic and erase counter to the physical eraseblock.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+mark_eb_hdr_free(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+       int ret, crc;
+       size_t retlen;
+       off_t ofs;
+       struct chfs_eb_hdr *ebhdr;
+       ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+       ebhdr->ec_hdr.magic = htole32(CHFS_MAGIC_BITMASK);
+       ebhdr->ec_hdr.erase_cnt = htole32(ec);
+       crc = crc32(0, (uint8_t *) &ebhdr->ec_hdr + 8, 4);
+       ebhdr->ec_hdr.crc_ec = htole32(crc);
+
+       ofs = pebnr * ebh->flash_if->erasesize;
+
+       KASSERT(sizeof(ebhdr->ec_hdr) == CHFS_EB_EC_HDR_SIZE);
+
+       ret = flash_write(ebh->flash_dev,
+           ofs, CHFS_EB_EC_HDR_SIZE, &retlen,
+           (unsigned char *) &ebhdr->ec_hdr);
+
+       if (ret || retlen != CHFS_EB_EC_HDR_SIZE) {
+               chfs_err("can't mark peb as free: %d\n", pebnr);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return ret;
+       }
+
+       kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+       return 0;
+}
+
+/*****************************************************************************/
+/* End of Flash specific operations                                         */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Lock Tree                                                                */
+/*****************************************************************************/
+
+int
+ltree_entry_cmp(struct chfs_ltree_entry *le1,
+    struct chfs_ltree_entry *le2)
+{
+       return (le1->lnr - le2->lnr);
+}
+
+/* Generate functions for Lock tree's red-black tree */
+RB_PROTOTYPE(ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp);
+RB_GENERATE(ltree_rbtree, chfs_ltree_entry, rb, ltree_entry_cmp);
+
+
+/**
+ * ltree_lookup - looks up a logical eraseblock in the lock tree
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function returns a pointer to the wanted &struct chfs_ltree_entry
+ * if the logical eraseblock is in the lock tree (i.e. it is locked), and
+ * NULL otherwise.
+ * @ebh->ltree_lock has to be locked!
+ */
+static struct chfs_ltree_entry *
+ltree_lookup(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry le, *result;
+       le.lnr = lnr;
+       result = RB_FIND(ltree_rbtree, &ebh->ltree, &le);
+       return result;
+}
+
+/**
+ * ltree_add_entry - add an entry to the lock tree
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function adds a new logical eraseblock entry identified with @lnr to the
+ * lock tree. If the entry is already in the tree, it increases the user
+ * counter.
+ * Returns NULL if it cannot allocate memory for the lock tree entry, or
+ * a pointer to the inserted entry otherwise.
+ */
+static struct chfs_ltree_entry *
+ltree_add_entry(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry *le, *result;
+
+       le = kmem_alloc(sizeof(struct chfs_ltree_entry), KM_SLEEP);
+
+       le->lnr = lnr;
+       le->users = 1;
+       rw_init(&le->mutex);
+
+       //dbg_ebh("enter ltree lock\n");
+       mutex_enter(&ebh->ltree_lock);
+       //dbg_ebh("insert\n");
+       result = RB_INSERT(ltree_rbtree, &ebh->ltree, le);
+       //dbg_ebh("inserted\n");
+       if (result) {
+               //The entry is already in the tree
+               result->users++;
+               kmem_free(le, sizeof(struct chfs_ltree_entry));
+       }
+       else {
+               result = le;
+       }
+       mutex_exit(&ebh->ltree_lock);
+
+       return result;
+}
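+
+/*
+ * Usage sketch for the lock tree (illustrative only): LEB I/O brackets
+ * itself with one of the lock/unlock pairs defined below, and entries are
+ * reference counted, so concurrent users share one tree node.
+ *
+ *     if ((err = leb_read_lock(ebh, lnr)) != 0)
+ *             return err;
+ *     ... read from the LEB ...
+ *     leb_read_unlock(ebh, lnr);
+ */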
+
+/**
+ * leb_read_lock - lock a logical eraseblock for read
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+static int
+leb_read_lock(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry *le;
+
+       le = ltree_add_entry(ebh, lnr);
+       if (!le)
+               return ENOMEM;
+
+       rw_enter(&le->mutex, RW_READER);
+       return 0;
+}
+
+/**
+ * leb_read_unlock - unlock a logical eraseblock from read
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function unlocks a logical eraseblock from read and deletes it from
+ * the lock tree if there are no more users of it.
+ */
+static void
+leb_read_unlock(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry *le;
+
+       mutex_enter(&ebh->ltree_lock);
+       //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_read_unlock()\n");
+       le = ltree_lookup(ebh, lnr);
+       if (!le)
+               goto out;
+
+       le->users -= 1;
+       KASSERT(le->users >= 0);
+       rw_exit(&le->mutex);
+       if (le->users == 0) {
+               le = RB_REMOVE(ltree_rbtree, &ebh->ltree, le);
+               if (le) {
+                       KASSERT(!rw_lock_held(&le->mutex));
+                       rw_destroy(&le->mutex);
+
+                       kmem_free(le, sizeof(struct chfs_ltree_entry));
+               }
+       }
+
+out:
+       mutex_exit(&ebh->ltree_lock);
+       //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_read_unlock()\n");
+}
+
+/**
+ * leb_write_lock - lock a logical eraseblock for write
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * Returns zero in case of success, error code in case of fail.
+ */
+static int
+leb_write_lock(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry *le;
+
+       le = ltree_add_entry(ebh, lnr);
+       if (!le)
+               return ENOMEM;
+
+       rw_enter(&le->mutex, RW_WRITER);
+       return 0;
+}
+
+/**
+ * leb_write_unlock - unlock a logical eraseblock from write
+ * @ebh: chfs eraseblock handler
+ * @lnr: identifier of the logical eraseblock
+ *
+ * This function unlocks a logical eraseblock from write and deletes it from
+ * the lock tree if there are no more users of it.
+ */
+static void
+leb_write_unlock(struct chfs_ebh *ebh, int lnr)
+{
+       struct chfs_ltree_entry *le;
+
+       mutex_enter(&ebh->ltree_lock);
+       //dbg_ebh("LOCK: ebh->ltree_lock spin locked in leb_write_unlock()\n");
+       le = ltree_lookup(ebh, lnr);
+       if (!le)
+               goto out;
+
+       le->users -= 1;
+       KASSERT(le->users >= 0);
+       rw_exit(&le->mutex);
+       if (le->users == 0) {
+               RB_REMOVE(ltree_rbtree, &ebh->ltree, le);
+
+               KASSERT(!rw_lock_held(&le->mutex));
+               rw_destroy(&le->mutex);
+
+               kmem_free(le, sizeof(struct chfs_ltree_entry));
+       }
+
+out:
+       mutex_exit(&ebh->ltree_lock);
+       //dbg_ebh("UNLOCK: ebh->ltree_lock spin unlocked in leb_write_unlock()\n");
+}
+
+/*****************************************************************************/
+/* End of Lock Tree                                                         */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Erase related operations                                                 */
+/*****************************************************************************/
+
+/**
+ * If the first argument is smaller than the second, the function
+ * returns a value smaller than zero. If they are equal, the function
+ * returns zero. Otherwise, it returns a value greater than zero.
+ */
+int
+peb_in_use_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2)
+{
+       return (peb1->pebnr - peb2->pebnr);
+}
+
+int
+peb_free_cmp(struct chfs_peb *peb1, struct chfs_peb *peb2)
+{
+       int comp;
+
+       comp = peb1->erase_cnt - peb2->erase_cnt;
+       if (0 == comp)
+               comp = peb1->pebnr - peb2->pebnr;
+
+       return comp;
+}
+
+/* Generate functions for in use PEB's red-black tree */
+RB_PROTOTYPE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp);
+RB_GENERATE(peb_in_use_rbtree, chfs_peb, u.rb, peb_in_use_cmp);
+RB_PROTOTYPE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp);
+RB_GENERATE(peb_free_rbtree, chfs_peb, u.rb, peb_free_cmp);
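+
+/*
+ * Ordering example for peb_free_cmp() above (illustrative only): the free
+ * tree sorts primarily by erase counter, pebnr breaking ties, so the
+ * least-worn block comes first:
+ *
+ *     {pebnr 7, ec 2}  <  {pebnr 3, ec 5}  <  {pebnr 4, ec 5}
+ *
+ * which is how allocation from the free tree favours less-worn blocks.
+ */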
+
+/**
+ * add_peb_to_erase_queue: adds a PEB to to_erase/fully_erased queue
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ * @queue: the queue to add to
+ *
+ * This function adds a PEB to the erase queue specified by @queue.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_erase_queue(struct chfs_ebh *ebh, int pebnr, int ec,
+    struct peb_queue *queue)
+{
+       struct chfs_peb *peb;
+
+       peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+       peb->erase_cnt = ec;
+       peb->pebnr = pebnr;
+
+       TAILQ_INSERT_TAIL(queue, peb, u.queue);
+
+       return 0;
+}
+//TODO
+/**
+ * find_peb_in_use - looks up a PEB in the RB-tree of used blocks
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ *
+ * This function returns a pointer to the PEB found in the tree,
+ * NULL otherwise.
+ * The @ebh->erase_lock must be locked before using this.
+ */
+struct chfs_peb *
+find_peb_in_use(struct chfs_ebh *ebh, int pebnr)
+{
+       struct chfs_peb peb, *result;
+       peb.pebnr = pebnr;
+       result = RB_FIND(peb_in_use_rbtree, &ebh->in_use, &peb);
+       return result;
+}
+
+/**
+ * add_peb_to_free - adds a PEB to the RB-tree of free PEBs
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ *
+ *
+ * This function adds a physical eraseblock to the RB-tree of free PEBs
+ * stored in the @ebh. The key is the erase counter and pebnr.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_free(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+       struct chfs_peb *peb, *result;
+
+       peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+       peb->erase_cnt = ec;
+       peb->pebnr = pebnr;
+       result = RB_INSERT(peb_free_rbtree, &ebh->free, peb);
+       if (result)
+               return 1;
+
+       return 0;
+}
+
+/**
+ * add_peb_to_in_use - adds a PEB to the RB-tree of used PEBs
+ * @ebh - chfs eraseblock handler
+ * @pebnr - physical eraseblock's number
+ * @ec - erase counter of PEB
+ *
+ *
+ * This function adds a physical eraseblock to the RB-tree of used PEBs
+ * stored in the @ebh. The key is pebnr.
+ * The @ebh->erase_lock must be locked before using this.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+add_peb_to_in_use(struct chfs_ebh *ebh, int pebnr, int ec)
+{
+       struct chfs_peb *peb, *result;
+
+       peb = kmem_alloc(sizeof(struct chfs_peb), KM_SLEEP);
+
+       peb->erase_cnt = ec;
+       peb->pebnr = pebnr;
+       result = RB_INSERT(peb_in_use_rbtree, &ebh->in_use, peb);
+       if (result)
+               return 1;
+
+       return 0;
+}
+
+/**
+ * erase_callback - callback function for flash erase
+ * @ei: erase information
+ */
+void
+erase_callback(struct flash_erase_instruction *ei)
+{
+       int err;
+       struct chfs_erase_info_priv *priv = (void *) ei->ei_priv;
+       //dbg_ebh("ERASE_CALLBACK() CALLED\n");
+       struct chfs_ebh *ebh = priv->ebh;
+       struct chfs_peb *peb = priv->peb;
+
+       peb->erase_cnt += 1;
+
+       if (ei->ei_state == FLASH_ERASE_DONE) {
+
+               /* Write out erase counter */
+               err = ebh->ops->mark_eb_hdr_free(ebh,
+                   peb->pebnr, peb->erase_cnt);
+               if (err) {
+                       /* cannot mark PEB as free, so erase it again */
+                       chfs_err(
+                               "cannot mark eraseblock as free, PEB: %d\n",
+                               peb->pebnr);
+                       mutex_enter(&ebh->erase_lock);
+                       /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback() "
+                         "after mark ebhdr free\n");*/
+                       add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt,
+                           &ebh->to_erase);
+                       mutex_exit(&ebh->erase_lock);
+                       /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback() "
+                         "after mark ebhdr free\n");*/
+                       kmem_free(peb, sizeof(struct chfs_peb));
+                       return;
+               }
+
+               mutex_enter(&ebh->erase_lock);
+               /*dbg_ebh("LOCK: ebh->erase_lock spin locked in erase_callback()\n");*/
+               err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt);
+               mutex_exit(&ebh->erase_lock);
+               /*dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in erase_callback()\n");*/
+               kmem_free(peb, sizeof(struct chfs_peb));
+       } else {
+               /*
+                * Erase is finished, but there was a problem,
+                * so erase PEB again
+                */
+               chfs_err("erase failed, state is: 0x%x\n", ei->ei_state);
+               add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt, &ebh->to_erase);
+               kmem_free(peb, sizeof(struct chfs_peb));
+       }
+}
+
+/**
+ * free_peb: free a PEB
+ * @ebh: chfs eraseblock handler
+ *
+ * This function erases the first physical eraseblock from one of the erase
+ * lists and adds it to the RB-tree of free PEBs.
+ * Returns zero in case of success, error code in case of fail.
+ */
+int
+free_peb(struct chfs_ebh *ebh)
+{
+       int err, retries = 0;
+       off_t ofs;
+       struct chfs_peb *peb = NULL;
+       struct flash_erase_instruction *ei;
+
+       KASSERT(mutex_owned(&ebh->erase_lock));
+
+       if (!TAILQ_EMPTY(&ebh->fully_erased)) {
+               //dbg_ebh("[FREE PEB] got a fully erased block\n");
+               peb = TAILQ_FIRST(&ebh->fully_erased);
+               TAILQ_REMOVE(&ebh->fully_erased, peb, u.queue);
+               err = ebh->ops->mark_eb_hdr_free(ebh,
+                   peb->pebnr, peb->erase_cnt);
+               if (err) {
+                       goto out_free;
+               }
+               err = add_peb_to_free(ebh, peb->pebnr, peb->erase_cnt);
+               goto out_free;
+       }
+       /* Erase PEB */
+	//dbg_ebh("[FREE PEB] erasing a block\n");
+       peb = TAILQ_FIRST(&ebh->to_erase);
+       TAILQ_REMOVE(&ebh->to_erase, peb, u.queue);
+       mutex_exit(&ebh->erase_lock);
+       //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in free_peb()\n");
+       ofs = peb->pebnr * ebh->flash_if->erasesize;
+
+       /* XXX where do we free this? */
+       ei = kmem_alloc(sizeof(struct flash_erase_instruction)
+           + sizeof(struct chfs_erase_info_priv), KM_SLEEP);
+retry:
+       memset(ei, 0, sizeof(*ei));
+
+//     ei->ei_if = ebh->flash_if;
+       ei->ei_addr = ofs;
+       ei->ei_len = ebh->flash_if->erasesize;
+       ei->ei_callback = erase_callback;
+       ei->ei_priv = (unsigned long) (&ei[1]);
+
+       ((struct chfs_erase_info_priv *) ei->ei_priv)->ebh = ebh;
+       ((struct chfs_erase_info_priv *) ei->ei_priv)->peb = peb;
+
+       err = flash_erase(ebh->flash_dev, ei);
+       dbg_ebh("erased peb: %d\n", peb->pebnr);
+
+       /* einval would mean we did something wrong */
+       KASSERT(err != EINVAL);
+
+       if (err) {
+               dbg_ebh("errno: %d, ei->ei_state: %d\n", err, ei->ei_state);
+               if (CHFS_MAX_GET_PEB_RETRIES < ++retries &&
+                   ei->ei_state == FLASH_ERASE_FAILED) {
+			/* The block went bad, mark it */
+                       dbg_ebh("ebh markbad! 0x%jx\n", (uintmax_t )ofs);
+                       err = flash_block_markbad(ebh->flash_dev, ofs);
+                       if (!err) {
+                               ebh->peb_nr--;
+                       }
+
+                       goto out;
+               }
+		chfs_err("cannot erase PEB: %d, try again\n", peb->pebnr);
+               goto retry;
+       }
+
+out:
+       /* lock the erase_lock, because it was locked
+        * when the function was called */
+       mutex_enter(&ebh->erase_lock);
+       return err;
+
+out_free:
+       kmem_free(peb, sizeof(struct chfs_peb));
+       return err;
+}
+
+/**
+ * release_peb - schedule an erase for the PEB
+ * @ebh: chfs eraseblock handler
+ * @pebnr: physical eraseblock number
+ *
+ * This function gets the PEB identified by @pebnr from the in_use RB-tree of
+ * @ebh, removes it and schedules an erase for it.
+ *
+ * Returns zero on success, error code in case of failure.
+ */
+int
+release_peb(struct chfs_ebh *ebh, int pebnr)
+{
+       int err = 0;
+       struct chfs_peb *peb;
+
+       mutex_enter(&ebh->erase_lock);
+
+       //dbg_ebh("LOCK: ebh->erase_lock spin locked in release_peb()\n");
+       peb = find_peb_in_use(ebh, pebnr);
+       if (!peb) {
+               chfs_err("LEB is mapped, but is not in the 'in_use' "
+                   "tree of ebh\n");
+               goto out_unlock;
+       }
+       err = add_peb_to_erase_queue(ebh, peb->pebnr, peb->erase_cnt,
+           &ebh->to_erase);
+
+       if (err)
+               goto out_unlock;
+
+       RB_REMOVE(peb_in_use_rbtree, &ebh->in_use, peb);
+out_unlock:
+       mutex_exit(&ebh->erase_lock);
+       //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in release_peb()"
+       //              " at out_unlock\n");
+       return err;
+}
+
+/**
+ * erase_thread - background thread for erasing PEBs
+ * @data: pointer to the eraseblock handler
+ */
+void
+erase_thread(void *data)
+{
+	struct chfs_ebh *ebh = data;
+	int err;
+
+	dbg_ebh("[EBH THREAD] erase thread started\n");
+
+	mutex_enter(&ebh->erase_lock);
+       while (ebh->bg_erase.eth_running) {
+               if (TAILQ_EMPTY(&ebh->to_erase) &&
+                   TAILQ_EMPTY(&ebh->fully_erased)) {
+                       cv_timedwait_sig(&ebh->bg_erase.eth_wakeup,
+                           &ebh->erase_lock, mstohz(100));
+               } else {
+                       /* XXX exiting this mutex is a bit odd here as
+                        * free_peb instantly reenters it...
+                        */
+                       err = free_peb(ebh);
+                       mutex_exit(&ebh->erase_lock);
+                       if (err) {
+                               chfs_err("freeing PEB failed in the"
+                                   " background thread: %d\n", err);
+                       }
+                       mutex_enter(&ebh->erase_lock);
+               }
+       }
+       mutex_exit(&ebh->erase_lock);
+
+       dbg_ebh("[EBH THREAD] erase thread stopped\n");
+       kthread_exit(0);
+}
+
+/**
+ * erase_thread_start - init and start erase thread
+ * @ebh: eraseblock handler
+ */
+static void
+erase_thread_start(struct chfs_ebh *ebh)
+{
+       cv_init(&ebh->bg_erase.eth_wakeup, "ebheracv");
+
+       ebh->bg_erase.eth_running = true;
+       kthread_create(PRI_NONE, KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
+           erase_thread, ebh, &ebh->bg_erase.eth_thread, "ebherase");
+}
+
+/**
+ * erase_thread_stop - stop background erase thread
+ * @ebh: eraseblock handler
+ */
+static void
+erase_thread_stop(struct chfs_ebh *ebh)
+{
+       ebh->bg_erase.eth_running = false;
+       cv_signal(&ebh->bg_erase.eth_wakeup);
+       dbg_ebh("[EBH THREAD STOP] signaled\n");
+
+       kthread_join(ebh->bg_erase.eth_thread);
+#ifdef BROKEN_KTH_JOIN
+       kpause("chfsebhjointh", false, mstohz(1000), NULL);
+#endif
+
+       cv_destroy(&ebh->bg_erase.eth_wakeup);
+}
+
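+/*
+ * Illustrative sketch (hypothetical helper, not part of the handler):
+ * producers queue a PEB for erase and then poke the background thread,
+ * just as ebh_erase_leb() does further below; the cv_timedwait_sig()
+ * timeout in erase_thread() bounds how long a missed wakeup can linger.
+ */
+static void
+example_schedule_erase(struct chfs_ebh *ebh, int pebnr)
+{
+	if (release_peb(ebh, pebnr) == 0)
+		cv_signal(&ebh->bg_erase.eth_wakeup);
+}
+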
+/*****************************************************************************/
+/* End of Erase related operations                                          */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Scan related operations                                                  */
+/*****************************************************************************/
+int
+scan_leb_used_cmp(struct chfs_scan_leb *sleb1, struct chfs_scan_leb *sleb2)
+{
+       return (sleb1->lnr - sleb2->lnr);
+}
+
+RB_PROTOTYPE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp);
+RB_GENERATE(scan_leb_used_rbtree, chfs_scan_leb, u.rb, scan_leb_used_cmp);
+
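+/*
+ * Illustrative sketch (hypothetical values, never called): the comparator
+ * above orders purely by lnr, so inserting a second entry for an
+ * already-seen LEB leaves the tree untouched and RB_INSERT hands back the
+ * existing node; that is how the scan code below spots two PEBs claiming
+ * one logical eraseblock.
+ */
+static void
+example_scan_collision(struct chfs_scan_info *si,
+    struct chfs_scan_leb *first, struct chfs_scan_leb *second)
+{
+	struct chfs_scan_leb *old;
+
+	first->lnr = second->lnr = 7;	/* both claim LEB 7 */
+	RB_INSERT(scan_leb_used_rbtree, &si->used, first);
+	old = RB_INSERT(scan_leb_used_rbtree, &si->used, second);
+	KASSERT(old == first);	/* tree unchanged; caller picks the winner */
+}
+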
+/**
+ * scan_add_to_queue - adds a physical eraseblock to one of the
+ *                     eraseblock queues
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ * @erase_cnt: erase counter of the physical eraseblock
+ * @queue: the queue to add to
+ *
+ * This function adds a physical eraseblock to one of the lists in the scanning
+ * information.
+ * Returns zero in case of success, negative error code in case of failure.
+static int
+scan_add_to_queue(struct chfs_scan_info *si, int pebnr, int erase_cnt,
+    struct scan_leb_queue *queue)
+{
+       struct chfs_scan_leb *sleb;
+
+       sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+       sleb->pebnr = pebnr;
+       sleb->erase_cnt = erase_cnt;
+       TAILQ_INSERT_TAIL(queue, sleb, u.queue);
+       return 0;
+}
+
+/**
+ * nor_scan_add_to_used - add a physical eraseblock to the
+ *                        used tree of scan info
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @ebhdr: eraseblock header
+ * @pebnr: physical eraseblock number
+ * @leb_status: the status of the PEB's eraseblock header
+ *
+ * This function adds a PEB to the used tree of the scanning information.
+ * It handles the case where more than one physical eraseblock references
+ * the same logical eraseblock.
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+nor_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    struct chfs_eb_hdr *ebhdr, int pebnr, int leb_status)
+{
+       int err, lnr, ec;
+       struct chfs_scan_leb *sleb, *old;
+
+       lnr = CHFS_GET_LID(ebhdr->u.nor_hdr.lid);
+       ec = le32toh(ebhdr->ec_hdr.erase_cnt);
+
+       sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+       sleb->erase_cnt = ec;
+       sleb->lnr = lnr;
+       sleb->pebnr = pebnr;
+       sleb->info = leb_status;
+
+       old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb);
+       if (old) {
+               kmem_free(sleb, sizeof(struct chfs_scan_leb));
+		/*
+		 * A PEB for this LEB is already in the used tree: if the
+		 * new header is dirty while the old one is intact, erase
+		 * the new PEB, otherwise supersede the old entry.
+		 */
+               if (EBHDR_LEB_DIRTY == leb_status &&
+                   EBHDR_LEB_OK == old->info) {
+                       return scan_add_to_queue(si, pebnr, ec, &si->erase);
+               } else {
+                       err = scan_add_to_queue(si, old->pebnr,
+                           old->erase_cnt, &si->erase);
+                       if (err) {
+                               return err;
+                       }
+
+                       old->erase_cnt = ec;
+                       old->lnr = lnr;
+                       old->pebnr = pebnr;
+                       old->info = leb_status;
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+/**
+ * nor_process_eb - read the headers from NOR flash, check them and add to
+ *                  the scanning information
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ * @ebhdr: eraseblock header to read into
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+nor_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+       int err, erase_cnt, leb_status;
+
+       err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr);
+       if (err)
+               return err;
+
+       erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt);
+       dbg_ebh("erase_cnt: %d\n", erase_cnt);
+       leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr);
+       if (EBHDR_LEB_BADMAGIC == leb_status ||
+           EBHDR_LEB_BADCRC == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted);
+               return err;
+       }
+       else if (EBHDR_LEB_FREE == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free);
+               goto count_mean;
+       }
+       else if (EBHDR_LEB_NO_HDR == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased);
+               return err;
+       }
+       else if (EBHDR_LEB_INVALIDATED == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erase);
+               return err;
+       }
+
+       err = nor_scan_add_to_used(ebh, si, ebhdr, pebnr, leb_status);
+       if (err)
+               return err;
+
+count_mean:
+       si->sum_of_ec += erase_cnt;
+       si->num_of_eb++;
+
+       return err;
+}
+
+/**
+ * nand_scan_add_to_used - add a physical eraseblock to the
+ *                         used tree of scan info
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @ebhdr: eraseblock header
+ * @pebnr: physical eraseblock number
+ *
+ * This function adds a PEB to the used tree of the scanning information.
+ * It handles the case where more than one physical eraseblock references
+ * the same logical eraseblock.
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+nand_scan_add_to_used(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    struct chfs_eb_hdr *ebhdr, int pebnr)
+{
+       int err, lnr, ec;
+       struct chfs_scan_leb *sleb, *old;
+       uint64_t serial = le64toh(ebhdr->u.nand_hdr.serial);
+
+	lnr = CHFS_GET_LID(ebhdr->u.nand_hdr.lid);
+       ec = le32toh(ebhdr->ec_hdr.erase_cnt);
+
+       sleb = kmem_alloc(sizeof(struct chfs_scan_leb), KM_SLEEP);
+
+       sleb->erase_cnt = ec;
+       sleb->lnr = lnr;
+       sleb->pebnr = pebnr;
+       sleb->info = serial;
+
+       old = RB_INSERT(scan_leb_used_rbtree, &si->used, sleb);
+       if (old) {
+               kmem_free(sleb, sizeof(struct chfs_scan_leb));
+		/*
+		 * A PEB for this LEB is already in the used tree: the
+		 * header with the smaller serial number is the older one.
+		 */
+               if (serial < old->info)
+                       return scan_add_to_queue(si, pebnr, ec, &si->erase);
+               else {
+                       err = scan_add_to_queue(si,
+                           old->pebnr, old->erase_cnt, &si->erase);
+                       if (err)
+                               return err;
+
+                       old->erase_cnt = ec;
+                       old->lnr = lnr;
+                       old->pebnr = pebnr;
+                       old->info = serial;
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+/**
+ * nand_process_eb - read the headers from NAND flash, check them and add to
+ *                   the scanning information
+ * @ebh: chfs eraseblock handler
+ * @si: chfs scanning information
+ * @pebnr: physical eraseblock number
+ * @ebhdr: eraseblock header to read into
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+nand_process_eb(struct chfs_ebh *ebh, struct chfs_scan_info *si,
+    int pebnr, struct chfs_eb_hdr *ebhdr)
+{
+       int err, erase_cnt, leb_status;
+       uint64_t max_serial;
+       /* isbad() is defined on some ancient platforms, heh */
+       bool is_bad;
+
+       /* Check block is bad */
+       err = flash_block_isbad(ebh->flash_dev,
+           pebnr * ebh->flash_if->erasesize, &is_bad);
+       if (err) {
+               chfs_err("checking block is bad failed\n");
+               return err;
+       }
+       if (is_bad) {
+               si->bad_peb_cnt++;
+               return 0;
+       }
+
+       err = ebh->ops->read_eb_hdr(ebh, pebnr, ebhdr);
+       if (err)
+               return err;
+
+       erase_cnt = le32toh(ebhdr->ec_hdr.erase_cnt);
+       leb_status = ebh->ops->check_eb_hdr(ebh, ebhdr);
+       if (EBHDR_LEB_BADMAGIC == leb_status ||
+           EBHDR_LEB_BADCRC == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->corrupted);
+               return err;
+       }
+       else if (EBHDR_LEB_FREE == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->free);
+               goto count_mean;
+       }
+       else if (EBHDR_LEB_NO_HDR == leb_status) {
+               err = scan_add_to_queue(si, pebnr, erase_cnt, &si->erased);
+               return err;
+       }
+
+       err = nand_scan_add_to_used(ebh, si, ebhdr, pebnr);
+       if (err)
+               return err;
+
+       max_serial = le64toh(ebhdr->u.nand_hdr.serial);
+       if (max_serial > *ebh->max_serial) {
+               *ebh->max_serial = max_serial;
+       }
+
+count_mean:
+       si->sum_of_ec += erase_cnt;
+       si->num_of_eb++;
+
+       return err;
+}
+
+/**
+ * chfs_scan - scans the media and returns information about it
+ * @ebh: chfs eraseblock handler
+ *
+ * This function scans through the media and collects information about it.
+ * If the scan fails, NULL is returned.
+ */
+struct chfs_scan_info *
+chfs_scan(struct chfs_ebh *ebh)
+{
+       struct chfs_scan_info *si;
+       struct chfs_eb_hdr *ebhdr;
+       int pebnr, err;
+
+       si = kmem_alloc(sizeof(*si), KM_SLEEP);
+
+       TAILQ_INIT(&si->corrupted);
+       TAILQ_INIT(&si->free);
+       TAILQ_INIT(&si->erase);
+       TAILQ_INIT(&si->erased);
+       RB_INIT(&si->used);
+       si->bad_peb_cnt = 0;
+       si->num_of_eb = 0;
+       si->sum_of_ec = 0;
+
+       ebhdr = kmem_alloc(sizeof(*ebhdr), KM_SLEEP);
+
+       for (pebnr = 0; pebnr < ebh->peb_nr; pebnr++) {
+               dbg_ebh("processing PEB %d\n", pebnr);
+               err = ebh->ops->process_eb(ebh, si, pebnr, ebhdr);
+               if (err < 0)
+                       goto out_ebhdr;
+       }
+       kmem_free(ebhdr, sizeof(*ebhdr));
+       dbg_ebh("[CHFS_SCAN] scanning information collected\n");
+       return si;
+
+out_ebhdr:
+       kmem_free(ebhdr, sizeof(*ebhdr));
+       kmem_free(si, sizeof(*si));
+       return NULL;
+}
+
+/**
+ * scan_info_destroy - frees all lists and trees in the scanning information
+ * @si: the scanning information
+ */
+void
+scan_info_destroy(struct chfs_scan_info *si)
+{
+       EBH_QUEUE_DESTROY(&si->corrupted,
+           struct chfs_scan_leb, u.queue);
+
+       EBH_QUEUE_DESTROY(&si->erase,
+           struct chfs_scan_leb, u.queue);
+
+       EBH_QUEUE_DESTROY(&si->erased,
+           struct chfs_scan_leb, u.queue);
+
+       EBH_QUEUE_DESTROY(&si->free,
+           struct chfs_scan_leb, u.queue);
+
+       EBH_TREE_DESTROY(scan_leb_used_rbtree,
+           &si->used, struct chfs_scan_leb);
+
+       kmem_free(si, sizeof(*si));
+       dbg_ebh("[SCAN_INFO_DESTROY] scanning information destroyed\n");
+}
+
+/**
+ * scan_media - scan the media and build the eraseblock lists
+ * @ebh: chfs eraseblock handler
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+scan_media(struct chfs_ebh *ebh)
+{
+       int err, i, avg_ec;
+       struct chfs_scan_info *si;
+       struct chfs_scan_leb *sleb;
+
+	si = chfs_scan(ebh);
+	if (!si) {
+		/* chfs_scan() failed and already freed its own state */
+		return EIO;
+	}
+       /*
+        * Process the scan info, manage the eraseblock lists
+        */
+       mutex_init(&ebh->ltree_lock, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&ebh->erase_lock, MUTEX_DEFAULT, IPL_NONE);
+       RB_INIT(&ebh->ltree);
+       RB_INIT(&ebh->free);
+       RB_INIT(&ebh->in_use);
+       TAILQ_INIT(&ebh->to_erase);
+       TAILQ_INIT(&ebh->fully_erased);
+       mutex_init(&ebh->alc_mutex, MUTEX_DEFAULT, IPL_NONE);
+
+       ebh->peb_nr -= si->bad_peb_cnt;
+
+       /*
+        * Create background thread for erasing
+        */
+       erase_thread_start(ebh);
+
+       ebh->lmap = kmem_alloc(ebh->peb_nr * sizeof(int), KM_SLEEP);
+
+       for (i = 0; i < ebh->peb_nr; i++) {
+               ebh->lmap[i] = EBH_LEB_UNMAPPED;
+       }
+
+       if (si->num_of_eb == 0) {
+               /* The flash contains no data. */
+               avg_ec = 0;
+       }
+       else {
+               avg_ec = (int) (si->sum_of_ec / si->num_of_eb);
+       }
+       dbg_ebh("num_of_eb: %d\n", si->num_of_eb);
+
+       mutex_enter(&ebh->erase_lock);
+
+       RB_FOREACH(sleb, scan_leb_used_rbtree, &si->used) {
+               ebh->lmap[sleb->lnr] = sleb->pebnr;
+               err = add_peb_to_in_use(ebh, sleb->pebnr, sleb->erase_cnt);
+               if (err)
+                       goto out_free;
+       }
+
+       TAILQ_FOREACH(sleb, &si->erased, u.queue) {
+               err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+                   &ebh->fully_erased);
+               if (err)
+                       goto out_free;
+       }
+
+       TAILQ_FOREACH(sleb, &si->erase, u.queue) {
+               err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+                   &ebh->to_erase);
+               if (err)
+                       goto out_free;
+       }
+
+       TAILQ_FOREACH(sleb, &si->free, u.queue) {
+               err = add_peb_to_free(ebh, sleb->pebnr, sleb->erase_cnt);
+               if (err)
+                       goto out_free;
+       }
+
+       TAILQ_FOREACH(sleb, &si->corrupted, u.queue) {
+               err = add_peb_to_erase_queue(ebh, sleb->pebnr, avg_ec,
+                   &ebh->to_erase);
+               if (err)
+                       goto out_free;
+       }
+       mutex_exit(&ebh->erase_lock);
+       scan_info_destroy(si);
+       return 0;
+
+out_free:
+       mutex_exit(&ebh->erase_lock);
+       kmem_free(ebh->lmap, ebh->peb_nr * sizeof(int));
+       scan_info_destroy(si);
+       dbg_ebh("[SCAN_MEDIA] returning with error: %d\n", err);
+       return err;
+}
+
+/*****************************************************************************/
+/* End of Scan related operations                                           */
+/*****************************************************************************/
+
+/**
+ * ebh_open - opens the flash device and initializes the eraseblock handler
+ * @ebh: eraseblock handler
+ * @dev: flash device number to use
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_open(struct chfs_ebh *ebh, dev_t dev)
+{
+       int err;
+
+       ebh->flash_dev = flash_get_device(dev);
+       if (!ebh->flash_dev) {
+		aprint_error("ebh_open: can't get flash device\n");
+               return ENODEV;
+       }
+
+       ebh->flash_if = flash_get_interface(dev);
+       if (!ebh->flash_if) {
+		aprint_error("ebh_open: can't get flash interface\n");
+               return ENODEV;
+       }
+
+       ebh->flash_size = flash_get_size(dev);
+       ebh->peb_nr = ebh->flash_size / ebh->flash_if->erasesize;
+//     ebh->peb_nr = ebh->flash_if->size / ebh->flash_if->erasesize;
+       /* Set up flash operations based on flash type */
+       ebh->ops = kmem_alloc(sizeof(struct chfs_ebh_ops), KM_SLEEP);
+
+       switch (ebh->flash_if->type) {
+       case FLASH_TYPE_NOR:
+               ebh->eb_size = ebh->flash_if->erasesize -
+                   CHFS_EB_EC_HDR_SIZE - CHFS_EB_HDR_NOR_SIZE;
+
+               ebh->ops->read_eb_hdr = nor_read_eb_hdr;
+               ebh->ops->write_eb_hdr = nor_write_eb_hdr;
+               ebh->ops->check_eb_hdr = nor_check_eb_hdr;
+               ebh->ops->mark_eb_hdr_dirty_flash =
+                   nor_mark_eb_hdr_dirty_flash;
+               ebh->ops->invalidate_eb_hdr = nor_invalidate_eb_hdr;
+               ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free;
+
+               ebh->ops->process_eb = nor_process_eb;
+
+               ebh->ops->create_eb_hdr = nor_create_eb_hdr;
+               ebh->ops->calc_data_offs = nor_calc_data_offs;
+
+               ebh->max_serial = NULL;
+               break;
+       case FLASH_TYPE_NAND:
+               ebh->eb_size = ebh->flash_if->erasesize -
+                   2 * ebh->flash_if->page_size;
+
+               ebh->ops->read_eb_hdr = nand_read_eb_hdr;
+               ebh->ops->write_eb_hdr = nand_write_eb_hdr;
+               ebh->ops->check_eb_hdr = nand_check_eb_hdr;
+               ebh->ops->mark_eb_hdr_free = mark_eb_hdr_free;
+               ebh->ops->mark_eb_hdr_dirty_flash = NULL;
+               ebh->ops->invalidate_eb_hdr = NULL;
+
+               ebh->ops->process_eb = nand_process_eb;
+
+               ebh->ops->create_eb_hdr = nand_create_eb_hdr;
+               ebh->ops->calc_data_offs = nand_calc_data_offs;
+
+               ebh->max_serial = kmem_alloc(sizeof(uint64_t), KM_SLEEP);
+
+               *ebh->max_serial = 0;
+               break;
+	default:
+		/* unsupported flash type */
+		kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops));
+		return EINVAL;
+	}
+       printf("opening ebh: eb_size: %zu\n", ebh->eb_size);
+       err = scan_media(ebh);
+       if (err) {
+		dbg_ebh("Scan failed.\n");
+               kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops));
+               kmem_free(ebh, sizeof(struct chfs_ebh));
+               return err;
+       }
+       return 0;
+}
+
+/**
+ * ebh_close - closes the eraseblock handler
+ * @ebh: eraseblock handler
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_close(struct chfs_ebh *ebh)
+{
+       erase_thread_stop(ebh);
+
+       EBH_TREE_DESTROY(peb_free_rbtree, &ebh->free, struct chfs_peb);
+       EBH_TREE_DESTROY(peb_in_use_rbtree, &ebh->in_use, struct chfs_peb);
+
+       EBH_QUEUE_DESTROY(&ebh->fully_erased, struct chfs_peb, u.queue);
+       EBH_QUEUE_DESTROY(&ebh->to_erase, struct chfs_peb, u.queue);
+
+       /* XXX HACK, see ebh.h */
+       EBH_TREE_DESTROY_MUTEX(ltree_rbtree, &ebh->ltree,
+           struct chfs_ltree_entry);
+
+       KASSERT(!mutex_owned(&ebh->ltree_lock));
+       KASSERT(!mutex_owned(&ebh->alc_mutex));
+       KASSERT(!mutex_owned(&ebh->erase_lock));
+
+       mutex_destroy(&ebh->ltree_lock);
+       mutex_destroy(&ebh->alc_mutex);
+       mutex_destroy(&ebh->erase_lock);
+
+       kmem_free(ebh->ops, sizeof(struct chfs_ebh_ops));
+       kmem_free(ebh, sizeof(struct chfs_ebh));
+
+       return 0;
+}
+
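+/*
+ * Illustrative sketch (hypothetical caller): the usual open/use/close
+ * sequence for a handler. Ownership on failure is subtle: ebh_open()
+ * frees @ebh itself when the media scan fails, but not on its earlier
+ * error paths, so a real caller must know which case it is handling.
+ */
+static int
+example_ebh_lifecycle(dev_t dev)
+{
+	struct chfs_ebh *ebh;
+	int err;
+
+	ebh = kmem_zalloc(sizeof(*ebh), KM_SLEEP);
+	err = ebh_open(ebh, dev);
+	if (err)
+		return err;
+	/* ... ebh_read_leb()/ebh_write_leb() traffic goes here ... */
+	return ebh_close(ebh);	/* frees ebh and its ops */
+}
+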
+/**
+ * ebh_read_leb - read data from leb
+ * @ebh: eraseblock handler
+ * @lnr: logical eraseblock number
+ * @buf: buffer to read into
+ * @offset: offset to read from
+ * @len: number of bytes to read
+ * @retlen: actual number of bytes read is returned here
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_read_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset,
+    size_t len, size_t *retlen)
+{
+       int err, pebnr;
+       off_t data_offset;
+
+       KASSERT(offset + len <= ebh->eb_size);
+
+       err = leb_read_lock(ebh, lnr);
+       if (err)
+               return err;
+       pebnr = ebh->lmap[lnr];
+       /* If PEB is not mapped the buffer is filled with 0xFF */
+       if (EBH_LEB_UNMAPPED == pebnr) {
+               leb_read_unlock(ebh, lnr);
+               memset(buf, 0xFF, len);
+               return 0;
+       }
+
+       /* Read data */
+       data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+       err = flash_read(ebh->flash_dev, data_offset, len, retlen,
+           (unsigned char *) buf);
+       if (err)
+               goto out_free;
+
+       KASSERT(len == *retlen);
+
+       leb_read_unlock(ebh, lnr);
+       return err;
+
+out_free:
+       leb_read_unlock(ebh, lnr);
+       return err;
+}
+
+/**
+ * get_peb - get a free physical eraseblock
+ * @ebh: chfs eraseblock handler
+ *
+ * This function gets a free eraseblock from the ebh->free RB-tree.
+ * The first entry is returned and deleted from the tree.
+ * The entries are sorted by erase counter, so the PEB with the smallest
+ * erase counter is returned.
+ * If something goes wrong, a negative value is returned.
+ */
+int
+get_peb(struct chfs_ebh *ebh)
+{
+       int err, pebnr;
+       struct chfs_peb *peb;
+
+retry:
+       mutex_enter(&ebh->erase_lock);
+       //dbg_ebh("LOCK: ebh->erase_lock spin locked in get_peb()\n");
+       if (RB_EMPTY(&ebh->free)) {
+		/* There are no more free PEBs in the tree */
+               if (TAILQ_EMPTY(&ebh->to_erase) &&
+                   TAILQ_EMPTY(&ebh->fully_erased)) {
+                       mutex_exit(&ebh->erase_lock);
+                       //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+			return -ENOSPC;
+               }
+               err = free_peb(ebh);
+
+               mutex_exit(&ebh->erase_lock);
+               //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+
+		if (err)
+			return -err;
+               goto retry;
+       }
+       peb = RB_MIN(peb_free_rbtree, &ebh->free);
+       pebnr = peb->pebnr;
+       RB_REMOVE(peb_free_rbtree, &ebh->free, peb);
+       err = add_peb_to_in_use(ebh, peb->pebnr, peb->erase_cnt);
+	if (err)
+		pebnr = -err;	/* report failure as a negative value */
+
+       kmem_free(peb, sizeof(struct chfs_peb));
+
+       mutex_exit(&ebh->erase_lock);
+       //dbg_ebh("UNLOCK: ebh->erase_lock spin unlocked in get_peb()\n");
+
+       return pebnr;
+}
+
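+/*
+ * Illustrative sketch: get_peb() multiplexes a PEB number and an error in
+ * a single int, so the sign must be checked before the value is used,
+ * exactly as ebh_write_leb() and ebh_map_leb() below do.
+ */
+static int
+example_take_peb(struct chfs_ebh *ebh)
+{
+	int pebnr = get_peb(ebh);
+
+	if (pebnr < 0)
+		return pebnr;	/* -ENOSPC or another negated error */
+	/* ... write an eraseblock header to pebnr, or release_peb() it ... */
+	return 0;
+}
+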
+/**
+ * ebh_write_leb - write data to leb
+ * @ebh: eraseblock handler
+ * @lnr: logical eraseblock number
+ * @buf: data to write
+ * @offset: offset where to write
+ * @len: number of bytes to write
+ * @retlen: actual number of bytes written is returned here
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_write_leb(struct chfs_ebh *ebh, int lnr, char *buf, uint32_t offset,
+    size_t len, size_t *retlen)
+{
+       int err, pebnr, retries = 0;
+       off_t data_offset;
+       struct chfs_eb_hdr *ebhdr;
+
+       dbg("offset: %d | len: %zu | (offset+len): %zu "
+           " | ebsize: %zu\n", offset, len, (offset+len), ebh->eb_size);
+
+       KASSERT(offset + len <= ebh->eb_size);
+
+       err = leb_write_lock(ebh, lnr);
+       if (err)
+               return err;
+
+       pebnr = ebh->lmap[lnr];
+       /* If the LEB is mapped write out data */
+       if (pebnr != EBH_LEB_UNMAPPED) {
+               data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+               err = flash_write(ebh->flash_dev, data_offset, len, retlen,
+                   (unsigned char *) buf);
+
+               if (err) {
+                       chfs_err("error %d while writing %zu bytes to PEB "
+                           "%d:%ju, written %zu bytes\n",
+                           err, len, pebnr, (uintmax_t )offset, *retlen);
+               } else {
+                       KASSERT(len == *retlen);
+               }
+
+               leb_write_unlock(ebh, lnr);
+               return err;
+       }
+
+       /*
+        * If the LEB is unmapped, get a free PEB and write the
+        * eraseblock header first
+        */
+       ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+       /* Setting up eraseblock header properties */
+       ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+retry:
+       /* Getting a physical eraseblock from the wear leveling system */
+       pebnr = get_peb(ebh);
+       if (pebnr < 0) {
+               leb_write_unlock(ebh, lnr);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return pebnr;
+       }
+
+       /* Write the eraseblock header to the media */
+       err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+       if (err) {
+               chfs_warn(
+                       "error writing eraseblock header: LEB %d , PEB %d\n",
+                       lnr, pebnr);
+               goto write_error;
+       }
+
+       /* Write out data */
+       if (len) {
+               data_offset = ebh->ops->calc_data_offs(ebh, pebnr, offset);
+               err = flash_write(ebh->flash_dev,
+                   data_offset, len, retlen, (unsigned char *) buf);
+               if (err) {
+                       chfs_err("error %d while writing %zu bytes to PEB "
+                           " %d:%ju, written %zu bytes\n",
+                           err, len, pebnr, (uintmax_t )offset, *retlen);
+                       goto write_error;
+               }
+       }
+
+       ebh->lmap[lnr] = pebnr;
+       leb_write_unlock(ebh, lnr);
+       kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+
+       return 0;
+
+write_error:
+	err = release_peb(ebh, pebnr);
+	// max retries (NOW: 2)
+       if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+               leb_write_unlock(ebh, lnr);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return err;
+       }
+       goto retry;
+}
+
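+/*
+ * Illustrative sketch (hypothetical values): a write/read round trip on
+ * one LEB. Writing to an unmapped LEB maps it implicitly; reading an
+ * unmapped LEB just fills the buffer with 0xFF, mirroring erased flash.
+ */
+static int
+example_leb_roundtrip(struct chfs_ebh *ebh, int lnr)
+{
+	char wbuf[16] = "chfs example", rbuf[16];
+	size_t retlen;
+	int err;
+
+	err = ebh_write_leb(ebh, lnr, wbuf, 0, sizeof(wbuf), &retlen);
+	if (err)
+		return err;
+	err = ebh_read_leb(ebh, lnr, rbuf, 0, sizeof(rbuf), &retlen);
+	if (err)
+		return err;
+	KASSERT(memcmp(wbuf, rbuf, sizeof(wbuf)) == 0);
+	return 0;
+}
+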
+/**
+ * ebh_erase_leb - erase a leb
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_erase_leb(struct chfs_ebh *ebh, int lnr)
+{
+       int err, pebnr;
+
+	err = leb_write_lock(ebh, lnr);
+	if (err)
+		return err;
+
+       pebnr = ebh->lmap[lnr];
+       if (pebnr < 0) {
+               leb_write_unlock(ebh, lnr);
+               return EBH_LEB_UNMAPPED;
+       }
+       err = release_peb(ebh, pebnr);
+       if (err)
+               goto out_unlock;
+
+       ebh->lmap[lnr] = EBH_LEB_UNMAPPED;
+       cv_signal(&ebh->bg_erase.eth_wakeup);
+out_unlock:
+       leb_write_unlock(ebh, lnr);
+       return err;
+}
+
+/**
+ * ebh_map_leb - maps a PEB to LEB
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero on success, error code in case of failure.
+ */
+int
+ebh_map_leb(struct chfs_ebh *ebh, int lnr)
+{
+       int err, pebnr, retries = 0;
+       struct chfs_eb_hdr *ebhdr;
+
+	err = leb_write_lock(ebh, lnr);
+	if (err)
+		return err;
+
+	ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+retry:
+       pebnr = get_peb(ebh);
+       if (pebnr < 0) {
+               err = pebnr;
+               goto out_unlock;
+       }
+
+       ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+       err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+       if (err) {
+               chfs_warn(
+                       "error writing eraseblock header: LEB %d , PEB %d\n",
+                       lnr, pebnr);
+               goto write_error;
+       }
+
+       ebh->lmap[lnr] = pebnr;
+
+out_unlock:
+	leb_write_unlock(ebh, lnr);
+	kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+	return err;
+
+write_error:
+       err = release_peb(ebh, pebnr);
+       // max retries (NOW: 2)
+       if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+               leb_write_unlock(ebh, lnr);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return err;
+       }
+       goto retry;
+}
+
+/**
+ * ebh_unmap_leb - unmaps a LEB
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns zero on success, error code in case of failure.
+ */
+int
+ebh_unmap_leb(struct chfs_ebh *ebh, int lnr)
+{
+       int err;
+
+       if (ebh_is_mapped(ebh, lnr) < 0)
+		/* the eraseblock is already unmapped, nothing to do */
+               return 0;
+
+       err = ebh_erase_leb(ebh, lnr);
+
+       return err;
+}
+
+/**
+ * ebh_is_mapped - check if a PEB is mapped to @lnr
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ *
+ * Returns the mapped PEB number (>= 0) if the logical eraseblock is
+ * mapped, a negative value otherwise.
+ */
+int
+ebh_is_mapped(struct chfs_ebh *ebh, int lnr)
+{
+       int err, result;
+       err = leb_read_lock(ebh, lnr);
+       if (err)
+               return err;
+
+       result = ebh->lmap[lnr];
+       leb_read_unlock(ebh, lnr);
+
+       return result;
+}
+
+/**
+ * ebh_change_leb - write the LEB to another PEB
+ * @ebh: eraseblock handler
+ * @lnr: leb number
+ * @buf: data to write
+ * @len: length of data
+ * @retlen: actual number of bytes written is returned here
+ *
+ * Returns zero in case of success, error code in case of failure.
+ */
+int
+ebh_change_leb(struct chfs_ebh *ebh, int lnr, char *buf, size_t len,
+    size_t *retlen)
+{
+       int err, pebnr, pebnr_old, retries = 0;
+       off_t data_offset;
+
+       struct chfs_peb *peb = NULL;
+       struct chfs_eb_hdr *ebhdr;
+
+       if (ebh_is_mapped(ebh, lnr) < 0)
+               return EBH_LEB_UNMAPPED;
+
+       if (len == 0) {
+               err = ebh_unmap_leb(ebh, lnr);
+               if (err)
+                       return err;
+               return ebh_map_leb(ebh, lnr);
+       }
+
+       ebhdr = kmem_alloc(sizeof(struct chfs_eb_hdr), KM_SLEEP);
+
+       pebnr_old = ebh->lmap[lnr];
+
+       mutex_enter(&ebh->alc_mutex);
+       err = leb_write_lock(ebh, lnr);
+       if (err)
+               goto out_mutex;
+
+       if (ebh->ops->mark_eb_hdr_dirty_flash) {
+               err = ebh->ops->mark_eb_hdr_dirty_flash(ebh, pebnr_old, lnr);
+               if (err)
+                       goto out_unlock;
+       }
+
+       /* Setting up eraseblock header properties */
+       ebh->ops->create_eb_hdr(ebhdr, lnr);
+
+retry:
+       /* Getting a physical eraseblock from the wear leveling system */
+       pebnr = get_peb(ebh);
+       if (pebnr < 0) {
+               leb_write_unlock(ebh, lnr);
+               mutex_exit(&ebh->alc_mutex);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return pebnr;
+       }
+
+       err = ebh->ops->write_eb_hdr(ebh, pebnr, ebhdr);
+       if (err) {
+               chfs_warn(
+                       "error writing eraseblock header: LEB %d , PEB %d",
+                       lnr, pebnr);
+               goto write_error;
+       }
+
+       /* Write out data */
+       data_offset = ebh->ops->calc_data_offs(ebh, pebnr, 0);
+       err = flash_write(ebh->flash_dev, data_offset, len, retlen,
+           (unsigned char *) buf);
+       if (err) {
+               chfs_err("error %d while writing %zu bytes to PEB %d:%ju,"
+                   " written %zu bytes",
+                   err, len, pebnr, (uintmax_t)data_offset, *retlen);
+               goto write_error;
+       }
+
+       ebh->lmap[lnr] = pebnr;
+
+       if (ebh->ops->invalidate_eb_hdr) {
+               err = ebh->ops->invalidate_eb_hdr(ebh, pebnr_old);
+               if (err)
+                       goto out_unlock;
+       }
+	peb = find_peb_in_use(ebh, pebnr_old);
+	if (peb != NULL)
+		err = release_peb(ebh, peb->pebnr);
+
+out_unlock:
+       leb_write_unlock(ebh, lnr);
+
+out_mutex:
+       mutex_exit(&ebh->alc_mutex);
+       kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+	if (peb != NULL)
+		kmem_free(peb, sizeof(struct chfs_peb));
+       return err;
+
+write_error:
+       err = release_peb(ebh, pebnr);
+	// max retries (NOW: 2)
+       if (err || CHFS_MAX_GET_PEB_RETRIES < ++retries) {
+               leb_write_unlock(ebh, lnr);
+               mutex_exit(&ebh->alc_mutex);
+               kmem_free(ebhdr, sizeof(struct chfs_eb_hdr));
+               return err;
+       }
+       goto retry;
+}
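+
+/*
+ * Illustrative sketch (hypothetical buffer): ebh_change_leb() rewrites a
+ * whole LEB onto a fresh PEB and only then schedules the old PEB for
+ * erase, so the scan code above can pick a single winner if the update
+ * is interrupted.
+ */
+static int
+example_leb_rewrite(struct chfs_ebh *ebh, int lnr, char *newbuf, size_t len)
+{
+	size_t retlen;
+
+	return ebh_change_leb(ebh, lnr, newbuf, len, &retlen);
+}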
+
similarity index 100%
rename from include/ufs/chfs/ebh.h
rename to sys/ufs/chfs/ebh.h
diff --git a/sys/ufs/ext2fs/Makefile b/sys/ufs/ext2fs/Makefile
new file mode 100644 (file)
index 0000000..a3df42f
--- /dev/null
@@ -0,0 +1,7 @@
+#      $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $
+
+INCSDIR= /usr/include/ufs/ext2fs
+
+INCS=  ext2fs.h ext2fs_dinode.h ext2fs_dir.h ext2fs_extern.h
+
+.include <bsd.kinc.mk>
diff --git a/sys/ufs/ext2fs/ext2fs_alloc.c b/sys/ufs/ext2fs/ext2fs_alloc.c
new file mode 100644 (file)
index 0000000..9c2b4cf
--- /dev/null
@@ -0,0 +1,637 @@
+/*     $NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $  */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94
+ *  Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ffs_alloc.c 8.11 (Berkeley) 10/27/94
+ *  Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_alloc.c,v 1.42 2011/03/06 04:46:26 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/syslog.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+u_long ext2gennumber;
+
+static daddr_t ext2fs_alloccg(struct inode *, int, daddr_t, int);
+static u_long  ext2fs_dirpref(struct m_ext2fs *);
+static void    ext2fs_fserr(struct m_ext2fs *, u_int, const char *);
+static u_long  ext2fs_hashalloc(struct inode *, int, long, int,
+                   daddr_t (*)(struct inode *, int, daddr_t, int));
+static daddr_t ext2fs_nodealloccg(struct inode *, int, daddr_t, int);
+static daddr_t ext2fs_mapsearch(struct m_ext2fs *, char *, daddr_t);
+
+/*
+ * Allocate a block in the file system.
+ *
+ * A preference may be optionally specified. If a preference is given
+ * the following hierarchy is used to allocate a block:
+ *   1) allocate the requested block.
+ *   2) allocate a rotationally optimal block in the same cylinder.
+ *   3) allocate a block in the same cylinder group.
+ *   4) quadratically rehash into other cylinder groups, until an
+ *       available block is located.
+ * If no block preference is given the following hierarchy is used
+ * to allocate a block:
+ *   1) allocate a block in the cylinder group that contains the
+ *       inode for the file.
+ *   2) quadratically rehash into other cylinder groups, until an
+ *       available block is located.
+ */
+int
+ext2fs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref,
+    kauth_cred_t cred, daddr_t *bnp)
+{
+       struct m_ext2fs *fs;
+       daddr_t bno;
+       int cg;
+
+       *bnp = 0;
+       fs = ip->i_e2fs;
+#ifdef DIAGNOSTIC
+       if (cred == NOCRED)
+               panic("ext2fs_alloc: missing credential");
+#endif /* DIAGNOSTIC */
+       if (fs->e2fs.e2fs_fbcount == 0)
+               goto nospace;
+       if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+           NULL, NULL) != 0 &&
+           freespace(fs) <= 0)
+               goto nospace;
+       if (bpref >= fs->e2fs.e2fs_bcount)
+               bpref = 0;
+       if (bpref == 0)
+               cg = ino_to_cg(fs, ip->i_number);
+       else
+               cg = dtog(fs, bpref);
+       bno = (daddr_t)ext2fs_hashalloc(ip, cg, bpref, fs->e2fs_bsize,
+           ext2fs_alloccg);
+       if (bno > 0) {
+               ip->i_e2fs_nblock += btodb(fs->e2fs_bsize);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               *bnp = bno;
+               return (0);
+       }
+nospace:
+       ext2fs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+       uprintf("\n%s: write failed, file system is full\n", fs->e2fs_fsmnt);
+       return (ENOSPC);
+}
+
+/*
+ * Allocate an inode in the file system.
+ *
+ * If allocating a directory, use ext2fs_dirpref to select the inode.
+ * If allocating in a directory, the following hierarchy is followed:
+ *   1) allocate the preferred inode.
+ *   2) allocate an inode in the same cylinder group.
+ *   3) quadratically rehash into other cylinder groups, until an
+ *       available inode is located.
+ * If no inode preference is given the following hierarchy is used
+ * to allocate an inode:
+ *   1) allocate an inode in cylinder group 0.
+ *   2) quadratically rehash into other cylinder groups, until an
+ *       available inode is located.
+ */
+int
+ext2fs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+    struct vnode **vpp)
+{
+       struct inode *pip;
+       struct m_ext2fs *fs;
+       struct inode *ip;
+       ino_t ino, ipref;
+       int cg, error;
+
+       *vpp = NULL;
+       pip = VTOI(pvp);
+       fs = pip->i_e2fs;
+       if (fs->e2fs.e2fs_ficount == 0)
+               goto noinodes;
+
+       if ((mode & IFMT) == IFDIR)
+               cg = ext2fs_dirpref(fs);
+       else
+               cg = ino_to_cg(fs, pip->i_number);
+       ipref = cg * fs->e2fs.e2fs_ipg + 1;
+       ino = (ino_t)ext2fs_hashalloc(pip, cg, (long)ipref, mode, ext2fs_nodealloccg);
+       if (ino == 0)
+               goto noinodes;
+       error = VFS_VGET(pvp->v_mount, ino, vpp);
+       if (error) {
+               ext2fs_vfree(pvp, ino, mode);
+               return (error);
+       }
+       ip = VTOI(*vpp);
+       if (ip->i_e2fs_mode && ip->i_e2fs_nlink != 0) {
+               printf("mode = 0%o, nlinks %d, inum = %llu, fs = %s\n",
+                   ip->i_e2fs_mode, ip->i_e2fs_nlink,
+                   (unsigned long long)ip->i_number, fs->e2fs_fsmnt);
+               panic("ext2fs_valloc: dup alloc");
+       }
+
+       memset(ip->i_din.e2fs_din, 0, sizeof(struct ext2fs_dinode));
+
+       /*
+        * Set up a new generation number for this inode.
+        */
+       if (++ext2gennumber < time_second)
+               ext2gennumber = time_second;
+       ip->i_e2fs_gen = ext2gennumber;
+       return (0);
+noinodes:
+       ext2fs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
+       uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt);
+       return (ENOSPC);
+}
+
+/*
+ * Find a cylinder to place a directory.
+ *
+ * The policy implemented by this algorithm is to select, from among
+ * those cylinder groups with above the average number of free inodes,
+ * the one with the greatest number of free blocks.
+ */
+static u_long
+ext2fs_dirpref(struct m_ext2fs *fs)
+{
+       int cg, maxspace, mincg, avgifree;
+
+       avgifree = fs->e2fs.e2fs_ficount / fs->e2fs_ncg;
+       maxspace = 0;
+       mincg = -1;
+       for (cg = 0; cg < fs->e2fs_ncg; cg++)
+               if ( fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree) {
+                       if (mincg == -1 || fs->e2fs_gd[cg].ext2bgd_nbfree > maxspace) {
+                               mincg = cg;
+                               maxspace = fs->e2fs_gd[cg].ext2bgd_nbfree;
+                       }
+               }
+       return mincg;
+}
+
+/*
+ * Select the desired position for the next block in a file.  The file is
+ * logically divided into sections. The first section is composed of the
+ * direct blocks. Each additional section contains fs_maxbpg blocks.
+ *
+ * If no blocks have been allocated in the first section, the policy is to
+ * request a block in the same cylinder group as the inode that describes
+ * the file. Otherwise, the policy is to try to allocate the blocks
+ * contiguously. The two fields of the ext2 inode extension (see
+ * ufs/ufs/inode.h) help this.
+ */
+daddr_t
+ext2fs_blkpref(struct inode *ip, daddr_t lbn, int indx,
+               int32_t *bap /* XXX ondisk32 */)
+{
+       struct m_ext2fs *fs;
+       int cg, i;
+
+       fs = ip->i_e2fs;
+       /*
+	 * if we are doing contiguous lbn allocation, try to alloc blocks
+	 * contiguously on disk
+        */
+
+       if ( ip->i_e2fs_last_blk && lbn == ip->i_e2fs_last_lblk + 1) {
+               return ip->i_e2fs_last_blk + 1;
+       }
+
+       /*
+        * bap, if provided, gives us a list of blocks to which we want to
+        * stay close
+        */
+
+       if (bap) {
+               for (i = indx; i >= 0 ; i--) {
+                       if (bap[i]) {
+                               return fs2h32(bap[i]) + 1;
+                       }
+               }
+       }
+
+       /* fall back to the first block of the cylinder containing the inode */
+
+       cg = ino_to_cg(fs, ip->i_number);
+       return fs->e2fs.e2fs_bpg * cg + fs->e2fs.e2fs_first_dblock + 1;
+}
+
+/*
+ * Implement the cylinder overflow algorithm.
+ *
+ * The policy implemented by this algorithm is:
+ *   1) allocate the block in its requested cylinder group.
+ *   2) quadratically rehash on the cylinder group number.
+ *   3) brute force search for a free block.
+ */
+static u_long
+ext2fs_hashalloc(struct inode *ip, int cg, long pref, int size,
+               daddr_t (*allocator)(struct inode *, int, daddr_t, int))
+{
+       struct m_ext2fs *fs;
+       long result;
+       int i, icg = cg;
+
+       fs = ip->i_e2fs;
+       /*
+        * 1: preferred cylinder group
+        */
+       result = (*allocator)(ip, cg, pref, size);
+       if (result)
+               return (result);
+       /*
+        * 2: quadratic rehash
+        */
+       for (i = 1; i < fs->e2fs_ncg; i *= 2) {
+               cg += i;
+               if (cg >= fs->e2fs_ncg)
+                       cg -= fs->e2fs_ncg;
+               result = (*allocator)(ip, cg, 0, size);
+               if (result)
+                       return (result);
+       }
+       /*
+        * 3: brute force search
+        * Note that we start at i == 2, since 0 was checked initially,
+        * and 1 is always checked in the quadratic rehash.
+        */
+       cg = (icg + 2) % fs->e2fs_ncg;
+       for (i = 2; i < fs->e2fs_ncg; i++) {
+               result = (*allocator)(ip, cg, 0, size);
+               if (result)
+                       return (result);
+               cg++;
+               if (cg == fs->e2fs_ncg)
+                       cg = 0;
+       }
+       return (0);
+}
+
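+/*
+ * Worked example (illustrative): with e2fs_ncg == 8 and a preferred group
+ * of 3, pass 1 tries group 3 itself; the quadratic rehash then probes
+ * group 4 (3+1), group 6 (4+2) and group 2 ((6+4) mod 8); the brute-force
+ * pass finally starts at (3+2) mod 8 == 5 and walks 5, 6, 7, 0, 1, 2.
+ */
+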
+/*
+ * Determine whether a block can be allocated.
+ *
+ * Check to see if a block of the appropriate size is available,
+ * and if it is, allocate it.
+ */
+
+static daddr_t
+ext2fs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
+{
+       struct m_ext2fs *fs;
+       char *bbp;
+       struct buf *bp;
+       /* XXX ondisk32 */
+       int error, bno, start, end, loc;
+
+       fs = ip->i_e2fs;
+       if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0)
+               return (0);
+       error = bread(ip->i_devvp, fsbtodb(fs,
+               fs->e2fs_gd[cg].ext2bgd_b_bitmap),
+               (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (0);
+       }
+       bbp = (char *)bp->b_data;
+
+       if (dtog(fs, bpref) != cg)
+               bpref = 0;
+       if (bpref != 0) {
+               bpref = dtogd(fs, bpref);
+               /*
+                * if the requested block is available, use it
+                */
+               if (isclr(bbp, bpref)) {
+                       bno = bpref;
+                       goto gotit;
+               }
+       }
+       /*
+        * no blocks in the requested cylinder, so take next
+        * available one in this cylinder group.
+	 * first try to get 8 contiguous blocks, then fall back to a single
+        * block.
+        */
+       if (bpref)
+               start = dtogd(fs, bpref) / NBBY;
+       else
+               start = 0;
+       end = howmany(fs->e2fs.e2fs_fpg, NBBY) - start;
+       for (loc = start; loc < end; loc++) {
+               if (bbp[loc] == 0) {
+                       bno = loc * NBBY;
+                       goto gotit;
+               }
+       }
+       for (loc = 0; loc < start; loc++) {
+               if (bbp[loc] == 0) {
+                       bno = loc * NBBY;
+                       goto gotit;
+               }
+       }
+
+       bno = ext2fs_mapsearch(fs, bbp, bpref);
+       if (bno < 0)
+               return (0);
+gotit:
+#ifdef DIAGNOSTIC
+       if (isset(bbp, (daddr_t)bno)) {
+               printf("ext2fs_alloccgblk: cg=%d bno=%d fs=%s\n",
+                       cg, bno, fs->e2fs_fsmnt);
+               panic("ext2fs_alloccg: dup alloc");
+       }
+#endif
+       setbit(bbp, (daddr_t)bno);
+       fs->e2fs.e2fs_fbcount--;
+       fs->e2fs_gd[cg].ext2bgd_nbfree--;
+       fs->e2fs_fmod = 1;
+       bdwrite(bp);
+       return (cg * fs->e2fs.e2fs_fpg + fs->e2fs.e2fs_first_dblock + bno);
+}
+
+/*
+ * Determine whether an inode can be allocated.
+ *
+ * Check to see if an inode is available, and if it is,
+ * allocate it using the following policy:
+ *   1) allocate the requested inode.
+ *   2) allocate the next available inode after the requested
+ *       inode in the specified cylinder group.
+ */
+static daddr_t
+ext2fs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
+{
+       struct m_ext2fs *fs;
+       char *ibp;
+       struct buf *bp;
+       int error, start, len, loc, map, i;
+
+       ipref--; /* to avoid a lot of (ipref -1) */
+       if (ipref == -1)
+               ipref = 0;
+       fs = ip->i_e2fs;
+       if (fs->e2fs_gd[cg].ext2bgd_nifree == 0)
+               return (0);
+       error = bread(ip->i_devvp, fsbtodb(fs,
+               fs->e2fs_gd[cg].ext2bgd_i_bitmap),
+               (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (0);
+       }
+       ibp = (char *)bp->b_data;
+       if (ipref) {
+               ipref %= fs->e2fs.e2fs_ipg;
+               if (isclr(ibp, ipref))
+                       goto gotit;
+       }
+       start = ipref / NBBY;
+       len = howmany(fs->e2fs.e2fs_ipg - ipref, NBBY);
+       loc = skpc(0xff, len, &ibp[start]);
+       if (loc == 0) {
+               len = start + 1;
+               start = 0;
+               loc = skpc(0xff, len, &ibp[0]);
+               if (loc == 0) {
+                       printf("cg = %d, ipref = %lld, fs = %s\n",
+                               cg, (long long)ipref, fs->e2fs_fsmnt);
+                       panic("ext2fs_nodealloccg: map corrupted");
+                       /* NOTREACHED */
+               }
+       }
+       i = start + len - loc;
+       map = ibp[i] ^ 0xff;
+       if (map == 0) {
+               printf("fs = %s\n", fs->e2fs_fsmnt);
+               panic("ext2fs_nodealloccg: block not in map");
+       }
+       ipref = i * NBBY + ffs(map) - 1;
+gotit:
+       setbit(ibp, ipref);
+       fs->e2fs.e2fs_ficount--;
+       fs->e2fs_gd[cg].ext2bgd_nifree--;
+       fs->e2fs_fmod = 1;
+       if ((mode & IFMT) == IFDIR) {
+               fs->e2fs_gd[cg].ext2bgd_ndirs++;
+       }
+       bdwrite(bp);
+	return (cg * fs->e2fs.e2fs_ipg + ipref + 1);
+}
+
+/*
+ * Free a block.
+ *
+ * The specified block is placed back in the
+ * free map.
+ */
+void
+ext2fs_blkfree(struct inode *ip, daddr_t bno)
+{
+       struct m_ext2fs *fs;
+       char *bbp;
+       struct buf *bp;
+       int error, cg;
+
+       fs = ip->i_e2fs;
+       cg = dtog(fs, bno);
+       if ((u_int)bno >= fs->e2fs.e2fs_bcount) {
+               printf("bad block %lld, ino %llu\n", (long long)bno,
+                   (unsigned long long)ip->i_number);
+               ext2fs_fserr(fs, ip->i_uid, "bad block");
+               return;
+       }
+       error = bread(ip->i_devvp,
+               fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap),
+               (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return;
+       }
+       bbp = (char *)bp->b_data;
+       bno = dtogd(fs, bno);
+       if (isclr(bbp, bno)) {
+               printf("dev = 0x%llx, block = %lld, fs = %s\n",
+                   (unsigned long long)ip->i_dev, (long long)bno,
+                   fs->e2fs_fsmnt);
+               panic("blkfree: freeing free block");
+       }
+       clrbit(bbp, bno);
+       fs->e2fs.e2fs_fbcount++;
+       fs->e2fs_gd[cg].ext2bgd_nbfree++;
+
+       fs->e2fs_fmod = 1;
+       bdwrite(bp);
+}
+
+/*
+ * Free an inode.
+ *
+ * The specified inode is placed back in the free map.
+ */
+int
+ext2fs_vfree(struct vnode *pvp, ino_t ino, int mode)
+{
+       struct m_ext2fs *fs;
+       char *ibp;
+       struct inode *pip;
+       struct buf *bp;
+       int error, cg;
+
+       pip = VTOI(pvp);
+       fs = pip->i_e2fs;
+       if ((u_int)ino > fs->e2fs.e2fs_icount || (u_int)ino < EXT2_FIRSTINO)
+               panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+                   (unsigned long long)pip->i_dev, (unsigned long long)ino,
+                   fs->e2fs_fsmnt);
+       cg = ino_to_cg(fs, ino);
+       error = bread(pip->i_devvp,
+               fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap),
+               (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (0);
+       }
+       ibp = (char *)bp->b_data;
+       ino = (ino - 1) % fs->e2fs.e2fs_ipg;
+       if (isclr(ibp, ino)) {
+               printf("dev = 0x%llx, ino = %llu, fs = %s\n",
+                   (unsigned long long)pip->i_dev,
+                   (unsigned long long)ino, fs->e2fs_fsmnt);
+               if (fs->e2fs_ronly == 0)
+                       panic("ifree: freeing free inode");
+       }
+       clrbit(ibp, ino);
+       fs->e2fs.e2fs_ficount++;
+       fs->e2fs_gd[cg].ext2bgd_nifree++;
+       if ((mode & IFMT) == IFDIR) {
+               fs->e2fs_gd[cg].ext2bgd_ndirs--;
+       }
+       fs->e2fs_fmod = 1;
+       bdwrite(bp);
+       return (0);
+}
+
+/*
+ * Find a block in the specified cylinder group.
+ *
+ * It is a panic if a request is made to find a block when none are
+ * available.
+ */
+
+static daddr_t
+ext2fs_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref)
+{
+       int start, len, loc, i, map;
+
+       /*
+        * find the fragment by searching through the free block
+        * map for an appropriate bit pattern
+        */
+       if (bpref)
+               start = dtogd(fs, bpref) / NBBY;
+       else
+               start = 0;
+       len = howmany(fs->e2fs.e2fs_fpg, NBBY) - start;
+       loc = skpc(0xff, len, &bbp[start]);
+       if (loc == 0) {
+               len = start + 1;
+               start = 0;
+               loc = skpc(0xff, len, &bbp[start]);
+               if (loc == 0) {
+                       printf("start = %d, len = %d, fs = %s\n",
+                               start, len, fs->e2fs_fsmnt);
+                       panic("ext2fs_alloccg: map corrupted");
+                       /* NOTREACHED */
+               }
+       }
+       i = start + len - loc;
+       map = bbp[i] ^ 0xff;
+       if (map == 0) {
+               printf("fs = %s\n", fs->e2fs_fsmnt);
+               panic("ext2fs_mapsearch: block not in map");
+       }
+       return i * NBBY + ffs(map) - 1;
+}
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ *     fs: error message
+ */
+static void
+ext2fs_fserr(struct m_ext2fs *fs, u_int uid, const char *cp)
+{
+
+       log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->e2fs_fsmnt, cp);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_balloc.c b/sys/ufs/ext2fs/ext2fs_balloc.c
new file mode 100644 (file)
index 0000000..6564bf9
--- /dev/null
@@ -0,0 +1,403 @@
+/*     $NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $        */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_balloc.c        8.4 (Berkeley) 9/23/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ffs_balloc.c        8.4 (Berkeley) 9/23/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_balloc.c,v 1.34 2009/10/19 18:41:17 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_uvmhist.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <uvm/uvm.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+/*
+ * Balloc defines the structure of file system storage
+ * by allocating the physical blocks on a device given
+ * the inode and the logical block number in a file.
+ */
+int
+ext2fs_balloc(struct inode *ip, daddr_t bn, int size,
+    kauth_cred_t cred, struct buf **bpp, int flags)
+{
+       struct m_ext2fs *fs;
+       daddr_t nb;
+       struct buf *bp, *nbp;
+       struct vnode *vp = ITOV(ip);
+       struct indir indirs[NIADDR + 2];
+       daddr_t newb, lbn, pref;
+       int32_t *bap;   /* XXX ondisk32 */
+       int num, i, error;
+       u_int deallocated;
+       daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
+       int32_t *allocib;       /* XXX ondisk32 */
+       int unwindidx = -1;
+       UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist);
+
+       UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0);
+
+       if (bpp != NULL) {
+               *bpp = NULL;
+       }
+       if (bn < 0)
+               return (EFBIG);
+       fs = ip->i_e2fs;
+       lbn = bn;
+
+       /*
+        * The first NDADDR blocks are direct blocks
+        */
+       if (bn < NDADDR) {
+               /* XXX ondisk32 */
+               nb = fs2h32(ip->i_e2fs_blocks[bn]);
+               if (nb != 0) {
+
+                       /*
+                        * the block is already allocated, just read it.
+                        */
+
+                       if (bpp != NULL) {
+                               error = bread(vp, bn, fs->e2fs_bsize, NOCRED,
+                                             B_MODIFY, &bp);
+                               if (error) {
+                                       brelse(bp, 0);
+                                       return (error);
+                               }
+                               *bpp = bp;
+                       }
+                       return (0);
+               }
+
+               /*
+                * allocate a new direct block.
+                */
+
+               error = ext2fs_alloc(ip, bn,
+                   ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]),
+                   cred, &newb);
+               if (error)
+                       return (error);
+               ip->i_e2fs_last_lblk = lbn;
+               ip->i_e2fs_last_blk = newb;
+               /* XXX ondisk32 */
+               ip->i_e2fs_blocks[bn] = h2fs32((int32_t)newb);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               if (bpp != NULL) {
+                       bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0);
+                       bp->b_blkno = fsbtodb(fs, newb);
+                       if (flags & B_CLRBUF)
+                               clrbuf(bp);
+                       *bpp = bp;
+               }
+               return (0);
+       }
+       /*
+        * Determine the number of levels of indirection.
+        */
+       pref = 0;
+       if ((error = ufs_getlbns(vp, bn, indirs, &num)) != 0)
+               return(error);
+#ifdef DIAGNOSTIC
+       if (num < 1)
+               panic ("ext2fs_balloc: ufs_getlbns returned indirect block\n");
+#endif
+       /*
+        * Fetch the first indirect block allocating if necessary.
+        */
+       --num;
+       /* XXX ondisk32 */
+       nb = fs2h32(ip->i_e2fs_blocks[NDADDR + indirs[0].in_off]);
+       allocib = NULL;
+       allocblk = allociblk;
+       if (nb == 0) {
+               pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0);
+               error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+               if (error)
+                       return (error);
+               nb = newb;
+               *allocblk++ = nb;
+               ip->i_e2fs_last_blk = newb;
+               bp = getblk(vp, indirs[1].in_lbn, fs->e2fs_bsize, 0, 0);
+               bp->b_blkno = fsbtodb(fs, newb);
+               clrbuf(bp);
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(bp)) != 0)
+                       goto fail;
+               unwindidx = 0;
+               allocib = &ip->i_e2fs_blocks[NDADDR + indirs[0].in_off];
+               /* XXX ondisk32 */
+               *allocib = h2fs32((int32_t)newb);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+       /*
+        * Fetch through the indirect blocks, allocating as necessary.
+        */
+       for (i = 1;;) {
+               error = bread(vp,
+                   indirs[i].in_lbn, (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               bap = (int32_t *)bp->b_data;    /* XXX ondisk32 */
+               nb = fs2h32(bap[indirs[i].in_off]);
+               if (i == num)
+                       break;
+               i++;
+               if (nb != 0) {
+                       brelse(bp, 0);
+                       continue;
+               }
+               pref = ext2fs_blkpref(ip, lbn, 0, (int32_t *)0);
+               error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               ip->i_e2fs_last_blk = newb;
+               nbp = getblk(vp, indirs[i].in_lbn, fs->e2fs_bsize, 0, 0);
+               nbp->b_blkno = fsbtodb(fs, nb);
+               clrbuf(nbp);
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(nbp)) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               if (unwindidx < 0)
+                       unwindidx = i - 1;
+               /* XXX ondisk32 */
+               bap[indirs[i - 1].in_off] = h2fs32((int32_t)nb);
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+       }
+       /*
+        * Get the data block, allocating if necessary.
+        */
+       if (nb == 0) {
+               pref = ext2fs_blkpref(ip, lbn, indirs[num].in_off, &bap[0]);
+               error = ext2fs_alloc(ip, lbn, pref, cred, &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               ip->i_e2fs_last_lblk = lbn;
+               ip->i_e2fs_last_blk = newb;
+               /* XXX ondisk32 */
+               bap[indirs[num].in_off] = h2fs32((int32_t)nb);
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+               if (bpp != NULL) {
+                       nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
+                       nbp->b_blkno = fsbtodb(fs, nb);
+                       if (flags & B_CLRBUF)
+                               clrbuf(nbp);
+                       *bpp = nbp;
+               }
+               return (0);
+       }
+       brelse(bp, 0);
+       if (bpp != NULL) {
+               if (flags & B_CLRBUF) {
+                       error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED,
+                                     B_MODIFY, &nbp);
+                       if (error) {
+                               brelse(nbp, 0);
+                               goto fail;
+                       }
+               } else {
+                       nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
+                       nbp->b_blkno = fsbtodb(fs, nb);
+               }
+               *bpp = nbp;
+       }
+       return (0);
+fail:
+       /*
+        * If we have failed part way through block allocation, we
+        * have to deallocate any indirect blocks that we have allocated.
+        */
+       for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+               ext2fs_blkfree(ip, *blkp);
+               deallocated += fs->e2fs_bsize;
+       }
+       if (unwindidx >= 0) {
+               if (unwindidx == 0) {
+                       *allocib = 0;
+               } else {
+                       int r;
+
+                       r = bread(vp, indirs[unwindidx].in_lbn,
+                           (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+                       if (r) {
+                               panic("Could not unwind indirect block, error %d", r);
+                               brelse(bp, 0);
+                       } else {
+                               bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+                               bap[indirs[unwindidx].in_off] = 0;
+                               if (flags & B_SYNC)
+                                       bwrite(bp);
+                               else
+                                       bdwrite(bp);
+                       }
+               }
+               for (i = unwindidx + 1; i <= num; i++) {
+                       bp = getblk(vp, indirs[i].in_lbn, (int)fs->e2fs_bsize,
+                           0, 0);
+                       brelse(bp, BC_INVAL);
+               }
+       }
+       if (deallocated) {
+               ip->i_e2fs_nblock -= btodb(deallocated);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+       return error;
+}
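
ext2fs_balloc() relies on ufs_getlbns() to classify a logical block as direct or as needing one, two, or three levels of indirection, and the indirs[] array it fills drives the allocation loop above. The following simplified sketch shows only that classification, under assumed geometry (12 direct slots and 256 pointers per indirect block, i.e. a 1 KB ext2 block; the real code derives both values from the superblock):

    #include <stdio.h>

    #define NDADDR  12              /* assumed direct slots */
    #define NINDIR  256L            /* assumed pointers per indirect block */

    /* How many levels of indirection does logical block bn need? */
    static int
    levels_of_indirection(long bn)
    {
            long span = NINDIR;     /* blocks reachable at this level */
            int level;

            if (bn < NDADDR)
                    return 0;       /* direct block */
            bn -= NDADDR;
            for (level = 1; level <= 3; level++) {
                    if (bn < span)
                            return level;
                    bn -= span;
                    span *= NINDIR;
            }
            return -1;              /* beyond triple indirect */
    }

    int
    main(void)
    {
            long bns[] = { 0, 11, 12, 267, 268, 65803, 65804 };
            unsigned i;

            for (i = 0; i < sizeof(bns) / sizeof(bns[0]); i++)
                    printf("bn %6ld needs %d level(s)\n", bns[i],
                        levels_of_indirection(bns[i]));
            return 0;
    }

With these assumptions, blocks 0-11 are direct, 12-267 are single indirect, 268-65803 double, and 65804 onward triple.
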
+
+int
+ext2fs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+    kauth_cred_t cred)
+{
+       struct inode *ip = VTOI(vp);
+       struct m_ext2fs *fs = ip->i_e2fs;
+       int error, delta, bshift, bsize;
+       UVMHIST_FUNC("ext2fs_gop_alloc"); UVMHIST_CALLED(ubchist);
+
+       bshift = fs->e2fs_bshift;
+       bsize = 1 << bshift;
+
+       delta = off & (bsize - 1);
+       off -= delta;
+       len += delta;
+
+       while (len > 0) {
+               bsize = min(bsize, len);
+               UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x",
+                           off, len, bsize, 0);
+
+               error = ext2fs_balloc(ip, lblkno(fs, off), bsize, cred,
+                   NULL, flags);
+               if (error) {
+                       UVMHIST_LOG(ubchist, "error %d", error, 0,0,0);
+                       return error;
+               }
+
+               /*
+                * increase file size now, ext2fs_balloc() requires that
+                * EOF be up-to-date before each call.
+                */
+
+               if (ext2fs_size(ip) < off + bsize) {
+                       UVMHIST_LOG(ubchist, "old 0x%lx%8lx new 0x%lx%8lx",
+                           /* Note that arguments are always cast to u_long. */
+                                   ext2fs_size(ip) >> 32,
+                                   ext2fs_size(ip) & 0xffffffff,
+                                   (off + bsize) >> 32,
+                                   (off + bsize) & 0xffffffff);
+                       error = ext2fs_setsize(ip, off + bsize);
+                       if (error) {
+                               UVMHIST_LOG(ubchist, "error %d", error, 0,0,0);
+                               return error;
+                       }
+               }
+
+               off += bsize;
+               len -= bsize;
+       }
+       return 0;
+}
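
The delta arithmetic at the top of ext2fs_gop_alloc() rounds the requested byte range out to block boundaries so the loop can call ext2fs_balloc() one whole block at a time. A stand-alone illustration, assuming a 1 KB block size:

    #include <stdio.h>

    int
    main(void)
    {
            int bshift = 10;                /* assumed: 1 KB blocks */
            long bsize = 1L << bshift;
            long off = 2500, len = 300;     /* arbitrary unaligned request */

            long delta = off & (bsize - 1); /* bytes past the boundary */
            off -= delta;
            len += delta;

            /* Same byte range, now starting on a block boundary. */
            printf("aligned: off = %ld, len = %ld\n", off, len);
            /* prints: aligned: off = 2048, len = 752 */
            return 0;
    }
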
diff --git a/sys/ufs/ext2fs/ext2fs_bmap.c b/sys/ufs/ext2fs/ext2fs_bmap.c
new file mode 100644 (file)
index 0000000..5336fdd
--- /dev/null
@@ -0,0 +1,269 @@
+/*     $NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $  */
+
+/*
+ * Copyright (c) 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_bmap.c  8.6 (Berkeley) 1/21/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ufs_bmap.c  8.6 (Berkeley) 1/21/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_bmap.c,v 1.25 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+static int ext2fs_bmaparray(struct vnode *, daddr_t, daddr_t *,
+                               struct indir *, int *, int *);
+
+#define        is_sequential(ump, a, b)        ((b) == (a) + ump->um_seqinc)
+
+/*
+ * Bmap converts the logical block number of a file to its physical block
+ * number on the disk. The conversion is done by using the logical block
+ * number to index into the array of block pointers described by the dinode.
+ */
+int
+ext2fs_bmap(void *v)
+{
+       struct vop_bmap_args /* {
+               struct vnode *a_vp;
+               daddr_t  a_bn;
+               struct vnode **a_vpp;
+               daddr_t *a_bnp;
+               int *a_runp;
+       } */ *ap = v;
+       /*
+        * Check for underlying vnode requests and ensure that logical
+        * to physical mapping is requested.
+        */
+       if (ap->a_vpp != NULL)
+               *ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
+       if (ap->a_bnp == NULL)
+               return (0);
+
+       return (ext2fs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
+               ap->a_runp));
+}
+
+/*
+ * Indirect blocks are now on the vnode for the file.  They are given negative
+ * logical block numbers.  Indirect blocks are addressed by the negative
+ * address of the first data block to which they point.  Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point.  Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ext2fs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
+ */
+
+int
+ext2fs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
+               int *nump, int *runp)
+{
+       struct inode *ip;
+       struct buf *bp, *cbp;
+       struct ufsmount *ump;
+       struct mount *mp;
+       struct indir a[NIADDR+1], *xap;
+       daddr_t daddr;
+       daddr_t metalbn;
+       int error, maxrun = 0, num;
+
+       ip = VTOI(vp);
+       mp = vp->v_mount;
+       ump = ip->i_ump;
+#ifdef DIAGNOSTIC
+       if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL))
+               panic("ext2fs_bmaparray: invalid arguments");
+#endif
+
+       if (runp) {
+               /*
+                * XXX
+                * If MAXBSIZE is the largest transfer the disks can handle,
+                * we probably want maxrun to be 1 block less so that we
+                * don't create a block larger than the device can handle.
+                */
+               *runp = 0;
+               maxrun = MAXBSIZE / mp->mnt_stat.f_iosize - 1;
+       }
+
+       if (bn >= 0 && bn < NDADDR) {
+               /* XXX ondisk32 */
+               *bnp = blkptrtodb(ump, fs2h32(ip->i_e2fs_blocks[bn]));
+               if (*bnp == 0)
+                       *bnp = -1;
+               else if (runp)
+                       /* XXX ondisk32 */
+                       for (++bn; bn < NDADDR && *runp < maxrun &&
+                               is_sequential(ump, (daddr_t)fs2h32(ip->i_e2fs_blocks[bn - 1]),
+                                                         (daddr_t)fs2h32(ip->i_e2fs_blocks[bn]));
+                               ++bn, ++*runp);
+               return (0);
+       }
+
+       xap = ap == NULL ? a : ap;
+       if (!nump)
+               nump = &num;
+       if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
+               return (error);
+
+       num = *nump;
+
+       /* Get disk address out of indirect block array */
+       /* XXX ondisk32 */
+       daddr = fs2h32(ip->i_e2fs_blocks[NDADDR + xap->in_off]);
+
+#ifdef DIAGNOSTIC
+       if (num > NIADDR + 1 || num < 1) {
+               printf("ext2fs_bmaparray: num=%d\n", num);
+               panic("ext2fs_bmaparray: num");
+       }
+#endif
+       for (bp = NULL, ++xap; --num; ++xap) {
+               /*
+                * Exit the loop if there is no disk address assigned yet and
+                * the indirect block isn't in the cache, or if we were
+                * looking for an indirect block and we've found it.
+                */
+
+               metalbn = xap->in_lbn;
+               if (metalbn == bn)
+                       break;
+               if (daddr == 0) {
+                       mutex_enter(&bufcache_lock);
+                       cbp = incore(vp, metalbn);
+                       mutex_exit(&bufcache_lock);
+                       if (cbp == NULL)
+                               break;
+               }
+               /*
+                * If we get here, we've either got the block in the cache
+                * or we have a disk address for it, go fetch it.
+                */
+               if (bp)
+                       brelse(bp, 0);
+
+               xap->in_exists = 1;
+               bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
+               if (bp == NULL) {
+
+                       /*
+                        * getblk() above returns NULL only if we are the
+                        * pagedaemon.  See the implementation of getblk
+                        * for details.
+                        */
+
+                       return (ENOMEM);
+               }
+               if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+                       trace(TR_BREADHIT, pack(vp, size), metalbn);
+               }
+#ifdef DIAGNOSTIC
+               else if (!daddr)
+                       panic("ext2fs_bmaparry: indirect block not in cache");
+#endif
+               else {
+                       trace(TR_BREADMISS, pack(vp, size), metalbn);
+                       bp->b_blkno = blkptrtodb(ump, daddr);
+                       bp->b_flags |= B_READ;
+                       VOP_STRATEGY(vp, bp);
+                       curlwp->l_ru.ru_inblock++;      /* XXX */
+                       if ((error = biowait(bp)) != 0) {
+                               brelse(bp, 0);
+                               return (error);
+                       }
+               }
+
+               /* XXX ondisk32 */
+               daddr = fs2h32(((int32_t *)bp->b_data)[xap->in_off]);
+               if (num == 1 && daddr && runp)
+                       /* XXX ondisk32 */
+                       for (bn = xap->in_off + 1;
+                               bn < MNINDIR(ump) && *runp < maxrun &&
+                               is_sequential(ump, ((int32_t *)bp->b_data)[bn - 1],
+                               ((int32_t *)bp->b_data)[bn]);
+                               ++bn, ++*runp);
+       }
+       if (bp)
+               brelse(bp, 0);
+
+       daddr = blkptrtodb(ump, daddr);
+       *bnp = daddr == 0 ? -1 : daddr;
+       return (0);
+}
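
The comment above ext2fs_bmaparray() describes indirect blocks living at negative logical block numbers in the vnode's block space. With assumed values NDADDR = 12 and NINDIR = 256 (a 1 KB block), the sketch below prints where each level lands; the same arithmetic reappears as the indir_lbn[] initialization in ext2fs_truncate() further down.

    #include <stdio.h>

    #define NDADDR  12              /* assumed direct slots */
    #define NINDIR  256L            /* assumed pointers per indirect block */

    int
    main(void)
    {
            long single = -NDADDR;
            long dbl    = single - NINDIR - 1;
            long triple = dbl - NINDIR * NINDIR - 1;

            printf("single indirect block lbn: %ld\n", single);  /*    -12 */
            printf("double indirect block lbn: %ld\n", dbl);     /*   -269 */
            printf("triple indirect block lbn: %ld\n", triple);  /* -65806 */
            return 0;
    }
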
diff --git a/sys/ufs/ext2fs/ext2fs_bswap.c b/sys/ufs/ext2fs/ext2fs_bswap.c
new file mode 100644 (file)
index 0000000..ba0ddc4
--- /dev/null
@@ -0,0 +1,121 @@
+/*     $NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_bswap.c,v 1.16 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/types.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_dinode.h>
+
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+/* These functions are only needed if native byte order is not little endian */
+#if BYTE_ORDER == BIG_ENDIAN
+void
+e2fs_sb_bswap(struct ext2fs *old, struct ext2fs *new)
+{
+
+       /* preserve unused fields */
+       memcpy(new, old, sizeof(struct ext2fs));
+       new->e2fs_icount        =       bswap32(old->e2fs_icount);
+       new->e2fs_bcount        =       bswap32(old->e2fs_bcount);
+       new->e2fs_rbcount       =       bswap32(old->e2fs_rbcount);
+       new->e2fs_fbcount       =       bswap32(old->e2fs_fbcount);
+       new->e2fs_ficount       =       bswap32(old->e2fs_ficount);
+       new->e2fs_first_dblock  =       bswap32(old->e2fs_first_dblock);
+       new->e2fs_log_bsize     =       bswap32(old->e2fs_log_bsize);
+       new->e2fs_fsize         =       bswap32(old->e2fs_fsize);
+       new->e2fs_bpg           =       bswap32(old->e2fs_bpg);
+       new->e2fs_fpg           =       bswap32(old->e2fs_fpg);
+       new->e2fs_ipg           =       bswap32(old->e2fs_ipg);
+       new->e2fs_mtime         =       bswap32(old->e2fs_mtime);
+       new->e2fs_wtime         =       bswap32(old->e2fs_wtime);
+       new->e2fs_mnt_count     =       bswap16(old->e2fs_mnt_count);
+       new->e2fs_max_mnt_count =       bswap16(old->e2fs_max_mnt_count);
+       new->e2fs_magic         =       bswap16(old->e2fs_magic);
+       new->e2fs_state         =       bswap16(old->e2fs_state);
+       new->e2fs_beh           =       bswap16(old->e2fs_beh);
+       new->e2fs_minrev        =       bswap16(old->e2fs_minrev);
+       new->e2fs_lastfsck      =       bswap32(old->e2fs_lastfsck);
+       new->e2fs_fsckintv      =       bswap32(old->e2fs_fsckintv);
+       new->e2fs_creator       =       bswap32(old->e2fs_creator);
+       new->e2fs_rev           =       bswap32(old->e2fs_rev);
+       new->e2fs_ruid          =       bswap16(old->e2fs_ruid);
+       new->e2fs_rgid          =       bswap16(old->e2fs_rgid);
+       new->e2fs_first_ino     =       bswap32(old->e2fs_first_ino);
+       new->e2fs_inode_size    =       bswap16(old->e2fs_inode_size);
+       new->e2fs_block_group_nr =      bswap16(old->e2fs_block_group_nr);
+       new->e2fs_features_compat =     bswap32(old->e2fs_features_compat);
+       new->e2fs_features_incompat =   bswap32(old->e2fs_features_incompat);
+       new->e2fs_features_rocompat =   bswap32(old->e2fs_features_rocompat);
+       new->e2fs_algo          =       bswap32(old->e2fs_algo);
+       new->e2fs_reserved_ngdb =       bswap16(old->e2fs_reserved_ngdb);
+}
+
+void e2fs_cg_bswap(struct ext2_gd *old, struct ext2_gd *new, int size)
+{
+       int i;
+
+       for (i = 0; i < (size / (int)sizeof(struct ext2_gd)); i++) {
+               new[i].ext2bgd_b_bitmap = bswap32(old[i].ext2bgd_b_bitmap);
+               new[i].ext2bgd_i_bitmap = bswap32(old[i].ext2bgd_i_bitmap);
+               new[i].ext2bgd_i_tables = bswap32(old[i].ext2bgd_i_tables);
+               new[i].ext2bgd_nbfree   = bswap16(old[i].ext2bgd_nbfree);
+               new[i].ext2bgd_nifree   = bswap16(old[i].ext2bgd_nifree);
+               new[i].ext2bgd_ndirs    = bswap16(old[i].ext2bgd_ndirs);
+       }
+}
+
+void e2fs_i_bswap(struct ext2fs_dinode *old, struct ext2fs_dinode *new)
+{
+
+       new->e2di_mode          =       bswap16(old->e2di_mode);
+       new->e2di_uid           =       bswap16(old->e2di_uid);
+       new->e2di_gid           =       bswap16(old->e2di_gid);
+       new->e2di_nlink         =       bswap16(old->e2di_nlink);
+       new->e2di_size          =       bswap32(old->e2di_size);
+       new->e2di_atime         =       bswap32(old->e2di_atime);
+       new->e2di_ctime         =       bswap32(old->e2di_ctime);
+       new->e2di_mtime         =       bswap32(old->e2di_mtime);
+       new->e2di_dtime         =       bswap32(old->e2di_dtime);
+       new->e2di_nblock        =       bswap32(old->e2di_nblock);
+       new->e2di_flags         =       bswap32(old->e2di_flags);
+       new->e2di_gen           =       bswap32(old->e2di_gen);
+       new->e2di_facl          =       bswap32(old->e2di_facl);
+       new->e2di_dacl          =       bswap32(old->e2di_dacl);
+       new->e2di_faddr         =       bswap32(old->e2di_faddr);
+       new->e2di_uid_high      =       bswap16(old->e2di_uid_high);
+       new->e2di_gid_high      =       bswap16(old->e2di_gid_high);
+       memcpy(&new->e2di_blocks[0], &old->e2di_blocks[0],
+           (NDADDR + NIADDR) * sizeof(uint32_t));
+}
+#endif
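
Everything in ext2fs_bswap.c exists because ext2 keeps its on-disk metadata little-endian, so a big-endian host must swap every multi-byte field on the way in and out; the fs2h32()/h2fs32() macros used throughout the other files reduce to these swaps on such hosts and to no-ops otherwise. A small user-land sketch of the primitives (local equivalents of the kernel's bswap16()/bswap32(); 0xEF53 is the usual ext2 superblock magic):

    #include <stdio.h>
    #include <stdint.h>

    /* Local equivalents of the kernel's bswap16()/bswap32(). */
    static uint16_t
    my_bswap16(uint16_t x)
    {
            return (uint16_t)((x << 8) | (x >> 8));
    }

    static uint32_t
    my_bswap32(uint32_t x)
    {
            return ((x & 0x000000ffU) << 24) | ((x & 0x0000ff00U) << 8) |
                   ((x & 0x00ff0000U) >> 8)  | ((x & 0xff000000U) >> 24);
    }

    int
    main(void)
    {
            /*
             * The ext2 magic is stored on disk as the bytes 53 ef.  A
             * big-endian CPU loading them as a native 16-bit word sees
             * 0x53ef; one bswap16 restores the expected 0xef53.
             */
            uint16_t as_read = 0x53ef;

            printf("raw 0x%04x -> swapped 0x%04x\n",
                as_read, my_bswap16(as_read));
            printf("bswap32(0x12345678) = 0x%08x\n",
                my_bswap32(0x12345678U));
            return 0;
    }
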
diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c
new file mode 100644 (file)
index 0000000..0d52fb4
--- /dev/null
@@ -0,0 +1,558 @@
+/*     $NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $        */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ffs_inode.c 8.8 (Berkeley) 10/19/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_inode.c,v 1.74 2011/06/16 09:21:03 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/trace.h>
+#include <sys/resourcevar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+extern int prtactive;
+
+static int ext2fs_indirtrunc(struct inode *, daddr_t, daddr_t,
+                                 daddr_t, int, long *);
+
+/*
+ * Get the size of an inode.
+ */
+uint64_t
+ext2fs_size(struct inode *ip)
+{
+       uint64_t size = ip->i_e2fs_size;
+
+       if ((ip->i_e2fs_mode & IFMT) == IFREG)
+               size |= (uint64_t)ip->i_e2fs_dacl << 32;
+       return size;
+}
+
+int
+ext2fs_setsize(struct inode *ip, uint64_t size)
+{
+       if ((ip->i_e2fs_mode & IFMT) == IFREG ||
+           ip->i_e2fs_mode == 0) {
+               ip->i_e2fs_dacl = size >> 32;
+               if (size >= 0x80000000U) {
+                       struct m_ext2fs *fs = ip->i_e2fs;
+
+                       if (fs->e2fs.e2fs_rev <= E2FS_REV0) {
+                               /* Linux automagically upgrades to REV1 here! */
+                               return EFBIG;
+                       }
+                       if (!(fs->e2fs.e2fs_features_rocompat
+                           & EXT2F_ROCOMPAT_LARGEFILE)) {
+                               fs->e2fs.e2fs_features_rocompat |=
+                                   EXT2F_ROCOMPAT_LARGEFILE;
+                               fs->e2fs_fmod = 1;
+                       }
+               }
+       } else if (size >= 0x80000000U)
+               return EFBIG;
+
+       ip->i_e2fs_size = size;
+
+       return 0;
+}
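
ext2fs_size() and ext2fs_setsize() above encode a regular file's 64-bit length across two 32-bit on-disk words: the low word in the size field and the high word in the otherwise-unused directory-ACL field (which is why the large-file case flips on the LARGEFILE rocompat feature). A sketch of the same split, with hypothetical stand-in field names:

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for the two on-disk words. */
    struct mini_dinode {
            uint32_t size_lo;       /* e2di_size analogue: low 32 bits */
            uint32_t dacl;          /* e2di_dacl analogue: high 32 bits */
    };

    static uint64_t
    get_size(const struct mini_dinode *di)
    {
            return (uint64_t)di->dacl << 32 | di->size_lo;
    }

    static void
    set_size(struct mini_dinode *di, uint64_t size)
    {
            di->size_lo = (uint32_t)size;
            di->dacl = (uint32_t)(size >> 32);
    }

    int
    main(void)
    {
            struct mini_dinode di;

            set_size(&di, 5368709120ULL);   /* a 5 GB file */
            printf("lo=0x%08x hi=0x%08x -> %llu bytes\n", di.size_lo,
                di.dacl, (unsigned long long)get_size(&di));
            return 0;
    }
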
+
+/*
+ * Last reference to an inode.  If necessary, write or delete it.
+ */
+int
+ext2fs_inactive(void *v)
+{
+       struct vop_inactive_args /* {
+               struct vnode *a_vp;
+               bool *a_recycle;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       int error = 0;
+
+       if (prtactive && vp->v_usecount != 0)
+               vprint("ext2fs_inactive: pushing active", vp);
+       /* Get rid of inodes related to stale file handles. */
+       if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0)
+               goto out;
+
+       error = 0;
+       if (ip->i_e2fs_nlink == 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+               /* Defer final inode free and update to reclaim.*/
+               if (ext2fs_size(ip) != 0) {
+                       error = ext2fs_truncate(vp, (off_t)0, 0, NOCRED);
+               }
+               ip->i_e2fs_dtime = time_second;
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               ip->i_omode = 1;
+       }
+       if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
+               ext2fs_update(vp, NULL, NULL, 0);
+       }
+out:
+       /*
+        * If we are done with the inode, reclaim it
+        * so that it can be reused immediately.
+        */
+       *ap->a_recycle = (ip->i_e2fs_dtime != 0);
+       VOP_UNLOCK(vp);
+       return (error);
+}
+
+
+/*
+ * Update the access, modified, and inode change times as specified by the
+ * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. IN_MODIFIED is
+ * used to specify that the inode needs to be updated but that the times have
+ * already been set. The access and modified times are taken from the second
+ * and third parameters; the inode change time is always taken from the current
+ * time. If UPDATE_WAIT or UPDATE_DIROP is set, then wait for the disk
+ * write of the inode to complete.
+ */
+int
+ext2fs_update(struct vnode *vp, const struct timespec *acc,
+    const struct timespec *mod, int updflags)
+{
+       struct m_ext2fs *fs;
+       struct buf *bp;
+       struct inode *ip;
+       int error;
+       void *cp;
+       int flags;
+
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+               return (0);
+       ip = VTOI(vp);
+       EXT2FS_ITIMES(ip, acc, mod, NULL);
+       if (updflags & UPDATE_CLOSE)
+               flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
+       else
+               flags = ip->i_flag & IN_MODIFIED;
+       if (flags == 0)
+               return (0);
+       fs = ip->i_e2fs;
+
+       error = bread(ip->i_devvp,
+                         fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+                         (int)fs->e2fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
+       cp = (char *)bp->b_data +
+           (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs));
+       e2fs_isave(ip->i_din.e2fs_din, (struct ext2fs_dinode *)cp);
+       if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) != 0 &&
+           (flags & IN_MODIFIED) != 0 &&
+           (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
+               return (bwrite(bp));
+       else {
+               bdwrite(bp);
+               return (0);
+       }
+}
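
The bread() in ext2fs_update() finds the on-disk inode through ino_to_fsba() and ino_to_fsbo(): first the group, then the block within that group's inode table, then the slot within the block. A simplified sketch of that placement arithmetic under assumed geometry (1 KB blocks, 128-byte rev-0 inodes, 2000 inodes per group; the superblock supplies the real numbers and the group descriptor supplies each table's starting block, both omitted here):

    #include <stdio.h>
    #include <stdint.h>

    #define BSIZE           1024                    /* assumed block size */
    #define INODE_SIZE      128                     /* assumed inode size */
    #define IPG             2000                    /* assumed inodes/group */
    #define INOPB           (BSIZE / INODE_SIZE)    /* 8 inodes per block */

    int
    main(void)
    {
            uint32_t ino = 4011;            /* arbitrary; inodes start at 1 */
            uint32_t cg  = (ino - 1) / IPG; /* which group */
            uint32_t idx = (ino - 1) % IPG; /* index within the group */
            uint32_t blk = idx / INOPB;     /* block within the inode table */
            uint32_t off = (idx % INOPB) * INODE_SIZE;

            printf("ino %u: group %u, table block %u, byte offset %u\n",
                ino, cg, blk, off);
            return 0;
    }
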
+
+#define        SINGLE  0       /* index of single indirect block */
+#define        DOUBLE  1       /* index of double indirect block */
+#define        TRIPLE  2       /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+int
+ext2fs_truncate(struct vnode *ovp, off_t length, int ioflag,
+    kauth_cred_t cred)
+{
+       daddr_t lastblock;
+       struct inode *oip = VTOI(ovp);
+       daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
+       /* XXX ondisk32 */
+       int32_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
+       struct m_ext2fs *fs;
+       int offset, size, level;
+       long count, blocksreleased = 0;
+       int i, nblocks;
+       int error, allerror = 0;
+       off_t osize;
+       int sync;
+       struct ufsmount *ump = oip->i_ump;
+
+       if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+           ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+               return 0;
+       }
+
+       if (length < 0)
+               return (EINVAL);
+
+       if (ovp->v_type == VLNK &&
+           (ext2fs_size(oip) < ump->um_maxsymlinklen ||
+            (ump->um_maxsymlinklen == 0 && oip->i_e2fs_nblock == 0))) {
+               KDASSERT(length == 0);
+               memset((char *)&oip->i_din.e2fs_din->e2di_shortlink, 0,
+                       (u_int)ext2fs_size(oip));
+               (void)ext2fs_setsize(oip, 0);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (ext2fs_update(ovp, NULL, NULL, 0));
+       }
+       if (ext2fs_size(oip) == length) {
+               /* still do a uvm_vnp_setsize() as writesize may be larger */
+               uvm_vnp_setsize(ovp, length);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (ext2fs_update(ovp, NULL, NULL, 0));
+       }
+       fs = oip->i_e2fs;
+       if (length > ump->um_maxfilesize)
+               return (EFBIG);
+
+       osize = ext2fs_size(oip);
+
+       /*
+        * Lengthen the size of the file. We must ensure that the
+        * last byte of the file is allocated. Since the smallest
+        * value of osize is 0, length will be at least 1.
+        */
+       if (osize < length) {
+               uvm_vnp_setwritesize(ovp, length);
+               error = ufs_balloc_range(ovp, length - 1, 1, cred,
+                   ioflag & IO_SYNC ? B_SYNC : 0);
+               if (error) {
+                       (void) ext2fs_truncate(ovp, osize, ioflag & IO_SYNC,
+                           cred);
+                       return (error);
+               }
+               uvm_vnp_setsize(ovp, length);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               KASSERT(error || ovp->v_size == ext2fs_size(oip));
+               return (ext2fs_update(ovp, NULL, NULL, 0));
+       }
+       /*
+        * Shorten the size of the file. If the file is not being
+        * truncated to a block boundary, the contents of the
+        * partial block following the end of the file must be
+        * zeroed in case it ever becomes accessible again because
+        * of subsequent file growth.
+        */
+       offset = blkoff(fs, length);
+       if (offset != 0) {
+               size = fs->e2fs_bsize;
+
+               /* XXXUBC we should handle more than just VREG */
+               ubc_zerorange(&ovp->v_uobj, length, size - offset,
+                   UBC_UNMAP_FLAG(ovp));
+       }
+       (void)ext2fs_setsize(oip, length);
+       uvm_vnp_setsize(ovp, length);
+       /*
+        * Calculate index into inode's block list of
+        * last direct and indirect blocks (if any)
+        * which we want to keep.  Lastblock is -1 when
+        * the file is truncated to 0.
+        */
+       lastblock = lblkno(fs, length + fs->e2fs_bsize - 1) - 1;
+       lastiblock[SINGLE] = lastblock - NDADDR;
+       lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+       lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
+       nblocks = btodb(fs->e2fs_bsize);
+       /*
+        * Update file and block pointers on disk before we start freeing
+        * blocks.  If we crash before free'ing blocks below, the blocks
+        * will be returned to the free list.  lastiblock values are also
+        * normalized to -1 for calls to ext2fs_indirtrunc below.
+        */
+       memcpy((void *)oldblks, (void *)&oip->i_e2fs_blocks[0], sizeof oldblks);
+       sync = 0;
+       for (level = TRIPLE; level >= SINGLE; level--) {
+               if (lastiblock[level] < 0 && oldblks[NDADDR + level] != 0) {
+                       sync = 1;
+                       oip->i_e2fs_blocks[NDADDR + level] = 0;
+                       lastiblock[level] = -1;
+               }
+       }
+       for (i = 0; i < NDADDR; i++) {
+               if (i > lastblock && oldblks[i] != 0) {
+                       sync = 1;
+                       oip->i_e2fs_blocks[i] = 0;
+               }
+       }
+       oip->i_flag |= IN_CHANGE | IN_UPDATE;
+       if (sync) {
+               error = ext2fs_update(ovp, NULL, NULL, UPDATE_WAIT);
+               if (error && !allerror)
+                       allerror = error;
+       }
+
+       /*
+        * Having written the new inode to disk, save its new configuration
+        * and put back the old block pointers long enough to process them.
+        * Note that we save the new block configuration so we can check it
+        * when we are done.
+        */
+       memcpy((void *)newblks, (void *)&oip->i_e2fs_blocks[0], sizeof newblks);
+       memcpy((void *)&oip->i_e2fs_blocks[0], (void *)oldblks, sizeof oldblks);
+
+       (void)ext2fs_setsize(oip, osize);
+       error = vtruncbuf(ovp, lastblock + 1, 0, 0);
+       if (error && !allerror)
+               allerror = error;
+
+       /*
+        * Indirect blocks first.
+        */
+       indir_lbn[SINGLE] = -NDADDR;
+       indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
+       indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
+       for (level = TRIPLE; level >= SINGLE; level--) {
+               /* XXX ondisk32 */
+               bn = fs2h32(oip->i_e2fs_blocks[NDADDR + level]);
+               if (bn != 0) {
+                       error = ext2fs_indirtrunc(oip, indir_lbn[level],
+                           fsbtodb(fs, bn), lastiblock[level], level, &count);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += count;
+                       if (lastiblock[level] < 0) {
+                               oip->i_e2fs_blocks[NDADDR + level] = 0;
+                               ext2fs_blkfree(oip, bn);
+                               blocksreleased += nblocks;
+                       }
+               }
+               if (lastiblock[level] >= 0)
+                       goto done;
+       }
+
+       /*
+        * All whole direct blocks or frags.
+        */
+       for (i = NDADDR - 1; i > lastblock; i--) {
+               /* XXX ondisk32 */
+               bn = fs2h32(oip->i_e2fs_blocks[i]);
+               if (bn == 0)
+                       continue;
+               oip->i_e2fs_blocks[i] = 0;
+               ext2fs_blkfree(oip, bn);
+               blocksreleased += btodb(fs->e2fs_bsize);
+       }
+
+done:
+#ifdef DIAGNOSTIC
+       for (level = SINGLE; level <= TRIPLE; level++)
+               if (newblks[NDADDR + level] !=
+                   oip->i_e2fs_blocks[NDADDR + level])
+                       panic("ext2fs_truncate1");
+       for (i = 0; i < NDADDR; i++)
+               if (newblks[i] != oip->i_e2fs_blocks[i])
+                       panic("ext2fs_truncate2");
+       if (length == 0 &&
+           (!LIST_EMPTY(&ovp->v_cleanblkhd) ||
+            !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+               panic("ext2fs_truncate3");
+#endif /* DIAGNOSTIC */
+       /*
+        * Put back the real size.
+        */
+       (void)ext2fs_setsize(oip, length);
+       oip->i_e2fs_nblock -= blocksreleased;
+       oip->i_flag |= IN_CHANGE;
+       KASSERT(ovp->v_type != VREG || ovp->v_size == ext2fs_size(oip));
+       return (allerror);
+}
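
The lastblock/lastiblock[] computation in ext2fs_truncate() decides which indirection levels survive: a negative lastiblock entry means that whole level (and its indirect block) can be freed. A worked example with the same assumed geometry as the earlier sketches (1 KB blocks, NDADDR = 12, NINDIR = 256):

    #include <stdio.h>

    #define BSIZE   1024L           /* assumed block size */
    #define NDADDR  12
    #define NINDIR  256L

    int
    main(void)
    {
            long length = 100 * 1024L;      /* truncate to 100 KB */
            long lastblock = (length + BSIZE - 1) / BSIZE - 1;
            long single = lastblock - NDADDR;
            long dbl    = single - NINDIR;
            long triple = dbl - NINDIR * NINDIR;

            printf("lastblock %ld, lastiblock = { %ld, %ld, %ld }\n",
                lastblock, single, dbl, triple);
            /*
             * Prints: lastblock 99, lastiblock = { 87, -169, -65705 }.
             * The double and triple entries are negative, so both of
             * those levels are freed in their entirety.
             */
            return 0;
    }
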
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn.  Blocks are free'd in LIFO order up to (but not including)
+ * lastbn.  If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
+static int
+ext2fs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
+               int level, long *countp)
+{
+       int i;
+       struct buf *bp;
+       struct m_ext2fs *fs = ip->i_e2fs;
+       int32_t *bap;   /* XXX ondisk32 */
+       struct vnode *vp;
+       daddr_t nb, nlbn, last;
+       int32_t *copy = NULL;   /* XXX ondisk32 */
+       long blkcount, factor;
+       int nblocks, blocksreleased = 0;
+       int error = 0, allerror = 0;
+
+       /*
+        * Calculate index in current block of last
+        * block to be kept.  -1 indicates the entire
+        * block so we need not calculate the index.
+        */
+       factor = 1;
+       for (i = SINGLE; i < level; i++)
+               factor *= NINDIR(fs);
+       last = lastbn;
+       if (lastbn > 0)
+               last /= factor;
+       nblocks = btodb(fs->e2fs_bsize);
+       /*
+        * Get buffer of block pointers, zero those entries corresponding
+        * to blocks to be free'd, and update on disk copy first.  Since
+        * double(triple) indirect before single(double) indirect, calls
+        * to bmap on these blocks will fail.  However, we already have
+        * the on disk address, so we have to set the b_blkno field
+        * explicitly instead of letting bread do everything for us.
+        */
+       vp = ITOV(ip);
+       bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0);
+       if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+               /* Braces must be here in case trace evaluates to nothing. */
+               trace(TR_BREADHIT, pack(vp, fs->e2fs_bsize), lbn);
+       } else {
+               trace(TR_BREADMISS, pack(vp, fs->e2fs_bsize), lbn);
+               curlwp->l_ru.ru_inblock++;      /* pay for read */
+               bp->b_flags |= B_READ;
+               if (bp->b_bcount > bp->b_bufsize)
+                       panic("ext2fs_indirtrunc: bad buffer size");
+               bp->b_blkno = dbn;
+               VOP_STRATEGY(vp, bp);
+               error = biowait(bp);
+       }
+       if (error) {
+               brelse(bp, 0);
+               *countp = 0;
+               return (error);
+       }
+
+       bap = (int32_t *)bp->b_data;    /* XXX ondisk32 */
+       if (lastbn >= 0) {
+               /* XXX ondisk32 */
+               copy = malloc(fs->e2fs_bsize, M_TEMP, M_WAITOK);
+               memcpy((void *)copy, (void *)bap, (u_int)fs->e2fs_bsize);
+               memset((void *)&bap[last + 1], 0,
+                       (u_int)(NINDIR(fs) - (last + 1)) * sizeof (uint32_t));
+               error = bwrite(bp);
+               if (error)
+                       allerror = error;
+               bap = copy;
+       }
+
+       /*
+        * Recursively free totally unused blocks.
+        */
+       for (i = NINDIR(fs) - 1,
+               nlbn = lbn + 1 - i * factor; i > last;
+               i--, nlbn += factor) {
+               /* XXX ondisk32 */
+               nb = fs2h32(bap[i]);
+               if (nb == 0)
+                       continue;
+               if (level > SINGLE) {
+                       error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+                                                  (daddr_t)-1, level - 1,
+                                                  &blkcount);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += blkcount;
+               }
+               ext2fs_blkfree(ip, nb);
+               blocksreleased += nblocks;
+       }
+
+       /*
+        * Recursively free last partial block.
+        */
+       if (level > SINGLE && lastbn >= 0) {
+               last = lastbn % factor;
+               /* XXX ondisk32 */
+               nb = fs2h32(bap[i]);
+               if (nb != 0) {
+                       error = ext2fs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+                                                  last, level - 1, &blkcount);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += blkcount;
+               }
+       }
+
+       if (copy != NULL) {
+               free(copy, M_TEMP);
+       } else {
+               brelse(bp, BC_INVAL);
+       }
+
+       *countp = blocksreleased;
+       return (allerror);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_lookup.c b/sys/ufs/ext2fs/ext2fs_lookup.c
new file mode 100644 (file)
index 0000000..eb59c45
--- /dev/null
+++ b/sys/ufs/ext2fs/ext2fs_lookup.c
@@ -0,0 +1,1079 @@
+/*     $NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $      */
+
+/*
+ * Modified for NetBSD 1.2E
+ * May 1997, Manuel Bouyer
+ * Laboratoire d'informatique de Paris VI
+ */
+/*
+ *  modified for Lites 1.1
+ *
+ *  Aug 1995, Godmar Back (gback@cs.utah.edu)
+ *  University of Utah, Department of Computer Science
+ */
+/*
+ * Copyright (c) 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_lookup.c        8.6 (Berkeley) 4/1/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_lookup.c,v 1.66 2011/07/12 16:59:48 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/kauth.h>
+#include <sys/proc.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ext2fs/ext2fs.h>
+
+extern int dirchk;
+
+static void    ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir,
+                                         struct dirent *ffsdir);
+static int     ext2fs_dirbadentry(struct vnode *dp,
+                                         struct ext2fs_direct *de,
+                                         int entryoffsetinblock);
+
+/*
+ * The problem tackled below is the fact that FFS includes the
+ * terminating NUL of a name on disk while EXT2FS doesn't; this implies
+ * that we need to introduce some padding.
+ * For instance, the filename "sbin" normally has a reclen of 12
+ * in EXT2FS, but 16 in FFS.
+ * This reminds me of that Pepsi commercial: 'Kid saved a lousy nine cents...'
+ * If it weren't for that, the complete ufs code for directories would
+ * have worked w/o changes (except for the difference in DIRBLKSIZ).
+ */
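+/*
+ * Worked example (classic 4-byte-aligned layouts): the ext2fs record for
+ * "sbin" is an 8-byte header plus 4 name bytes with no NUL, hence the
+ * reclen of 12; FFS also stores the terminating '\0', and those 5 name
+ * bytes round up to 8, hence the reclen of 16.
+ */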
+static void
+ext2fs_dirconv2ffs(struct ext2fs_direct *e2dir, struct dirent *ffsdir)
+{
+       memset(ffsdir, 0, sizeof(struct dirent));
+       ffsdir->d_fileno = fs2h32(e2dir->e2d_ino);
+       ffsdir->d_namlen = e2dir->e2d_namlen;
+
+       ffsdir->d_type = DT_UNKNOWN;            /* don't know more here */
+#ifdef DIAGNOSTIC
+#if MAXNAMLEN < E2FS_MAXNAMLEN
+       /*
+        * We should handle this more gracefully!
+        */
+       if (e2dir->e2d_namlen > MAXNAMLEN)
+               panic("ext2fs: e2dir->e2d_namlen");
+#endif
+#endif
+       strncpy(ffsdir->d_name, e2dir->e2d_name, ffsdir->d_namlen);
+
+       /*
+        * Godmar thinks: since e2dir->e2d_reclen can be large and means
+        * nothing anyway, we compute our own reclen according to what
+        * we think is right.
+        */
+       ffsdir->d_reclen = _DIRENT_SIZE(ffsdir);
+}
+
+/*
+ * Vnode op for reading directories.
+ *
+ * Convert the on-disk entries to <sys/dirent.h> entries.
+ * The problem is that the conversion will blow up some entries by four
+ * bytes, so it can't be done in place. This is too bad. Right now the
+ * conversion is done entry by entry; each converted entry is sent via
+ * uiomove.
+ *
+ * XXX allocate a buffer, convert as many entries as possible, then send
+ * the whole buffer to uiomove
+ */
+int
+ext2fs_readdir(void *v)
+{
+       struct vop_readdir_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               kauth_cred_t a_cred;
+               int *a_eofflag;
+               off_t **a_cookies;
+               int *a_ncookies;
+       } */ *ap = v;
+       struct uio *uio = ap->a_uio;
+       int error;
+       size_t e2fs_count, readcnt;
+       struct vnode *vp = ap->a_vp;
+       struct m_ext2fs *fs = VTOI(vp)->i_e2fs;
+
+       struct ext2fs_direct *dp;
+       struct dirent *dstd;
+       struct uio auio;
+       struct iovec aiov;
+       void *dirbuf;
+       off_t off = uio->uio_offset;
+       off_t *cookies = NULL;
+       int nc = 0, ncookies = 0;
+       int e2d_reclen;
+
+       if (vp->v_type != VDIR)
+               return (ENOTDIR);
+
+       e2fs_count = uio->uio_resid;
+       /* Make sure we don't return partial entries. */
+       e2fs_count -= (uio->uio_offset + e2fs_count) & (fs->e2fs_bsize - 1);
+       if (e2fs_count <= 0)
+               return (EINVAL);
+
+       auio = *uio;
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       aiov.iov_len = e2fs_count;
+       auio.uio_resid = e2fs_count;
+       UIO_SETUP_SYSSPACE(&auio);
+       dirbuf = malloc(e2fs_count, M_TEMP, M_WAITOK);
+       dstd = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK | M_ZERO);
+       if (ap->a_ncookies) {
+               nc = e2fs_count / _DIRENT_MINSIZE((struct dirent *)0);
+               ncookies = nc;
+               cookies = malloc(sizeof (off_t) * ncookies, M_TEMP, M_WAITOK);
+               *ap->a_cookies = cookies;
+       }
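+       /*
+        * nc is only an upper bound on how many converted entries (and
+        * cookies) can fit in e2fs_count bytes of output; the conversion
+        * loop below stops early once the cookies run out.
+        */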
+       memset(dirbuf, 0, e2fs_count);
+       aiov.iov_base = dirbuf;
+
+       error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred);
+       if (error == 0) {
+               readcnt = e2fs_count - auio.uio_resid;
+               for (dp = (struct ext2fs_direct *)dirbuf;
+                       (char *)dp < (char *)dirbuf + readcnt; ) {
+                       e2d_reclen = fs2h16(dp->e2d_reclen);
+                       if (e2d_reclen == 0) {
+                               error = EIO;
+                               break;
+                       }
+                       ext2fs_dirconv2ffs(dp, dstd);
+                       if (dstd->d_reclen > uio->uio_resid) {
+                               break;
+                       }
+                       error = uiomove(dstd, dstd->d_reclen, uio);
+                       if (error != 0) {
+                               break;
+                       }
+                       off = off + e2d_reclen;
+                       if (cookies != NULL) {
+                               *cookies++ = off;
+                               if (--ncookies <= 0) {
+                                       break;  /* out of cookies */
+                               }
+                       }
+                       /* advance dp */
+                       dp = (struct ext2fs_direct *) ((char *)dp + e2d_reclen);
+               }
+               /* we need to correct uio_offset */
+               uio->uio_offset = off;
+       }
+       free(dirbuf, M_TEMP);
+       free(dstd, M_TEMP);
+       *ap->a_eofflag = ext2fs_size(VTOI(ap->a_vp)) <= uio->uio_offset;
+       if (ap->a_ncookies) {
+               if (error) {
+                       free(*ap->a_cookies, M_TEMP);
+                       *ap->a_ncookies = 0;
+                       *ap->a_cookies = NULL;
+               } else
+                       *ap->a_ncookies = nc - ncookies;
+       }
+       return (error);
+}
+
+/*
+ * Convert a component of a pathname into a pointer to a locked inode.
+ * This is a very central and rather complicated routine.
+ * If the file system is not maintained in a strict tree hierarchy,
+ * this can result in a deadlock situation (see comments in code below).
+ *
+ * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
+ * on whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it and the target of the pathname
+ * exists, lookup returns both the target and its parent directory locked.
+ * When creating or renaming and LOCKPARENT is specified, the target may
+ * not be ".".  When deleting and LOCKPARENT is specified, the target may
+ * be ".", but the caller must check to ensure it does a vrele and vput
+ * instead of two vputs.
+ *
+ * Overall outline of ext2fs_lookup:
+ *
+ *     check accessibility of directory
+ *     look for name in cache, if found, then if at end of path
+ *       and deleting or creating, drop it, else return name
+ *     search for name in directory, to found or notfound
+ * notfound:
+ *     if creating, return locked directory, leaving info on available slots
+ *     else return error
+ * found:
+ *     if at end of path and deleting, return information to allow delete
+ *     if at end of path and rewriting (RENAME and LOCKPARENT), lock target
+ *       inode and return info to allow rewrite
+ *     if not at end, add name to cache; if at end and neither creating
+ *       nor deleting, add name to cache
+ */
+int
+ext2fs_lookup(void *v)
+{
+       struct vop_lookup_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *vdp = ap->a_dvp;  /* vnode for directory being searched */
+       struct inode *dp = VTOI(vdp);   /* inode for directory being searched */
+       struct buf *bp;                 /* a buffer of directory entries */
+       struct ext2fs_direct *ep;       /* the current directory entry */
+       int entryoffsetinblock;         /* offset of ep in bp's buffer */
+       enum {NONE, COMPACT, FOUND} slotstatus;
+       doff_t slotoffset;              /* offset of area with free space */
+       int slotsize;                   /* size of area at slotoffset */
+       int slotfreespace;              /* amount of space free in slot */
+       int slotneeded;                 /* size of the entry we're seeking */
+       int numdirpasses;               /* strategy for directory search */
+       doff_t endsearch;               /* offset to end directory search */
+       doff_t prevoff;                 /* previous entry's ulr_offset */
+       struct vnode *pdp;              /* saved dp during symlink work */
+       struct vnode *tdp;              /* returned by VFS_VGET */
+       doff_t enduseful;               /* pointer past last used dir slot */
+       u_long bmask;                   /* block offset mask */
+       int namlen, error;
+       struct vnode **vpp = ap->a_vpp;
+       struct componentname *cnp = ap->a_cnp;
+       kauth_cred_t cred = cnp->cn_cred;
+       int flags;
+       int nameiop = cnp->cn_nameiop;
+       struct ufsmount *ump = dp->i_ump;
+       int dirblksiz = ump->um_dirblksiz;
+       ino_t foundino;
+       struct ufs_lookup_results *results;
+
+       flags = cnp->cn_flags;
+
+       bp = NULL;
+       slotoffset = -1;
+       *vpp = NULL;
+
+       /*
+        * Produce the auxiliary lookup results into i_crap. Increment
+        * its serial number so elsewhere we can tell if we're using
+        * stale results. This should not be done this way. XXX.
+        */
+       results = &dp->i_crap;
+       dp->i_crapcounter++;
+
+       /*
+        * Check accessibility of directory.
+        */
+       if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
+               return (error);
+
+       if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+           (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+               return (EROFS);
+
+       /*
+        * We now have a segment name to search for, and a directory to search.
+        *
+        * Before tediously performing a linear scan of the directory,
+        * check the name cache to see if the directory/name pair
+        * we are looking for is known already.
+        */
+       if ((error = cache_lookup(vdp, vpp, cnp)) >= 0)
+               return (error);
+
+       /*
+        * Suppress search for slots unless creating
+        * file and at end of pathname, in which case
+        * we watch for a place to put the new file in
+        * case it doesn't already exist.
+        */
+       slotstatus = FOUND;
+       slotfreespace = slotsize = slotneeded = 0;
+       if ((nameiop == CREATE || nameiop == RENAME) &&
+           (flags & ISLASTCN)) {
+               slotstatus = NONE;
+               slotneeded = EXT2FS_DIRSIZ(cnp->cn_namelen);
+       }
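+       /*
+        * Slot search states: FOUND means no slot is wanted (or a single
+        * record with enough free space was seen), NONE means we are still
+        * looking, and COMPACT means adjacent records together hold enough
+        * free bytes, reclaimable by compaction in ext2fs_direnter().
+        */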
+
+       /*
+        * If there is cached information on a previous search of
+        * this directory, pick up where we last left off.
+        * We cache only lookups as these are the most common
+        * and have the greatest payoff. Caching CREATE has little
+        * benefit as it usually must search the entire directory
+        * to determine that the entry does not exist. Caching the
+        * location of the last DELETE or RENAME has not reduced
+        * profiling time and hence has been removed in the interest
+        * of simplicity.
+        */
+       bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
+       if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
+           results->ulr_diroff >= ext2fs_size(dp)) {
+               entryoffsetinblock = 0;
+               results->ulr_offset = 0;
+               numdirpasses = 1;
+       } else {
+               results->ulr_offset = results->ulr_diroff;
+               if ((entryoffsetinblock = results->ulr_offset & bmask) &&
+                   (error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL, &bp)))
+                       return (error);
+               numdirpasses = 2;
+               nchstats.ncs_2passes++;
+       }
+       prevoff = results->ulr_offset;
+       endsearch = roundup(ext2fs_size(dp), dirblksiz);
+       enduseful = 0;
+
+searchloop:
+       while (results->ulr_offset < endsearch) {
+               if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+                       preempt();
+               /*
+                * If necessary, get the next directory block.
+                */
+               if ((results->ulr_offset & bmask) == 0) {
+                       if (bp != NULL)
+                               brelse(bp, 0);
+                       error = ext2fs_blkatoff(vdp, (off_t)results->ulr_offset, NULL,
+                           &bp);
+                       if (error != 0)
+                               return (error);
+                       entryoffsetinblock = 0;
+               }
+               /*
+                * If still looking for a slot, and at a dirblksize
+                * boundary, have to start looking for free space again.
+                */
+               if (slotstatus == NONE &&
+                   (entryoffsetinblock & (dirblksiz - 1)) == 0) {
+                       slotoffset = -1;
+                       slotfreespace = 0;
+               }
+               /*
+                * Get pointer to next entry.
+                * Full validation checks are slow, so we only check
+                * enough to ensure forward progress through the
+                * directory. Complete checks can be run by patching
+                * "dirchk" to be true.
+                */
+               KASSERT(bp != NULL);
+               ep = (struct ext2fs_direct *)
+                       ((char *)bp->b_data + entryoffsetinblock);
+               if (ep->e2d_reclen == 0 ||
+                   (dirchk &&
+                    ext2fs_dirbadentry(vdp, ep, entryoffsetinblock))) {
+                       int i;
+
+                       ufs_dirbad(dp, results->ulr_offset, "mangled entry");
+                       i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
+                       results->ulr_offset += i;
+                       entryoffsetinblock += i;
+                       continue;
+               }
+
+               /*
+                * If an appropriate sized slot has not yet been found,
+                * check to see if one is available. Also accumulate space
+                * in the current block so that we can determine if
+                * compaction is viable.
+                */
+               if (slotstatus != FOUND) {
+                       int size = fs2h16(ep->e2d_reclen);
+
+                       if (ep->e2d_ino != 0)
+                               size -= EXT2FS_DIRSIZ(ep->e2d_namlen);
+                       if (size > 0) {
+                               if (size >= slotneeded) {
+                                       slotstatus = FOUND;
+                                       slotoffset = results->ulr_offset;
+                                       slotsize = fs2h16(ep->e2d_reclen);
+                               } else if (slotstatus == NONE) {
+                                       slotfreespace += size;
+                                       if (slotoffset == -1)
+                                               slotoffset = results->ulr_offset;
+                                       if (slotfreespace >= slotneeded) {
+                                               slotstatus = COMPACT;
+                                               slotsize = results->ulr_offset +
+                                                   fs2h16(ep->e2d_reclen) -
+                                                   slotoffset;
+                                       }
+                               }
+                       }
+               }
+
+               /*
+                * Check for a name match.
+                */
+               if (ep->e2d_ino) {
+                       namlen = ep->e2d_namlen;
+                       if (namlen == cnp->cn_namelen &&
+                           !memcmp(cnp->cn_nameptr, ep->e2d_name,
+                           (unsigned)namlen)) {
+                               /*
+                                * Save directory entry's inode number and
+                                * reclen in ndp->ni_ufs area, and release
+                                * directory buffer.
+                                */
+                               foundino = fs2h32(ep->e2d_ino);
+                               results->ulr_reclen = fs2h16(ep->e2d_reclen);
+                               goto found;
+                       }
+               }
+               prevoff = results->ulr_offset;
+               results->ulr_offset += fs2h16(ep->e2d_reclen);
+               entryoffsetinblock += fs2h16(ep->e2d_reclen);
+               if (ep->e2d_ino)
+                       enduseful = results->ulr_offset;
+       }
+/* notfound: */
+       /*
+        * If we started in the middle of the directory and failed
+        * to find our target, we must check the beginning as well.
+        */
+       if (numdirpasses == 2) {
+               numdirpasses--;
+               results->ulr_offset = 0;
+               endsearch = results->ulr_diroff;
+               goto searchloop;
+       }
+       if (bp != NULL)
+               brelse(bp, 0);
+       /*
+        * If creating, and at end of pathname and current
+        * directory has not been removed, then can consider
+        * allowing file to be created.
+        */
+       if ((nameiop == CREATE || nameiop == RENAME) &&
+           (flags & ISLASTCN) && dp->i_e2fs_nlink != 0) {
+               /*
+                * Access for write is interpreted as allowing
+                * creation of files in the directory.
+                */
+               error = VOP_ACCESS(vdp, VWRITE, cred);
+               if (error)
+                       return (error);
+               /*
+                * Return an indication of where the new directory
+                * entry should be put.  If we didn't find a slot,
+                * then set results->ulr_count to 0 indicating
+                * that the new slot belongs at the end of the
+                * directory. If we found a slot, then the new entry
+                * can be put in the range from results->ulr_offset to
+                * results->ulr_offset + results->ulr_count.
+                */
+               if (slotstatus == NONE) {
+                       results->ulr_offset = roundup(ext2fs_size(dp), dirblksiz);
+                       results->ulr_count = 0;
+                       enduseful = results->ulr_offset;
+               } else {
+                       results->ulr_offset = slotoffset;
+                       results->ulr_count = slotsize;
+                       if (enduseful < slotoffset + slotsize)
+                               enduseful = slotoffset + slotsize;
+               }
+               results->ulr_endoff = roundup(enduseful, dirblksiz);
+#if 0
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+#endif
+               /*
+                * We return with the directory locked, so that
+                * the parameters we set up above will still be
+                * valid if we actually decide to do a direnter().
+                * We return ni_vp == NULL to indicate that the entry
+                * does not currently exist; we leave a pointer to
+                * the (locked) directory inode in ndp->ni_dvp.
+                *
+                * NB - if the directory is unlocked, then this
+                * information cannot be used.
+                */
+               return (EJUSTRETURN);
+       }
+       /*
+        * Insert name into cache (as non-existent) if appropriate.
+        */
+       if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+               cache_enter(vdp, *vpp, cnp);
+       return (ENOENT);
+
+found:
+       if (numdirpasses == 2)
+               nchstats.ncs_pass2++;
+       /*
+        * Check that directory length properly reflects presence
+        * of this entry.
+        */
+       if (results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen) > ext2fs_size(dp)) {
+               ufs_dirbad(dp, results->ulr_offset, "i_size too small");
+               error = ext2fs_setsize(dp,
+                               results->ulr_offset + EXT2FS_DIRSIZ(ep->e2d_namlen));
+               if (error) {
+                       brelse(bp, 0);
+                       return (error);
+               }
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+               uvm_vnp_setsize(vdp, ext2fs_size(dp));
+       }
+       brelse(bp, 0);
+
+       /*
+        * Found component in pathname.
+        * If the final component of path name, save information
+        * in the cache as to where the entry was found.
+        */
+       if ((flags & ISLASTCN) && nameiop == LOOKUP)
+               results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1);
+
+       /*
+        * If deleting, and at end of pathname, return
+        * parameters which can be used to remove file.
+        * Lock the inode, being careful with ".".
+        */
+       if (nameiop == DELETE && (flags & ISLASTCN)) {
+               /*
+                * Write access to directory required to delete files.
+                */
+               if ((error = VOP_ACCESS(vdp, VWRITE, cred)) != 0)
+                       return (error);
+               /*
+                * Return pointer to current entry in results->ulr_offset,
+                * and distance past previous entry (if there
+                * is a previous entry in this block) in results->ulr_count.
+                * Save directory inode pointer in ndp->ni_dvp for dirremove().
+                */
+               if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+                       results->ulr_count = 0;
+               else
+                       results->ulr_count = results->ulr_offset - prevoff;
+               if (dp->i_number == foundino) {
+                       vref(vdp);
+                       *vpp = vdp;
+                       return (0);
+               }
+               if (flags & ISDOTDOT)
+                       VOP_UNLOCK(vdp); /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (flags & ISDOTDOT)
+                       vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error)
+                       return (error);
+               /*
+                * If directory is "sticky", then user must own
+                * the directory, or the file in it, else she
+                * may not delete it (unless she's root). This
+                * implements append-only directories.
+                */
+               if ((dp->i_e2fs_mode & ISVTX) &&
+                   kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) &&
+                   kauth_cred_geteuid(cred) != dp->i_uid &&
+                   VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) {
+                       vput(tdp);
+                       return (EPERM);
+               }
+               *vpp = tdp;
+               return (0);
+       }
+
+       /*
+        * If rewriting (RENAME), return the inode and the
+        * information required to rewrite the present directory.
+        * Must get inode of directory entry to verify it's a
+        * regular file, or empty directory.
+        */
+       if (nameiop == RENAME && (flags & ISLASTCN)) {
+               error = VOP_ACCESS(vdp, VWRITE, cred);
+               if (error)
+                       return (error);
+               /*
+                * Careful about locking second inode.
+                * This can only occur if the target is ".".
+                */
+               if (dp->i_number == foundino)
+                       return (EISDIR);
+               if (flags & ISDOTDOT)
+                       VOP_UNLOCK(vdp); /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (flags & ISDOTDOT)
+                       vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error)
+                       return (error);
+               *vpp = tdp;
+               return (0);
+       }
+
+       /*
+        * Step through the translation in the name.  We do not `vput' the
+        * directory because we may need it again if a symbolic link
+        * is relative to the current directory.  Instead we save it
+        * unlocked as "pdp".  We must get the target inode before unlocking
+        * the directory to ensure that the inode will not be removed
+        * before we get it.  We prevent deadlock by always fetching
+        * inodes from the root, moving down the directory tree. Thus
+        * when following backward pointers ".." we must unlock the
+        * parent directory before getting the requested directory.
+        * There is a potential race condition here if both the current
+        * and parent directories are removed before the VFS_VGET for the
+        * inode associated with ".." returns.  We hope that this occurs
+        * infrequently since we cannot avoid this race condition without
+        * implementing a sophisticated deadlock detection algorithm.
+        * Note also that this simple deadlock detection scheme will not
+        * work if the file system has any hard links other than ".."
+        * that point backwards in the directory structure.
+        */
+       pdp = vdp;
+       if (flags & ISDOTDOT) {
+               VOP_UNLOCK(pdp);        /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error) {
+                       return (error);
+               }
+               *vpp = tdp;
+       } else if (dp->i_number == foundino) {
+               vref(vdp);      /* we want ourself, ie "." */
+               *vpp = vdp;
+       } else {
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (error)
+                       return (error);
+               *vpp = tdp;
+       }
+
+       /*
+        * Insert name into cache if appropriate.
+        */
+       if (cnp->cn_flags & MAKEENTRY)
+               cache_enter(vdp, *vpp, cnp);
+       return (0);
+}
+
+/*
+ * Do consistency checking on a directory entry:
+ *     record length must be multiple of 4
+ *     entry must fit in rest of its dirblksize block
+ *     record must be large enough to contain entry
+ *     name is not longer than EXT2FS_MAXNAMLEN
+ *     name must be as long as advertised, and null terminated
+ */
+/*
+ *     changed so that it conforms to ext2fs_check_dir_entry
+ */
+static int
+ext2fs_dirbadentry(struct vnode *dp, struct ext2fs_direct *de,
+               int entryoffsetinblock)
+{
+       struct ufsmount *ump = VFSTOUFS(dp->v_mount);
+       int dirblksiz = ump->um_dirblksiz;
+
+       const char *error_msg = NULL;
+       int reclen = fs2h16(de->e2d_reclen);
+       int namlen = de->e2d_namlen;
+
+       if (reclen < EXT2FS_DIRSIZ(1)) /* e2d_namlen = 1 */
+               error_msg = "rec_len is smaller than minimal";
+       else if (reclen % 4 != 0)
+               error_msg = "rec_len % 4 != 0";
+       else if (namlen > EXT2FS_MAXNAMLEN)
+               error_msg = "namlen > EXT2FS_MAXNAMLEN";
+       else if (reclen < EXT2FS_DIRSIZ(namlen))
+               error_msg = "reclen is too small for name_len";
+       else if (entryoffsetinblock + reclen > dirblksiz)
+               error_msg = "directory entry across blocks";
+       else if (fs2h32(de->e2d_ino) > VTOI(dp)->i_e2fs->e2fs.e2fs_icount)
+               error_msg = "inode out of bounds";
+
+       if (error_msg != NULL) {
+               printf("bad directory entry: %s\n"
+                   "offset=%d, inode=%lu, rec_len=%d, name_len=%d\n",
+                   error_msg, entryoffsetinblock,
+                   (unsigned long)fs2h32(de->e2d_ino),
+                   reclen, namlen);
+               panic("ext2fs_dirbadentry");
+       }
+       return error_msg == NULL ? 0 : 1;
+}
+
+/*
+ * Write a directory entry after a call to namei, using the parameters
+ * that it left in nameidata.  The argument ip is the inode which the new
+ * directory entry will refer to.  Dvp is a pointer to the directory to
+ * be written, which was left locked by namei. Remaining parameters
+ * (ulr_offset, ulr_count) indicate how the space for the new
+ * entry is to be obtained.
+ */
+int
+ext2fs_direnter(struct inode *ip, struct vnode *dvp,
+               const struct ufs_lookup_results *ulr,
+               struct componentname *cnp)
+{
+       struct ext2fs_direct *ep, *nep;
+       struct inode *dp;
+       struct buf *bp;
+       struct ext2fs_direct newdir;
+       struct iovec aiov;
+       struct uio auio;
+       u_int dsize;
+       int error, loc, newentrysize, spacefree;
+       char *dirbuf;
+       struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
+       int dirblksiz = ump->um_dirblksiz;
+
+       dp = VTOI(dvp);
+
+       newdir.e2d_ino = h2fs32(ip->i_number);
+       newdir.e2d_namlen = cnp->cn_namelen;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+               newdir.e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode));
+       } else {
+               newdir.e2d_type = 0;
+       }
+       memcpy(newdir.e2d_name, cnp->cn_nameptr, (unsigned)cnp->cn_namelen + 1);
+       newentrysize = EXT2FS_DIRSIZ(cnp->cn_namelen);
+       if (ulr->ulr_count == 0) {
+               /*
+                * If ulr_count is 0, then namei could find no
+                * space in the directory. Here, ulr_offset will
+                * be on a directory block boundary and we will write the
+                * new entry into a fresh block.
+                */
+               if (ulr->ulr_offset & (dirblksiz - 1))
+                       panic("ext2fs_direnter: newblk");
+               auio.uio_offset = ulr->ulr_offset;
+               newdir.e2d_reclen = h2fs16(dirblksiz);
+               auio.uio_resid = newentrysize;
+               aiov.iov_len = newentrysize;
+               aiov.iov_base = (void *)&newdir;
+               auio.uio_iov = &aiov;
+               auio.uio_iovcnt = 1;
+               auio.uio_rw = UIO_WRITE;
+               UIO_SETUP_SYSSPACE(&auio);
+               error = VOP_WRITE(dvp, &auio, IO_SYNC, cnp->cn_cred);
+               if (dirblksiz > dvp->v_mount->mnt_stat.f_bsize)
+                       /* XXX should grow with balloc() */
+                       panic("ext2fs_direnter: frag size");
+               else if (!error) {
+                       error = ext2fs_setsize(dp,
+                               roundup(ext2fs_size(dp), dirblksiz));
+                       if (error)
+                               return (error);
+                       dp->i_flag |= IN_CHANGE;
+                       uvm_vnp_setsize(dvp, ext2fs_size(dp));
+               }
+               return (error);
+       }
+
+       /*
+        * If ulr_count is non-zero, then namei found space
+        * for the new entry in the range ulr_offset to
+        * ulr_offset + ulr_count in the directory.
+        * To use this space, we may have to compact the entries located
+        * there, by copying them together towards the beginning of the
+        * block, leaving the free space in one usable chunk at the end.
+        */
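+       /*
+        * Hypothetical picture, ulr_count == 32 with two live entries:
+        *
+        *   before: [ A: 12 used, reclen 20 ][ B: 12 used, reclen 12 ]
+        *   after:  [ A: reclen 12 ][ B: reclen 12 ][ 8 bytes free ]
+        *
+        * Each live entry is trimmed to its real size and later entries
+        * slide up, leaving one usable chunk (spacefree) at the end.
+        */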
+
+       /*
+        * Get the block containing the space for the new directory entry.
+        */
+       if ((error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp)) != 0)
+               return (error);
+       /*
+        * Find space for the new entry. In the simple case, the entry at
+        * ulr_offset will have the space. If it does not, then namei
+        * arranged that compacting the region from ulr_offset to
+        * ulr_offset + ulr_count would yield the space.
+        */
+       ep = (struct ext2fs_direct *)dirbuf;
+       dsize = EXT2FS_DIRSIZ(ep->e2d_namlen);
+       spacefree = fs2h16(ep->e2d_reclen) - dsize;
+       for (loc = fs2h16(ep->e2d_reclen); loc < ulr->ulr_count; ) {
+               nep = (struct ext2fs_direct *)(dirbuf + loc);
+               if (ep->e2d_ino) {
+                       /* trim the existing slot */
+                       ep->e2d_reclen = h2fs16(dsize);
+                       ep = (struct ext2fs_direct *)((char *)ep + dsize);
+               } else {
+                       /* overwrite; nothing there; header is ours */
+                       spacefree += dsize;
+               }
+               dsize = EXT2FS_DIRSIZ(nep->e2d_namlen);
+               spacefree += fs2h16(nep->e2d_reclen) - dsize;
+               loc += fs2h16(nep->e2d_reclen);
+               memcpy((void *)ep, (void *)nep, dsize);
+       }
+       /*
+        * Update the pointer fields in the previous entry (if any),
+        * copy in the new entry, and write out the block.
+        */
+       if (ep->e2d_ino == 0) {
+#ifdef DIAGNOSTIC
+               if (spacefree + dsize < newentrysize)
+                       panic("ext2fs_direnter: compact1");
+#endif
+               newdir.e2d_reclen = h2fs16(spacefree + dsize);
+       } else {
+#ifdef DIAGNOSTIC
+               if (spacefree < newentrysize) {
+                       printf("ext2fs_direnter: compact2 %u %u",
+                           (u_int)spacefree, (u_int)newentrysize);
+                       panic("ext2fs_direnter: compact2");
+               }
+#endif
+               newdir.e2d_reclen = h2fs16(spacefree);
+               ep->e2d_reclen = h2fs16(dsize);
+               ep = (struct ext2fs_direct *)((char *)ep + dsize);
+       }
+       memcpy((void *)ep, (void *)&newdir, (u_int)newentrysize);
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= IN_CHANGE | IN_UPDATE;
+       if (!error && ulr->ulr_endoff && ulr->ulr_endoff < ext2fs_size(dp))
+               error = ext2fs_truncate(dvp, (off_t)ulr->ulr_endoff, IO_SYNC,
+                   cnp->cn_cred);
+       return (error);
+}
+
+/*
+ * Remove a directory entry after a call to namei, using
+ * the auxiliary results it provided. The entry
+ * ulr_offset contains the offset into the directory of the
+ * entry to be eliminated.  The ulr_count field contains the
+ * size of the previous record in the directory.  If this
+ * is 0, the first entry is being deleted, so we need only
+ * zero the inode number to mark the entry as free.  If the
+ * entry is not the first in the directory, we must reclaim
+ * the space of the now empty record by adding the record size
+ * to the size of the previous entry.
+ */
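+/*
+ * E.g. (illustrative): removing B from [ A: reclen 16 ][ B: reclen 24 ]
+ * leaves [ A: reclen 40 ]; B's bytes become dead space spanned by A.
+ */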
+int
+ext2fs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+                struct componentname *cnp)
+{
+       struct inode *dp;
+       struct ext2fs_direct *ep;
+       struct buf *bp;
+       int error;
+
+       dp = VTOI(dvp);
+
+       if (ulr->ulr_count == 0) {
+               /*
+                * First entry in block: set d_ino to zero.
+                */
+               error = ext2fs_blkatoff(dvp, (off_t)ulr->ulr_offset,
+                   (void *)&ep, &bp);
+               if (error != 0)
+                       return (error);
+               ep->e2d_ino = 0;
+               error = VOP_BWRITE(bp->b_vp, bp);
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (error);
+       }
+       /*
+        * Collapse new free space into previous entry.
+        */
+       error = ext2fs_blkatoff(dvp, (off_t)(ulr->ulr_offset - ulr->ulr_count),
+           (void *)&ep, &bp);
+       if (error != 0)
+               return (error);
+       ep->e2d_reclen = h2fs16(fs2h16(ep->e2d_reclen) + ulr->ulr_reclen);
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= IN_CHANGE | IN_UPDATE;
+       return (error);
+}
+
+/*
+ * Rewrite an existing directory entry to point at the inode
+ * supplied.  The parameters describing the directory entry are
+ * set up by a call to namei.
+ */
+int
+ext2fs_dirrewrite(struct inode *dp, const struct ufs_lookup_results *ulr,
+    struct inode *ip, struct componentname *cnp)
+{
+       struct buf *bp;
+       struct ext2fs_direct *ep;
+       struct vnode *vdp = ITOV(dp);
+       int error;
+
+       error = ext2fs_blkatoff(vdp, (off_t)ulr->ulr_offset, (void *)&ep, &bp);
+       if (error != 0)
+               return (error);
+       ep->e2d_ino = h2fs32(ip->i_number);
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+               ep->e2d_type = inot2ext2dt(IFTODT(ip->i_e2fs_mode));
+       } else {
+               ep->e2d_type = 0;
+       }
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= IN_CHANGE | IN_UPDATE;
+       return (error);
+}
+
+/*
+ * Check if a directory is empty or not.
+ * Inode supplied must be locked.
+ *
+ * Using a struct dirtemplate here is not precisely
+ * what we want, but better than using a struct ext2fs_direct.
+ *
+ * NB: does not handle corrupted directories.
+ */
+int
+ext2fs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
+{
+       off_t off;
+       struct ext2fs_dirtemplate dbuf;
+       struct ext2fs_direct *dp = (struct ext2fs_direct *)&dbuf;
+       int error, namlen;
+       size_t count;
+
+#define        MINDIRSIZ (sizeof (struct ext2fs_dirtemplate) / 2)
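+       /*
+        * Half a dirtemplate covers one full "." or ".." record (header
+        * plus the short name), which is all one iteration below reads.
+        */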
+
+       for (off = 0; off < ext2fs_size(ip); off += fs2h16(dp->e2d_reclen)) {
+               error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off,
+                  UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL);
+               /*
+                * Since we read MINDIRSIZ, residual must
+                * be 0 unless we're at end of file.
+                */
+               if (error || count != 0)
+                       return (0);
+               /* avoid infinite loops */
+               if (dp->e2d_reclen == 0)
+                       return (0);
+               /* skip empty entries */
+               if (dp->e2d_ino == 0)
+                       continue;
+               /* accept only "." and ".." */
+               namlen = dp->e2d_namlen;
+               if (namlen > 2)
+                       return (0);
+               if (dp->e2d_name[0] != '.')
+                       return (0);
+               /*
+                * At this point namlen must be 1 or 2.
+                * 1 implies ".", 2 implies ".." if second
+                * char is also "."
+                */
+               if (namlen == 1)
+                       continue;
+               if (dp->e2d_name[1] == '.' && fs2h32(dp->e2d_ino) == parentino)
+                       continue;
+               return (0);
+       }
+       return (1);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
+int
+ext2fs_checkpath(struct inode *source, struct inode *target,
+       kauth_cred_t cred)
+{
+       struct vnode *vp;
+       int error, rootino, namlen;
+       struct ext2fs_dirtemplate dirbuf;
+       uint32_t ino;
+
+       vp = ITOV(target);
+       if (target->i_number == source->i_number) {
+               error = EEXIST;
+               goto out;
+       }
+       rootino = ROOTINO;
+       error = 0;
+       if (target->i_number == rootino)
+               goto out;
+
+       for (;;) {
+               if (vp->v_type != VDIR) {
+                       error = ENOTDIR;
+                       break;
+               }
+               error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf,
+                       sizeof (struct ext2fs_dirtemplate), (off_t)0,
+                       UIO_SYSSPACE, IO_NODELOCKED, cred, (size_t *)0,
+                       NULL);
+               if (error != 0)
+                       break;
+               namlen = dirbuf.dotdot_namlen;
+               if (namlen != 2 ||
+                       dirbuf.dotdot_name[0] != '.' ||
+                       dirbuf.dotdot_name[1] != '.') {
+                       error = ENOTDIR;
+                       break;
+               }
+               ino = fs2h32(dirbuf.dotdot_ino);
+               if (ino == source->i_number) {
+                       error = EINVAL;
+                       break;
+               }
+               if (ino == rootino)
+                       break;
+               vput(vp);
+               error = VFS_VGET(vp->v_mount, ino, &vp);
+               if (error != 0) {
+                       vp = NULL;
+                       break;
+               }
+       }
+
+out:
+       if (error == ENOTDIR) {
+               printf("checkpath: .. not a directory\n");
+               panic("checkpath");
+       }
+       if (vp != NULL)
+               vput(vp);
+       return (error);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_readwrite.c b/sys/ufs/ext2fs/ext2fs_readwrite.c
new file mode 100644 (file)
index 0000000..0b6f8d6
--- /dev/null
+++ b/sys/ufs/ext2fs/ext2fs_readwrite.c
@@ -0,0 +1,392 @@
+/*     $NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $   */
+
+/*-
+ * Copyright (c) 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_readwrite.c     8.8 (Berkeley) 8/4/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*-
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ufs_readwrite.c     8.8 (Berkeley) 8/4/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_readwrite.c,v 1.58 2011/11/18 21:18:51 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+
+#define doclusterread 0 /* XXX underway */
+#define doclusterwrite 0
+
+/*
+ * Vnode op for reading.
+ */
+/* ARGSUSED */
+int
+ext2fs_read(void *v)
+{
+       struct vop_read_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               int a_ioflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct inode *ip;
+       struct uio *uio;
+       struct m_ext2fs *fs;
+       struct buf *bp;
+       struct ufsmount *ump;
+       vsize_t bytelen;
+       daddr_t lbn, nextlbn;
+       off_t bytesinfile;
+       long size, xfersize, blkoffset;
+       int error;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+       uio = ap->a_uio;
+       error = 0;
+
+#ifdef DIAGNOSTIC
+       if (uio->uio_rw != UIO_READ)
+               panic("%s: mode", "ext2fs_read");
+
+       if (vp->v_type == VLNK) {
+               if (ext2fs_size(ip) < ump->um_maxsymlinklen ||
+                   (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0))
+                       panic("%s: short symlink", "ext2fs_read");
+       } else if (vp->v_type != VREG && vp->v_type != VDIR)
+               panic("%s: type %d", "ext2fs_read", vp->v_type);
+#endif
+       fs = ip->i_e2fs;
+       if ((uint64_t)uio->uio_offset > ump->um_maxfilesize)
+               return (EFBIG);
+       if (uio->uio_resid == 0)
+               return (0);
+       if (uio->uio_offset >= ext2fs_size(ip))
+               goto out;
+
+       if (vp->v_type == VREG) {
+               const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
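+               /*
+                * Regular-file reads go through UVM's page cache (UBC):
+                * each pass maps a window of the vnode's pages and copies
+                * directly; any disk I/O happens via getpages, not bread().
+                */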
+               while (uio->uio_resid > 0) {
+                       bytelen = MIN(ext2fs_size(ip) - uio->uio_offset,
+                           uio->uio_resid);
+                       if (bytelen == 0)
+                               break;
+
+                       error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+                           UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
+                       if (error)
+                               break;
+               }
+               goto out;
+       }
+
+       for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+               bytesinfile = ext2fs_size(ip) - uio->uio_offset;
+               if (bytesinfile <= 0)
+                       break;
+               lbn = lblkno(fs, uio->uio_offset);
+               nextlbn = lbn + 1;
+               size = fs->e2fs_bsize;
+               blkoffset = blkoff(fs, uio->uio_offset);
+               xfersize = fs->e2fs_bsize - blkoffset;
+               if (uio->uio_resid < xfersize)
+                       xfersize = uio->uio_resid;
+               if (bytesinfile < xfersize)
+                       xfersize = bytesinfile;
+
+               if (lblktosize(fs, nextlbn) >= ext2fs_size(ip))
+                       error = bread(vp, lbn, size, NOCRED, 0, &bp);
+               else {
+                       int nextsize = fs->e2fs_bsize;
+                       error = breadn(vp, lbn,
+                               size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+               }
+               if (error)
+                       break;
+
+               /*
+                * We should only get non-zero b_resid when an I/O error
+                * has occurred, which should cause us to break above.
+                * However, if the short read did not cause an error,
+                * then we want to ensure that we do not uiomove bad
+                * or uninitialized data.
+                */
+               size -= bp->b_resid;
+               if (size < xfersize) {
+                       if (size == 0)
+                               break;
+                       xfersize = size;
+               }
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+               if (error)
+                       break;
+               brelse(bp, 0);
+       }
+       if (bp != NULL)
+               brelse(bp, 0);
+
+out:
+       if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+               ip->i_flag |= IN_ACCESS;
+               if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
+                       error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+       }
+       return (error);
+}
+
+/*
+ * Vnode op for writing.
+ */
+int
+ext2fs_write(void *v)
+{
+       struct vop_write_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               int a_ioflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct uio *uio;
+       struct inode *ip;
+       struct m_ext2fs *fs;
+       struct buf *bp;
+       struct ufsmount *ump;
+       daddr_t lbn;
+       off_t osize;
+       int blkoffset, error, flags, ioflag, resid, xfersize;
+       vsize_t bytelen;
+       off_t oldoff = 0;                                       /* XXX */
+       bool async;
+       int extended = 0;
+       int advice;
+
+       ioflag = ap->a_ioflag;
+       advice = IO_ADV_DECODE(ioflag);
+       uio = ap->a_uio;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+       error = 0;
+
+#ifdef DIAGNOSTIC
+       if (uio->uio_rw != UIO_WRITE)
+               panic("%s: mode", "ext2fs_write");
+#endif
+
+       switch (vp->v_type) {
+       case VREG:
+               if (ioflag & IO_APPEND)
+                       uio->uio_offset = ext2fs_size(ip);
+               if ((ip->i_e2fs_flags & EXT2_APPEND) &&
+                   uio->uio_offset != ext2fs_size(ip))
+                       return (EPERM);
+               /* FALLTHROUGH */
+       case VLNK:
+               break;
+       case VDIR:
+               if ((ioflag & IO_SYNC) == 0)
+                       panic("%s: nonsync dir write", "ext2fs_write");
+               break;
+       default:
+               panic("%s: type", "ext2fs_write");
+       }
+
+       fs = ip->i_e2fs;
+       if (uio->uio_offset < 0 ||
+           (uint64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
+               return (EFBIG);
+       if (uio->uio_resid == 0)
+               return (0);
+
+       async = vp->v_mount->mnt_flag & MNT_ASYNC;
+       resid = uio->uio_resid;
+       osize = ext2fs_size(ip);
+
+       if (vp->v_type == VREG) {
+               while (uio->uio_resid > 0) {
+                       oldoff = uio->uio_offset;
+                       blkoffset = blkoff(fs, uio->uio_offset);
+                       bytelen = MIN(fs->e2fs_bsize - blkoffset,
+                           uio->uio_resid);
+
+                       if (vp->v_size < oldoff + bytelen) {
+                               uvm_vnp_setwritesize(vp, oldoff + bytelen);
+                       }
+                       error = ufs_balloc_range(vp, uio->uio_offset,
+                           bytelen, ap->a_cred, 0);
+                       if (error)
+                               break;
+                       error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+                           UBC_WRITE | UBC_UNMAP_FLAG(vp));
+                       if (error)
+                               break;
+
+                       /*
+                        * update UVM's notion of the size now that we've
+                        * copied the data into the vnode's pages.
+                        */
+
+                       if (vp->v_size < uio->uio_offset) {
+                               uvm_vnp_setsize(vp, uio->uio_offset);
+                               extended = 1;
+                       }
+
+                       /*
+                        * flush what we just wrote if necessary.
+                        * XXXUBC simplistic async flushing.
+                        */
+
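+                       /*
+                        * I.e. whenever the write crosses a 64 kB boundary,
+                        * push the window just completed out via
+                        * VOP_PUTPAGES.
+                        */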
+                       if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+                               mutex_enter(vp->v_interlock);
+                               error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+                                   (uio->uio_offset >> 16) << 16, PGO_CLEANIT);
+                       }
+               }
+               if (error == 0 && ioflag & IO_SYNC) {
+                       mutex_enter(vp->v_interlock);
+                       error = VOP_PUTPAGES(vp, trunc_page(oldoff),
+                           round_page(blkroundup(fs, uio->uio_offset)),
+                           PGO_CLEANIT | PGO_SYNCIO);
+               }
+
+               goto out;
+       }
+
+       flags = ioflag & IO_SYNC ? B_SYNC : 0;
+       for (error = 0; uio->uio_resid > 0;) {
+               lbn = lblkno(fs, uio->uio_offset);
+               blkoffset = blkoff(fs, uio->uio_offset);
+               xfersize = MIN(fs->e2fs_bsize - blkoffset, uio->uio_resid);
+               if (xfersize < fs->e2fs_bsize)
+                       flags |= B_CLRBUF;
+               else
+                       flags &= ~B_CLRBUF;
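+               /*
+                * (B_CLRBUF asks ext2fs_balloc for a buffer whose unwritten
+                * bytes are valid -- read in or zeroed -- since only part of
+                * the block is about to be overwritten)
+                */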
+               error = ext2fs_balloc(ip,
+                   lbn, blkoffset + xfersize, ap->a_cred, &bp, flags);
+               if (error)
+                       break;
+               if (ext2fs_size(ip) < uio->uio_offset + xfersize) {
+                       error = ext2fs_setsize(ip, uio->uio_offset + xfersize);
+                       if (error)
+                               break;
+               }
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+
+               /*
+                * update UVM's notion of the size now that we've
+                * copied the data into the vnode's pages.
+                */
+
+               if (vp->v_size < uio->uio_offset) {
+                       uvm_vnp_setsize(vp, uio->uio_offset);
+                       extended = 1;
+               }
+
+               if (ioflag & IO_SYNC)
+                       (void)bwrite(bp);
+               else if (xfersize + blkoffset == fs->e2fs_bsize)
+                       bawrite(bp);
+               else
+                       bdwrite(bp);
+               if (error || xfersize == 0)
+                       break;
+       }
+
+       /*
+        * If we successfully wrote any data, and we are not the superuser
+        * we clear the setuid and setgid bits as a precaution against
+        * tampering.
+        */
+
+out:
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       if (vp->v_mount->mnt_flag & MNT_RELATIME)
+               ip->i_flag |= IN_ACCESS;
+       if (resid > uio->uio_resid && ap->a_cred &&
+           kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL))
+               ip->i_e2fs_mode &= ~(ISUID | ISGID);
+       if (resid > uio->uio_resid)
+               VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+       if (error) {
+               (void) ext2fs_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+               uio->uio_offset -= resid - uio->uio_resid;
+               uio->uio_resid = resid;
+       } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+               error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+       KASSERT(vp->v_size == ext2fs_size(ip));
+       return (error);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_subr.c b/sys/ufs/ext2fs/ext2fs_subr.c
new file mode 100644 (file)
index 0000000..64f4c9f
--- /dev/null
@@ -0,0 +1,137 @@
+/*     $NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $  */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_subr.c  8.2 (Berkeley) 9/21/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ffs_subr.c  8.2 (Berkeley) 9/21/93
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_subr.c,v 1.27 2009/10/19 18:41:17 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/inttypes.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+/*
+ * Return buffer with the contents of block "offset" from the beginning of
+ * directory "ip".  If "res" is non-zero, fill it in with a pointer to the
+ * remaining space in the directory.
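+ *
+ * Typical use, e.g. from the directory code (a sketch only; "entryoffset"
+ * and "dirbuf" are illustrative names):
+ *
+ *     if ((error = ext2fs_blkatoff(vp, (off_t)entryoffset,
+ *         &dirbuf, &bp)) != 0)
+ *             return (error);
+ *     ...parse the entries at dirbuf...
+ *     brelse(bp, 0);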
+ */
+int
+ext2fs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp)
+{
+       struct inode *ip;
+       struct m_ext2fs *fs;
+       struct buf *bp;
+       daddr_t lbn;
+       int error;
+
+       ip = VTOI(vp);
+       fs = ip->i_e2fs;
+       lbn = lblkno(fs, offset);
+
+       *bpp = NULL;
+       if ((error = bread(vp, lbn, fs->e2fs_bsize, NOCRED, 0, &bp)) != 0) {
+               brelse(bp, 0);
+               return (error);
+       }
+       if (res)
+               *res = (char *)bp->b_data + blkoff(fs, offset);
+       *bpp = bp;
+       return (0);
+}
+
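+/*
+ * Fold the IN_* timestamp flags into the on-disk inode times.  This is
+ * reached through the uo_itimes hook set up in ext2fs_vfsops.c; a sketch
+ * of a typical caller, assuming the usual UFS_ITIMES() dispatch wiring:
+ *
+ *     ip->i_flag |= IN_CHANGE | IN_UPDATE;
+ *     UFS_ITIMES(vp, NULL, NULL, NULL);
+ */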
+void
+ext2fs_itimes(struct inode *ip, const struct timespec *acc,
+    const struct timespec *mod, const struct timespec *cre)
+{
+       struct timespec now;
+
+       if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+               return;
+       }
+
+       vfs_timestamp(&now);
+       if (ip->i_flag & IN_ACCESS) {
+               if (acc == NULL)
+                       acc = &now;
+               ip->i_e2fs_atime = acc->tv_sec;
+       }
+       if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+               if (mod == NULL)
+                       mod = &now;
+               ip->i_e2fs_mtime = mod->tv_sec;
+               ip->i_modrev++;
+       }
+       if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+               if (cre == NULL)
+                       cre = &now;
+               ip->i_e2fs_ctime = cre->tv_sec;
+       }
+       if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
+               ip->i_flag |= IN_ACCESSED;
+       if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
+               ip->i_flag |= IN_MODIFIED;
+       ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c
new file mode 100644 (file)
index 0000000..76f6dd5
--- /dev/null
@@ -0,0 +1,1266 @@
+/*     $NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $      */
+
+/*
+ * Copyright (c) 1989, 1991, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_vfsops.c        8.14 (Berkeley) 11/28/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ffs_vfsops.c        8.14 (Berkeley) 11/28/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_vfsops.c,v 1.162 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_compat_netbsd.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/lock.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, ext2fs, "ffs");
+
+int ext2fs_sbupdate(struct ufsmount *, int);
+static int ext2fs_checksb(struct ext2fs *, int);
+
+static struct sysctllog *ext2fs_sysctl_log;
+
+extern const struct vnodeopv_desc ext2fs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc ext2fs_specop_opv_desc;
+extern const struct vnodeopv_desc ext2fs_fifoop_opv_desc;
+
+const struct vnodeopv_desc * const ext2fs_vnodeopv_descs[] = {
+       &ext2fs_vnodeop_opv_desc,
+       &ext2fs_specop_opv_desc,
+       &ext2fs_fifoop_opv_desc,
+       NULL,
+};
+
+struct vfsops ext2fs_vfsops = {
+       MOUNT_EXT2FS,
+       sizeof (struct ufs_args),
+       ext2fs_mount,
+       ufs_start,
+       ext2fs_unmount,
+       ufs_root,
+       ufs_quotactl,
+       ext2fs_statvfs,
+       ext2fs_sync,
+       ext2fs_vget,
+       ext2fs_fhtovp,
+       ext2fs_vptofh,
+       ext2fs_init,
+       ext2fs_reinit,
+       ext2fs_done,
+       ext2fs_mountroot,
+       (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+       vfs_stdextattrctl,
+       (void *)eopnotsupp,     /* vfs_suspendctl */
+       genfs_renamelock_enter,
+       genfs_renamelock_exit,
+       (void *)eopnotsupp,
+       ext2fs_vnodeopv_descs,
+       0,
+       { NULL, NULL },
+};
+
+static const struct genfs_ops ext2fs_genfsops = {
+       .gop_size = genfs_size,
+       .gop_alloc = ext2fs_gop_alloc,
+       .gop_write = genfs_gop_write,
+       .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops ext2fs_ufsops = {
+       .uo_itimes = ext2fs_itimes,
+       .uo_update = ext2fs_update,
+       .uo_vfree = ext2fs_vfree,
+       .uo_unmark_vnode = (void (*)(vnode_t *))nullop,
+};
+
+/* Fill in the inode uid/gid from ext2 halves.  */
+void
+ext2fs_set_inode_guid(struct inode *ip)
+{
+
+       ip->i_gid = ip->i_e2fs_gid;
+       ip->i_uid = ip->i_e2fs_uid;
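+       /*
+        * e.g. uid 100000 (0x186a0) is stored as e2fs_uid 0x86a0 with
+        * e2fs_uid_high 0x1 on rev-1 filesystems; rev-0 carries only the
+        * low 16 bits.
+        */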
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+               ip->i_gid |= ip->i_e2fs_gid_high << 16;
+               ip->i_uid |= ip->i_e2fs_uid_high << 16;
+       }
+}
+
+static int
+ext2fs_modcmd(modcmd_t cmd, void *arg)
+{
+       int error;
+
+       switch (cmd) {
+       case MODULE_CMD_INIT:
+               error = vfs_attach(&ext2fs_vfsops);
+               if (error != 0)
+                       break;
+               sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT,
+                              CTLTYPE_NODE, "vfs", NULL,
+                              NULL, 0, NULL, 0,
+                              CTL_VFS, CTL_EOL);
+               sysctl_createv(&ext2fs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT,
+                              CTLTYPE_NODE, "ext2fs",
+                              SYSCTL_DESCR("Linux EXT2FS file system"),
+                              NULL, 0, NULL, 0,
+                              CTL_VFS, 17, CTL_EOL);
+               /*
+                * XXX the "17" above could be dynamic, thereby eliminating
+                * one more instance of the "number to vfs" mapping problem,
+                * but "17" is the order as taken from sys/mount.h
+                */
+               break;
+       case MODULE_CMD_FINI:
+               error = vfs_detach(&ext2fs_vfsops);
+               if (error != 0)
+                       break;
+               sysctl_teardown(&ext2fs_sysctl_log);
+               break;
+       default:
+               error = ENOTTY;
+               break;
+       }
+
+       return (error);
+}
+
+/*
+ * XXX Same structure as FFS inodes?  Should we share a common pool?
+ */
+struct pool ext2fs_inode_pool;
+struct pool ext2fs_dinode_pool;
+
+extern u_long ext2gennumber;
+
+void
+ext2fs_init(void)
+{
+
+       pool_init(&ext2fs_inode_pool, sizeof(struct inode), 0, 0, 0,
+           "ext2fsinopl", &pool_allocator_nointr, IPL_NONE);
+       pool_init(&ext2fs_dinode_pool, sizeof(struct ext2fs_dinode), 0, 0, 0,
+           "ext2dinopl", &pool_allocator_nointr, IPL_NONE);
+       ufs_init();
+}
+
+void
+ext2fs_reinit(void)
+{
+       ufs_reinit();
+}
+
+void
+ext2fs_done(void)
+{
+
+       ufs_done();
+       pool_destroy(&ext2fs_inode_pool);
+       pool_destroy(&ext2fs_dinode_pool);
+}
+
+/*
+ * Called by main() when ext2fs is going to be mounted as root.
+ *
+ * Name is updated by mount(8) after booting.
+ */
+#define ROOTNAME       "root_device"
+
+int
+ext2fs_mountroot(void)
+{
+       extern struct vnode *rootvp;
+       struct m_ext2fs *fs;
+       struct mount *mp;
+       struct ufsmount *ump;
+       int error;
+
+       if (device_class(root_device) != DV_DISK)
+               return (ENODEV);
+
+       if ((error = vfs_rootmountalloc(MOUNT_EXT2FS, ROOTNAME, &mp))) {
+               vrele(rootvp);
+               return (error);
+       }
+
+       if ((error = ext2fs_mountfs(rootvp, mp)) != 0) {
+               vfs_unbusy(mp, false, NULL);
+               vfs_destroy(mp);
+               return (error);
+       }
+       mutex_enter(&mountlist_lock);
+       CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+       mutex_exit(&mountlist_lock);
+       ump = VFSTOUFS(mp);
+       fs = ump->um_e2fs;
+       memset(fs->e2fs_fsmnt, 0, sizeof(fs->e2fs_fsmnt));
+       (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt,
+           sizeof(fs->e2fs_fsmnt) - 1, 0);
+       if (fs->e2fs.e2fs_rev > E2FS_REV0) {
+               memset(fs->e2fs.e2fs_fsmnt, 0, sizeof(fs->e2fs.e2fs_fsmnt));
+               (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt,
+                   sizeof(fs->e2fs.e2fs_fsmnt) - 1, 0);
+       }
+       (void)ext2fs_statvfs(mp, &mp->mnt_stat);
+       vfs_unbusy(mp, false, NULL);
+       setrootfstime((time_t)fs->e2fs.e2fs_wtime);
+       return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+ext2fs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+       struct lwp *l = curlwp;
+       struct vnode *devvp;
+       struct ufs_args *args = data;
+       struct ufsmount *ump = NULL;
+       struct m_ext2fs *fs;
+       size_t size;
+       int error = 0, flags, update;
+       mode_t accessmode;
+
+       if (*data_len < sizeof *args)
+               return EINVAL;
+
+       if (mp->mnt_flag & MNT_GETARGS) {
+               ump = VFSTOUFS(mp);
+               if (ump == NULL)
+                       return EIO;
+               memset(args, 0, sizeof *args);
+               args->fspec = NULL;
+               *data_len = sizeof *args;
+               return 0;
+       }
+
+       update = mp->mnt_flag & MNT_UPDATE;
+
+       /* Check arguments */
+       if (args->fspec != NULL) {
+               /*
+                * Look up the name and verify that it's sane.
+                */
+               error = namei_simple_user(args->fspec,
+                                       NSM_FOLLOW_NOEMULROOT, &devvp);
+               if (error != 0)
+                       return (error);
+
+               if (!update) {
+                       /*
+                        * Be sure this is a valid block device
+                        */
+                       if (devvp->v_type != VBLK)
+                               error = ENOTBLK;
+                       else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+                               error = ENXIO;
+               } else {
+                       /*
+                        * Be sure we're still naming the same device
+                        * used for our initial mount
+                        */
+                       ump = VFSTOUFS(mp);
+                       if (devvp != ump->um_devvp) {
+                               if (devvp->v_rdev != ump->um_devvp->v_rdev)
+                                       error = EINVAL;
+                               else {
+                                       vrele(devvp);
+                                       devvp = ump->um_devvp;
+                                       vref(devvp);
+                               }
+                       }
+               }
+       } else {
+               if (!update) {
+                       /* New mounts must have a filename for the device */
+                       return (EINVAL);
+               } else {
+                       ump = VFSTOUFS(mp);
+                       devvp = ump->um_devvp;
+                       vref(devvp);
+               }
+       }
+
+       /*
+        * If mount by non-root, then verify that user has necessary
+        * permissions on the device.
+        *
+        * Permission to update a mount is checked higher, so here we presume
+        * updating the mount is okay (for example, as far as securelevel goes)
+        * which leaves us with the normal check.
+        */
+       if (error == 0) {
+               accessmode = VREAD;
+               if (update ?
+                   (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+                   (mp->mnt_flag & MNT_RDONLY) == 0)
+                       accessmode |= VWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = genfs_can_mount(devvp, accessmode, l->l_cred);
+               VOP_UNLOCK(devvp);
+       }
+
+       if (error) {
+               vrele(devvp);
+               return (error);
+       }
+
+       if (!update) {
+               int xflags;
+
+               if (mp->mnt_flag & MNT_RDONLY)
+                       xflags = FREAD;
+               else
+                       xflags = FREAD|FWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = VOP_OPEN(devvp, xflags, FSCRED);
+               VOP_UNLOCK(devvp);
+               if (error)
+                       goto fail;
+               error = ext2fs_mountfs(devvp, mp);
+               if (error) {
+                       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+                       (void)VOP_CLOSE(devvp, xflags, NOCRED);
+                       VOP_UNLOCK(devvp);
+                       goto fail;
+               }
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_e2fs;
+       } else {
+               /*
+                * Update the mount.
+                */
+
+               /*
+                * The initial mount got a reference on this
+                * device, so drop the one obtained via
+                * namei(), above.
+                */
+               vrele(devvp);
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_e2fs;
+               if (fs->e2fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+                       /*
+                        * Changing from r/w to r/o
+                        */
+                       flags = WRITECLOSE;
+                       if (mp->mnt_flag & MNT_FORCE)
+                               flags |= FORCECLOSE;
+                       error = ext2fs_flushfiles(mp, flags);
+                       if (error == 0 &&
+                           ext2fs_cgupdate(ump, MNT_WAIT) == 0 &&
+                           (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) {
+                               fs->e2fs.e2fs_state = E2FS_ISCLEAN;
+                               (void) ext2fs_sbupdate(ump, MNT_WAIT);
+                       }
+                       if (error)
+                               return (error);
+                       fs->e2fs_ronly = 1;
+               }
+
+               if (mp->mnt_flag & MNT_RELOAD) {
+                       error = ext2fs_reload(mp, l->l_cred, l);
+                       if (error)
+                               return (error);
+               }
+
+               if (fs->e2fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+                       /*
+                        * Changing from read-only to read/write
+                        */
+                       fs->e2fs_ronly = 0;
+                       if (fs->e2fs.e2fs_state == E2FS_ISCLEAN)
+                               fs->e2fs.e2fs_state = 0;
+                       else
+                               fs->e2fs.e2fs_state = E2FS_ERRORS;
+                       fs->e2fs_fmod = 1;
+               }
+               if (args->fspec == NULL)
+                       return 0;
+       }
+
+       error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+           UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+       (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs_fsmnt,
+           sizeof(fs->e2fs_fsmnt) - 1, &size);
+       memset(fs->e2fs_fsmnt + size, 0, sizeof(fs->e2fs_fsmnt) - size);
+       if (fs->e2fs.e2fs_rev > E2FS_REV0) {
+               (void) copystr(mp->mnt_stat.f_mntonname, fs->e2fs.e2fs_fsmnt,
+                   sizeof(fs->e2fs.e2fs_fsmnt) - 1, &size);
+               memset(fs->e2fs.e2fs_fsmnt + size, 0,
+                   sizeof(fs->e2fs.e2fs_fsmnt) - size);
+       }
+       if (fs->e2fs_fmod != 0) {       /* XXX */
+               fs->e2fs_fmod = 0;
+               if (fs->e2fs.e2fs_state == 0)
+                       fs->e2fs.e2fs_wtime = time_second;
+               else
+                       printf("%s: file system not clean; please fsck(8)\n",
+                               mp->mnt_stat.f_mntfromname);
+               (void) ext2fs_cgupdate(ump, MNT_WAIT);
+       }
+       return (error);
+
+fail:
+       vrele(devvp);
+       return (error);
+}
+
+/*
+ * Reload all incore data for a filesystem (used after running fsck on
+ * the root filesystem and finding things to fix). The filesystem must
+ * be mounted read-only.
+ *
+ * Things to do to update the mount:
+ *     1) invalidate all cached meta-data.
+ *     2) re-read superblock from disk.
+ *     3) re-read summary information from disk.
+ *     4) invalidate all inactive vnodes.
+ *     5) invalidate all cached file data.
+ *     6) re-read inode data for all active vnodes.
+ */
+int
+ext2fs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
+{
+       struct vnode *vp, *mvp, *devvp;
+       struct inode *ip;
+       struct buf *bp;
+       struct m_ext2fs *fs;
+       struct ext2fs *newfs;
+       int i, error;
+       void *cp;
+       struct ufsmount *ump;
+
+       if ((mp->mnt_flag & MNT_RDONLY) == 0)
+               return (EINVAL);
+
+       ump = VFSTOUFS(mp);
+       /*
+        * Step 1: invalidate all cached meta-data.
+        */
+       devvp = ump->um_devvp;
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = vinvalbuf(devvp, 0, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (error)
+               panic("ext2fs_reload: dirty1");
+       /*
+        * Step 2: re-read superblock from disk.
+        */
+       error = bread(devvp, SBLOCK, SBSIZE, NOCRED, 0, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       newfs = (struct ext2fs *)bp->b_data;
+       error = ext2fs_checksb(newfs, (mp->mnt_flag & MNT_RDONLY) != 0);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+
+       fs = ump->um_e2fs;
+       /*
+        * copy in new superblock, and compute in-memory values
+        */
+       e2fs_sbload(newfs, &fs->e2fs);
+       fs->e2fs_ncg =
+           howmany(fs->e2fs.e2fs_bcount - fs->e2fs.e2fs_first_dblock,
+           fs->e2fs.e2fs_bpg);
+       fs->e2fs_fsbtodb = fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT;
+       fs->e2fs_bsize = MINBSIZE << fs->e2fs.e2fs_log_bsize;
+       fs->e2fs_bshift = LOG_MINBSIZE + fs->e2fs.e2fs_log_bsize;
+       fs->e2fs_qbmask = fs->e2fs_bsize - 1;
+       fs->e2fs_bmask = ~fs->e2fs_qbmask;
+       fs->e2fs_ngdb =
+           howmany(fs->e2fs_ncg, fs->e2fs_bsize / sizeof(struct ext2_gd));
+       fs->e2fs_ipb = fs->e2fs_bsize / EXT2_DINODE_SIZE(fs);
+       fs->e2fs_itpg = fs->e2fs.e2fs_ipg / fs->e2fs_ipb;
+       brelse(bp, 0);
+
+       /*
+        * Step 3: re-read summary information from disk.
+        */
+
+       for (i = 0; i < fs->e2fs_ngdb; i++) {
+               error = bread(devvp,
+                   fsbtodb(fs, fs->e2fs.e2fs_first_dblock +
+                   1 /* superblock */ + i),
+                   fs->e2fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       return (error);
+               }
+               e2fs_cgload((struct ext2_gd *)bp->b_data,
+                   &fs->e2fs_gd[i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
+                   fs->e2fs_bsize);
+               brelse(bp, 0);
+       }
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+       /*
+        * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+        * and vclean() can be called indirectly
+        */
+       mutex_enter(&mntvnode_lock);
+loop:
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               if (vp->v_mount != mp || vismarker(vp))
+                       continue;
+               /*
+                * Step 4: invalidate all inactive vnodes.
+                */
+               if (vrecycle(vp, &mntvnode_lock, l)) {
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       goto loop;
+               }
+               /*
+                * Step 5: invalidate all cached file data.
+                */
+               mutex_enter(vp->v_interlock);
+               mutex_exit(&mntvnode_lock);
+               if (vget(vp, LK_EXCLUSIVE)) {
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       goto loop;
+               }
+               if (vinvalbuf(vp, 0, cred, l, 0, 0))
+                       panic("ext2fs_reload: dirty2");
+               /*
+                * Step 6: re-read inode data for all active vnodes.
+                */
+               ip = VTOI(vp);
+               error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+                   (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       vput(vp);
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       break;
+               }
+               cp = (char *)bp->b_data +
+                   (ino_to_fsbo(fs, ip->i_number) * EXT2_DINODE_SIZE(fs));
+               e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din);
+               ext2fs_set_inode_guid(ip);
+               brelse(bp, 0);
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       vnfree(mvp);
+       return (error);
+}
+
+/*
+ * Common code for mount and mountroot
+ */
+int
+ext2fs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+       struct lwp *l = curlwp;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct ext2fs *fs;
+       struct m_ext2fs *m_fs;
+       dev_t dev;
+       int error, i, ronly;
+       kauth_cred_t cred;
+       struct proc *p;
+
+       dev = devvp->v_rdev;
+       p = l ? l->l_proc : NULL;
+       cred = l ? l->l_cred : NOCRED;
+
+       /* Flush out any old buffers remaining from a previous use. */
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (error)
+               return (error);
+
+       ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+       bp = NULL;
+       ump = NULL;
+
+#ifdef DEBUG_EXT2
+       printf("ext2 sb size: %zu\n", sizeof(struct ext2fs));
+#endif
+       error = bread(devvp, SBLOCK, SBSIZE, cred, 0, &bp);
+       if (error)
+               goto out;
+       fs = (struct ext2fs *)bp->b_data;
+       error = ext2fs_checksb(fs, ronly);
+       if (error)
+               goto out;
+       ump = malloc(sizeof(*ump), M_UFSMNT, M_WAITOK);
+       memset(ump, 0, sizeof(*ump));
+       ump->um_fstype = UFS1;
+       ump->um_ops = &ext2fs_ufsops;
+       ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_UFSMNT, M_WAITOK);
+       memset(ump->um_e2fs, 0, sizeof(struct m_ext2fs));
+       e2fs_sbload((struct ext2fs *)bp->b_data, &ump->um_e2fs->e2fs);
+       brelse(bp, 0);
+       bp = NULL;
+       m_fs = ump->um_e2fs;
+       m_fs->e2fs_ronly = ronly;
+
+#ifdef DEBUG_EXT2
+       printf("ext2 ino size %zu\n", EXT2_DINODE_SIZE(m_fs));
+#endif
+       if (ronly == 0) {
+               if (m_fs->e2fs.e2fs_state == E2FS_ISCLEAN)
+                       m_fs->e2fs.e2fs_state = 0;
+               else
+                       m_fs->e2fs.e2fs_state = E2FS_ERRORS;
+               m_fs->e2fs_fmod = 1;
+       }
+
+       /* compute dynamic sb infos */
+       m_fs->e2fs_ncg =
+           howmany(m_fs->e2fs.e2fs_bcount - m_fs->e2fs.e2fs_first_dblock,
+           m_fs->e2fs.e2fs_bpg);
+       m_fs->e2fs_fsbtodb = m_fs->e2fs.e2fs_log_bsize + LOG_MINBSIZE - DEV_BSHIFT;
+       m_fs->e2fs_bsize = MINBSIZE << m_fs->e2fs.e2fs_log_bsize;
+       m_fs->e2fs_bshift = LOG_MINBSIZE + m_fs->e2fs.e2fs_log_bsize;
+       m_fs->e2fs_qbmask = m_fs->e2fs_bsize - 1;
+       m_fs->e2fs_bmask = ~m_fs->e2fs_qbmask;
+       m_fs->e2fs_ngdb =
+           howmany(m_fs->e2fs_ncg, m_fs->e2fs_bsize / sizeof(struct ext2_gd));
+       m_fs->e2fs_ipb = m_fs->e2fs_bsize / EXT2_DINODE_SIZE(m_fs);
+       m_fs->e2fs_itpg = m_fs->e2fs.e2fs_ipg / m_fs->e2fs_ipb;
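+       /*
+        * e.g. with e2fs_log_bsize == 2 (and the usual LOG_MINBSIZE of 10,
+        * DEV_BSHIFT of 9): bsize = 1024 << 2 = 4096, bshift = 10 + 2 = 12,
+        * fsbtodb = 2 + 10 - 9 = 3 (eight 512-byte sectors per block) and
+        * qbmask = 0xfff.
+        */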
+
+       m_fs->e2fs_gd = malloc(m_fs->e2fs_ngdb * m_fs->e2fs_bsize,
+           M_UFSMNT, M_WAITOK);
+       for (i = 0; i < m_fs->e2fs_ngdb; i++) {
+               error = bread(devvp,
+                   fsbtodb(m_fs, m_fs->e2fs.e2fs_first_dblock +
+                   1 /* superblock */ + i),
+                   m_fs->e2fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       free(m_fs->e2fs_gd, M_UFSMNT);
+                       goto out;
+               }
+               e2fs_cgload((struct ext2_gd *)bp->b_data,
+                   &m_fs->e2fs_gd[
+                       i * m_fs->e2fs_bsize / sizeof(struct ext2_gd)],
+                   m_fs->e2fs_bsize);
+               brelse(bp, 0);
+               bp = NULL;
+       }
+
+       mp->mnt_data = ump;
+       mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+       mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_EXT2FS);
+       mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+       mp->mnt_stat.f_namemax = EXT2FS_MAXNAMLEN;
+       mp->mnt_flag |= MNT_LOCAL;
+       mp->mnt_dev_bshift = DEV_BSHIFT;        /* XXX */
+       mp->mnt_fs_bshift = m_fs->e2fs_bshift;
+       mp->mnt_iflag |= IMNT_DTYPE;
+       ump->um_flags = 0;
+       ump->um_mountp = mp;
+       ump->um_dev = dev;
+       ump->um_devvp = devvp;
+       ump->um_nindir = NINDIR(m_fs);
+       ump->um_lognindir = ffs(NINDIR(m_fs)) - 1;
+       ump->um_bptrtodb = m_fs->e2fs_fsbtodb;
+       ump->um_seqinc = 1; /* no frags */
+       ump->um_maxsymlinklen = EXT2_MAXSYMLINKLEN;
+       ump->um_dirblksiz = m_fs->e2fs_bsize;
+       ump->um_maxfilesize = ((uint64_t)0x80000000 * m_fs->e2fs_bsize - 1);
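+       /* i.e. 2^31 blocks: with 4 KiB blocks, 2^43 - 1 bytes (~8 TiB) */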
+       devvp->v_specmountpoint = mp;
+       return (0);
+
+out:
+       KASSERT(bp != NULL);
+       brelse(bp, 0);
+       if (ump) {
+               free(ump->um_e2fs, M_UFSMNT);
+               free(ump, M_UFSMNT);
+               mp->mnt_data = NULL;
+       }
+       return (error);
+}
+
+/*
+ * unmount system call
+ */
+int
+ext2fs_unmount(struct mount *mp, int mntflags)
+{
+       struct ufsmount *ump;
+       struct m_ext2fs *fs;
+       int error, flags;
+
+       flags = 0;
+       if (mntflags & MNT_FORCE)
+               flags |= FORCECLOSE;
+       if ((error = ext2fs_flushfiles(mp, flags)) != 0)
+               return (error);
+       ump = VFSTOUFS(mp);
+       fs = ump->um_e2fs;
+       if (fs->e2fs_ronly == 0 &&
+               ext2fs_cgupdate(ump, MNT_WAIT) == 0 &&
+               (fs->e2fs.e2fs_state & E2FS_ERRORS) == 0) {
+               fs->e2fs.e2fs_state = E2FS_ISCLEAN;
+               (void) ext2fs_sbupdate(ump, MNT_WAIT);
+       }
+       if (ump->um_devvp->v_type != VBAD)
+               ump->um_devvp->v_specmountpoint = NULL;
+       vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = VOP_CLOSE(ump->um_devvp, fs->e2fs_ronly ? FREAD : FREAD|FWRITE,
+           NOCRED);
+       vput(ump->um_devvp);
+       free(fs->e2fs_gd, M_UFSMNT);
+       free(fs, M_UFSMNT);
+       free(ump, M_UFSMNT);
+       mp->mnt_data = NULL;
+       mp->mnt_flag &= ~MNT_LOCAL;
+       return (error);
+}
+
+/*
+ * Flush out all the files in a filesystem.
+ */
+int
+ext2fs_flushfiles(struct mount *mp, int flags)
+{
+       extern int doforce;
+       int error;
+
+       if (!doforce)
+               flags &= ~FORCECLOSE;
+       error = vflush(mp, NULLVP, flags);
+       return (error);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+ext2fs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+       struct ufsmount *ump;
+       struct m_ext2fs *fs;
+       uint32_t overhead, overhead_per_group, ngdb;
+       int i, ngroups;
+
+       ump = VFSTOUFS(mp);
+       fs = ump->um_e2fs;
+       if (fs->e2fs.e2fs_magic != E2FS_MAGIC)
+               panic("ext2fs_statvfs");
+
+       /*
+        * Compute the overhead (FS structures)
+        */
+       overhead_per_group =
+           1 /* block bitmap */ +
+           1 /* inode bitmap */ +
+           fs->e2fs_itpg;
+       overhead = fs->e2fs.e2fs_first_dblock +
+           fs->e2fs_ncg * overhead_per_group;
+       if (fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           fs->e2fs.e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) {
+               for (i = 0, ngroups = 0; i < fs->e2fs_ncg; i++) {
+                       if (cg_has_sb(i))
+                               ngroups++;
+               }
+       } else {
+               ngroups = fs->e2fs_ncg;
+       }
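+       /*
+        * (with EXT2F_ROCOMPAT_SPARSESUPER only groups 0, 1 and powers of
+        * 3, 5 and 7 carry superblock/group-descriptor backups, which is
+        * what cg_has_sb() tests above)
+        */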
+       ngdb = fs->e2fs_ngdb;
+       if (fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           fs->e2fs.e2fs_features_compat & EXT2F_COMPAT_RESIZE)
+               ngdb += fs->e2fs.e2fs_reserved_ngdb;
+       overhead += ngroups * (1 /* superblock */ + ngdb);
+
+       sbp->f_bsize = fs->e2fs_bsize;
+       sbp->f_frsize = MINBSIZE << fs->e2fs.e2fs_fsize;
+       sbp->f_iosize = fs->e2fs_bsize;
+       sbp->f_blocks = fs->e2fs.e2fs_bcount - overhead;
+       sbp->f_bfree = fs->e2fs.e2fs_fbcount;
+       sbp->f_bresvd = fs->e2fs.e2fs_rbcount;
+       if (sbp->f_bfree > sbp->f_bresvd)
+               sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+       else
+               sbp->f_bavail = 0;
+       sbp->f_files = fs->e2fs.e2fs_icount;
+       sbp->f_ffree = fs->e2fs.e2fs_ficount;
+       sbp->f_favail = fs->e2fs.e2fs_ficount;
+       sbp->f_fresvd = 0;
+       copy_statvfs_info(sbp, mp);
+       return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+ext2fs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+       struct vnode *vp, *mvp;
+       struct inode *ip;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct m_ext2fs *fs;
+       int error, allerror = 0;
+
+       fs = ump->um_e2fs;
+       if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) {        /* XXX */
+               printf("fs = %s\n", fs->e2fs_fsmnt);
+               panic("update: rofs mod");
+       }
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+
+       /*
+        * Write back each (modified) inode.
+        */
+       mutex_enter(&mntvnode_lock);
+loop:
+       /*
+        * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
+        * and vclean() can be called indirectly
+        */
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               if (vp->v_mount != mp || vismarker(vp))
+                       continue;
+               mutex_enter(vp->v_interlock);
+               ip = VTOI(vp);
+               if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 ||
+                   vp->v_type == VNON ||
+                   ((ip->i_flag &
+                     (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
+                    LIST_EMPTY(&vp->v_dirtyblkhd) &&
+                    UVM_OBJ_IS_CLEAN(&vp->v_uobj)))
+               {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               mutex_exit(&mntvnode_lock);
+               error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+               if (error) {
+                       mutex_enter(&mntvnode_lock);
+                       if (error == ENOENT) {
+                               (void)vunmark(mvp);
+                               goto loop;
+                       }
+                       continue;
+               }
+               if (vp->v_type == VREG && waitfor == MNT_LAZY)
+                       error = ext2fs_update(vp, NULL, NULL, 0);
+               else
+                       error = VOP_FSYNC(vp, cred,
+                           waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0);
+               if (error)
+                       allerror = error;
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       vnfree(mvp);
+       /*
+        * Force stale file system control information to be flushed.
+        */
+       if (waitfor != MNT_LAZY) {
+               vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+               if ((error = VOP_FSYNC(ump->um_devvp, cred,
+                   waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0)
+                       allerror = error;
+               VOP_UNLOCK(ump->um_devvp);
+       }
+       /*
+        * Write back modified superblock.
+        */
+       if (fs->e2fs_fmod != 0) {
+               fs->e2fs_fmod = 0;
+               fs->e2fs.e2fs_wtime = time_second;
+               if ((error = ext2fs_cgupdate(ump, waitfor)))
+                       allerror = error;
+       }
+       return (allerror);
+}
+
+/*
+ * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
+ * in from disk.  If it is in core, wait for the lock bit to clear, then
+ * return the inode locked.  Detection and handling of mount points must be
+ * done by the calling routine.
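+ *
+ * Typical entry is through the vfs layer, e.g. (sketch):
+ *
+ *     if ((error = VFS_VGET(mp, ino, &vp)) != 0)
+ *             return (error);
+ *     ...use the locked vnode, then vput(vp) to unlock and release...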
+ */
+int
+ext2fs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+       struct m_ext2fs *fs;
+       struct inode *ip;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct vnode *vp;
+       dev_t dev;
+       int error;
+       void *cp;
+
+       ump = VFSTOUFS(mp);
+       dev = ump->um_dev;
+retry:
+       if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+               return (0);
+
+       /* Allocate a new vnode/inode. */
+       error = getnewvnode(VT_EXT2FS, mp, ext2fs_vnodeop_p, NULL, &vp);
+       if (error) {
+               *vpp = NULL;
+               return (error);
+       }
+       ip = pool_get(&ext2fs_inode_pool, PR_WAITOK);
+
+       mutex_enter(&ufs_hashlock);
+       if ((*vpp = ufs_ihashget(dev, ino, 0)) != NULL) {
+               mutex_exit(&ufs_hashlock);
+               ungetnewvnode(vp);
+               pool_put(&ext2fs_inode_pool, ip);
+               goto retry;
+       }
+
+       vp->v_vflag |= VV_LOCKSWORK;
+
+       memset(ip, 0, sizeof(struct inode));
+       vp->v_data = ip;
+       ip->i_vnode = vp;
+       ip->i_ump = ump;
+       ip->i_e2fs = fs = ump->um_e2fs;
+       ip->i_dev = dev;
+       ip->i_number = ino;
+       ip->i_e2fs_last_lblk = 0;
+       ip->i_e2fs_last_blk = 0;
+       genfs_node_init(vp, &ext2fs_genfsops);
+
+       /*
+        * Put it onto its hash chain and lock it so that other requests for
+        * this inode will block if they arrive while we are sleeping waiting
+        * for old data structures to be purged or for the contents of the
+        * disk portion of this inode to be read.
+        */
+
+       ufs_ihashins(ip);
+       mutex_exit(&ufs_hashlock);
+
+       /* Read in the disk contents for the inode, copy into the inode. */
+       error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+           (int)fs->e2fs_bsize, NOCRED, 0, &bp);
+       if (error) {
+
+               /*
+                * The inode does not contain anything useful, so it would
+                * be misleading to leave it on its hash chain. With mode
+                * still zero, it will be unlinked and returned to the free
+                * list by vput().
+                */
+
+               vput(vp);
+               brelse(bp, 0);
+               *vpp = NULL;
+               return (error);
+       }
+       cp = (char *)bp->b_data + (ino_to_fsbo(fs, ino) * EXT2_DINODE_SIZE(fs));
+       ip->i_din.e2fs_din = pool_get(&ext2fs_dinode_pool, PR_WAITOK);
+       e2fs_iload((struct ext2fs_dinode *)cp, ip->i_din.e2fs_din);
+       ext2fs_set_inode_guid(ip);
+       brelse(bp, 0);
+
+       /* If the inode was deleted, reset all fields */
+       if (ip->i_e2fs_dtime != 0) {
+               ip->i_e2fs_mode = ip->i_e2fs_nblock = 0;
+               (void)ext2fs_setsize(ip, 0);
+               memset(ip->i_e2fs_blocks, 0, sizeof(ip->i_e2fs_blocks));
+       }
+
+       /*
+        * Initialize the vnode from the inode, check for aliases.
+        */
+
+       error = ext2fs_vinit(mp, ext2fs_specop_p, ext2fs_fifoop_p, &vp);
+       if (error) {
+               vput(vp);
+               *vpp = NULL;
+               return (error);
+       }
+       /*
+        * Finish inode initialization now that aliasing has been resolved.
+        */
+
+       ip->i_devvp = ump->um_devvp;
+       vref(ip->i_devvp);
+
+       /*
+        * Set up a generation number for this inode if it does not
+        * already have one. This should only happen on old filesystems.
+        */
+
+       if (ip->i_e2fs_gen == 0) {
+               if (++ext2gennumber < (u_long)time_second)
+                       ext2gennumber = time_second;
+               ip->i_e2fs_gen = ext2gennumber;
+               if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+                       ip->i_flag |= IN_MODIFIED;
+       }
+       uvm_vnp_setsize(vp, ext2fs_size(ip));
+       *vpp = vp;
+       return (0);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is valid
+ * - call ext2fs_vget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ */
+int
+ext2fs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+       struct inode *ip;
+       struct vnode *nvp;
+       int error;
+       struct ufid ufh;
+       struct m_ext2fs *fs;
+
+       if (fhp->fid_len != sizeof(struct ufid))
+               return EINVAL;
+
+       memcpy(&ufh, fhp, sizeof(struct ufid));
+       fs = VFSTOUFS(mp)->um_e2fs;
+       if ((ufh.ufid_ino < EXT2_FIRSTINO && ufh.ufid_ino != EXT2_ROOTINO) ||
+               ufh.ufid_ino >= fs->e2fs_ncg * fs->e2fs.e2fs_ipg)
+               return (ESTALE);
+
+       if ((error = VFS_VGET(mp, ufh.ufid_ino, &nvp)) != 0) {
+               *vpp = NULLVP;
+               return (error);
+       }
+       ip = VTOI(nvp);
+       if (ip->i_e2fs_mode == 0 || ip->i_e2fs_dtime != 0 ||
+               ip->i_e2fs_gen != ufh.ufid_gen) {
+               vput(nvp);
+               *vpp = NULLVP;
+               return (ESTALE);
+       }
+       *vpp = nvp;
+       return (0);
+}
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+ext2fs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+       struct inode *ip;
+       struct ufid ufh;
+
+       if (*fh_size < sizeof(struct ufid)) {
+               *fh_size = sizeof(struct ufid);
+               return E2BIG;
+       }
+       *fh_size = sizeof(struct ufid);
+
+       ip = VTOI(vp);
+       memset(&ufh, 0, sizeof(ufh));
+       ufh.ufid_len = sizeof(struct ufid);
+       ufh.ufid_ino = ip->i_number;
+       ufh.ufid_gen = ip->i_e2fs_gen;
+       memcpy(fhp, &ufh, sizeof(ufh));
+       return (0);
+}
+
+/*
+ * Write a superblock and associated information back to disk.
+ */
+int
+ext2fs_sbupdate(struct ufsmount *mp, int waitfor)
+{
+       struct m_ext2fs *fs = mp->um_e2fs;
+       struct buf *bp;
+       int error = 0;
+
+       bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0);
+       e2fs_sbsave(&fs->e2fs, (struct ext2fs*)bp->b_data);
+       if (waitfor == MNT_WAIT)
+               error = bwrite(bp);
+       else
+               bawrite(bp);
+       return (error);
+}
+
+int
+ext2fs_cgupdate(struct ufsmount *mp, int waitfor)
+{
+       struct m_ext2fs *fs = mp->um_e2fs;
+       struct buf *bp;
+       int i, error = 0, allerror = 0;
+
+       allerror = ext2fs_sbupdate(mp, waitfor);
+       for (i = 0; i < fs->e2fs_ngdb; i++) {
+               bp = getblk(mp->um_devvp, fsbtodb(fs,
+                   fs->e2fs.e2fs_first_dblock +
+                   1 /* superblock */ + i), fs->e2fs_bsize, 0, 0);
+               e2fs_cgsave(&fs->e2fs_gd[
+                   i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
+                   (struct ext2_gd *)bp->b_data, fs->e2fs_bsize);
+               if (waitfor == MNT_WAIT)
+                       error = bwrite(bp);
+               else
+                       bawrite(bp);
+       }
+
+       if (!allerror && error)
+               allerror = error;
+       return (allerror);
+}
+
+static int
+ext2fs_checksb(struct ext2fs *fs, int ronly)
+{
+
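+       /* E2FS_MAGIC is the little-endian 0xef53 superblock signature */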
+       if (fs2h16(fs->e2fs_magic) != E2FS_MAGIC) {
+               return (EINVAL);                /* XXX needs translation */
+       }
+       if (fs2h32(fs->e2fs_rev) > E2FS_REV1) {
+#ifdef DIAGNOSTIC
+               printf("Ext2 fs: unsupported revision number: %x\n",
+                   fs2h32(fs->e2fs_rev));
+#endif
+               return (EINVAL);                /* XXX needs translation */
+       }
+       if (fs2h32(fs->e2fs_log_bsize) > 2) { /* block size = 1024|2048|4096 */
+#ifdef DIAGNOSTIC
+               printf("Ext2 fs: bad block size: %d "
+                   "(expected <= 2 for ext2 fs)\n",
+                   fs2h32(fs->e2fs_log_bsize));
+#endif
+               return (EINVAL);           /* XXX needs translation */
+       }
+       if (fs2h32(fs->e2fs_rev) > E2FS_REV0) {
+               if (fs2h32(fs->e2fs_first_ino) != EXT2_FIRSTINO) {
+                       printf("Ext2 fs: unsupported first inode position\n");
+                       return (EINVAL);      /* XXX needs translation */
+               }
+               if (fs2h32(fs->e2fs_features_incompat) &
+                   ~EXT2F_INCOMPAT_SUPP) {
+                       printf("Ext2 fs: unsupported optional feature\n");
+                       return (EINVAL);      /* XXX needs translation */
+               }
+               if (!ronly && fs2h32(fs->e2fs_features_rocompat) &
+                   ~EXT2F_ROCOMPAT_SUPP) {
+                       return (EROFS);      /* XXX needs translation */
+               }
+       }
+       return (0);
+}
diff --git a/sys/ufs/ext2fs/ext2fs_vnops.c b/sys/ufs/ext2fs/ext2fs_vnops.c
new file mode 100644 (file)
index 0000000..0ea5dd3
--- /dev/null
@@ -0,0 +1,1664 @@
+/*     $NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $      */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+/*
+ * Copyright (c) 1997 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *     @(#)ufs_vnops.c 8.14 (Berkeley) 10/26/94
+ * Modified for ext2fs by Manuel Bouyer.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ext2fs_vnops.c,v 1.101 2011/11/18 21:18:51 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+
+#include <ufs/ext2fs/ext2fs.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+
+extern int prtactive;
+
+static int ext2fs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *);
+static int ext2fs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t,
+                               struct lwp *);
+
+union _qcvt {
+       int64_t qcvt;
+       int32_t val[2];
+};
+
+#define SETHIGH(q, h) { \
+       union _qcvt tmp; \
+       tmp.qcvt = (q); \
+       tmp.val[_QUAD_HIGHWORD] = (h); \
+       (q) = tmp.qcvt; \
+}
+#define SETLOW(q, l) { \
+       union _qcvt tmp; \
+       tmp.qcvt = (q); \
+       tmp.val[_QUAD_LOWWORD] = (l); \
+       (q) = tmp.qcvt; \
+}
+
+/*
+ * Create a regular file
+ */
+int
+ext2fs_create(void *v)
+{
+       struct vop_create_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       int     error;
+
+       error =
+           ext2fs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
+                            ap->a_dvp, ap->a_vpp, ap->a_cnp);
+
+       if (error)
+               return (error);
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       return (0);
+}
+
+/*
+ * Mknod vnode call
+ */
+/* ARGSUSED */
+int
+ext2fs_mknod(void *v)
+{
+       struct vop_mknod_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       struct vattr *vap = ap->a_vap;
+       struct vnode **vpp = ap->a_vpp;
+       struct inode *ip;
+       int error;
+       struct mount    *mp;
+       ino_t           ino;
+
+       if ((error = ext2fs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+                   ap->a_dvp, vpp, ap->a_cnp)) != 0)
+               return (error);
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       ip = VTOI(*vpp);
+       mp  = (*vpp)->v_mount;
+       ino = ip->i_number;
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       if (vap->va_rdev != VNOVAL) {
+               /*
+                * Want to be able to use this to make badblock
+                * inodes, so don't truncate the dev number.
+                */
+               ip->i_din.e2fs_din->e2di_rdev = h2fs32(vap->va_rdev);
+       }
+       /*
+        * Remove inode so that it will be reloaded by VFS_VGET and
+        * checked to see if it is an alias of an existing entry in
+        * the inode cache.
+        */
+       VOP_UNLOCK(*vpp);
+       (*vpp)->v_type = VNON;
+       vgone(*vpp);
+       error = VFS_VGET(mp, ino, vpp);
+       if (error != 0) {
+               *vpp = NULL;
+               return (error);
+       }
+       return (0);
+}
+
+/*
+ * Open called.
+ *
+ * Just check the APPEND flag.
+ */
+/* ARGSUSED */
+int
+ext2fs_open(void *v)
+{
+       struct vop_open_args /* {
+               struct vnode *a_vp;
+               int  a_mode;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+
+       /*
+        * Files marked append-only must be opened for appending.
+        */
+       if ((VTOI(ap->a_vp)->i_e2fs_flags & EXT2_APPEND) &&
+               (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+               return (EPERM);
+       return (0);
+}
+
+static int
+ext2fs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode)
+{
+
+       /*
+        * Disallow write attempts on read-only file systems;
+        * unless the file is a socket, fifo, or a block or
+        * character device resident on the file system.
+        */
+       if (mode & VWRITE) {
+               switch (vp->v_type) {
+               case VDIR:
+               case VLNK:
+               case VREG:
+                       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                               return (EROFS);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* If immutable bit set, nobody gets to write it. */
+       if ((mode & VWRITE) && (ip->i_e2fs_flags & EXT2_IMMUTABLE))
+               return (EPERM);
+
+       return 0;
+}
+
+static int
+ext2fs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode,
+    kauth_cred_t cred)
+{
+
+       return genfs_can_access(vp->v_type, ip->i_e2fs_mode & ALLPERMS,
+           ip->i_uid, ip->i_gid, mode, cred);
+}
+
+int
+ext2fs_access(void *v)
+{
+       struct vop_access_args /* {
+               struct vnode *a_vp;
+               int  a_mode;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       mode_t mode = ap->a_mode;
+       int error;
+
+       error = ext2fs_check_possible(vp, ip, mode);
+       if (error)
+               return error;
+
+       error = ext2fs_check_permitted(vp, ip, mode, ap->a_cred);
+
+       return error;
+}
+
+/* ARGSUSED */
+int
+ext2fs_getattr(void *v)
+{
+       struct vop_getattr_args /* {
+               struct vnode *a_vp;
+               struct vattr *a_vap;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct vattr *vap = ap->a_vap;
+
+       EXT2FS_ITIMES(ip, NULL, NULL, NULL);
+       /*
+        * Copy from inode table
+        */
+       vap->va_fsid = ip->i_dev;
+       vap->va_fileid = ip->i_number;
+       vap->va_mode = ip->i_e2fs_mode & ALLPERMS;
+       vap->va_nlink = ip->i_e2fs_nlink;
+       vap->va_uid = ip->i_uid;
+       vap->va_gid = ip->i_gid;
+       vap->va_rdev = (dev_t)fs2h32(ip->i_din.e2fs_din->e2di_rdev);
+       vap->va_size = vp->v_size;
+       vap->va_atime.tv_sec = ip->i_e2fs_atime;
+       vap->va_atime.tv_nsec = 0;
+       vap->va_mtime.tv_sec = ip->i_e2fs_mtime;
+       vap->va_mtime.tv_nsec = 0;
+       vap->va_ctime.tv_sec = ip->i_e2fs_ctime;
+       vap->va_ctime.tv_nsec = 0;
+#ifdef EXT2FS_SYSTEM_FLAGS
+       vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? SF_APPEND : 0;
+       vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? SF_IMMUTABLE : 0;
+#else
+       vap->va_flags = (ip->i_e2fs_flags & EXT2_APPEND) ? UF_APPEND : 0;
+       vap->va_flags |= (ip->i_e2fs_flags & EXT2_IMMUTABLE) ? UF_IMMUTABLE : 0;
+#endif
+       vap->va_gen = ip->i_e2fs_gen;
+       /* this doesn't belong here */
+       if (vp->v_type == VBLK)
+               vap->va_blocksize = BLKDEV_IOSIZE;
+       else if (vp->v_type == VCHR)
+               vap->va_blocksize = MAXBSIZE;
+       else
+               vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+       vap->va_bytes = dbtob((u_quad_t)ip->i_e2fs_nblock);
+       vap->va_type = vp->v_type;
+       vap->va_filerev = ip->i_modrev;
+       return (0);
+}
+
+/*
+ * Set attribute vnode op. called from several syscalls
+ */
+int
+ext2fs_setattr(void *v)
+{
+       struct vop_setattr_args /* {
+               struct vnode *a_vp;
+               struct vattr *a_vap;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vattr *vap = ap->a_vap;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       kauth_cred_t cred = ap->a_cred;
+       struct lwp *l = curlwp;
+       int error;
+
+       /*
+        * Check for unsettable attributes.
+        */
+       if ((vap->va_type != VNON) || (vap->va_nlink != (nlink_t)VNOVAL) ||
+           (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+           (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+           ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+               return (EINVAL);
+       }
+       if (vap->va_flags != VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return (EROFS);
+               if (kauth_cred_geteuid(cred) != ip->i_uid &&
+                   (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+                   NULL)))
+                       return (error);
+#ifdef EXT2FS_SYSTEM_FLAGS
+               if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+                   NULL) == 0) {
+                       if ((ip->i_e2fs_flags &
+                           (EXT2_APPEND | EXT2_IMMUTABLE)) &&
+                           kauth_authorize_system(l->l_cred,
+                            KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL))
+                               return (EPERM);
+                       ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE);
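+               /*
+                * Each conditional below is parenthesized: `|' binds
+                * tighter than `?:', so the two flags must be computed
+                * separately before being or'ed together.
+                */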
+               ip->i_e2fs_flags |=
+                   ((vap->va_flags & SF_APPEND) ? EXT2_APPEND : 0) |
+                   ((vap->va_flags & SF_IMMUTABLE) ? EXT2_IMMUTABLE : 0);
+               } else
+                       return (EPERM);
+#else
+               ip->i_e2fs_flags &= ~(EXT2_APPEND | EXT2_IMMUTABLE);
+               ip->i_e2fs_flags |=
+                   ((vap->va_flags & UF_APPEND) ? EXT2_APPEND : 0) |
+                   ((vap->va_flags & UF_IMMUTABLE) ? EXT2_IMMUTABLE : 0);
+#endif
+               ip->i_flag |= IN_CHANGE;
+               if (vap->va_flags & (IMMUTABLE | APPEND))
+                       return (0);
+       }
+       if (ip->i_e2fs_flags & (EXT2_APPEND | EXT2_IMMUTABLE))
+               return (EPERM);
+       /*
+        * Go through the fields and update iff not VNOVAL.
+        */
+       if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return (EROFS);
+               error = ext2fs_chown(vp, vap->va_uid, vap->va_gid, cred, l);
+               if (error)
+                       return (error);
+       }
+       if (vap->va_size != VNOVAL) {
+               /*
+                * Disallow write attempts on read-only file systems;
+                * unless the file is a socket, fifo, or a block or
+                * character device resident on the file system.
+                */
+               switch (vp->v_type) {
+               case VDIR:
+                       return (EISDIR);
+               case VLNK:
+               case VREG:
+                       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                               return (EROFS);
+               default:
+                       break;
+               }
+               error = ext2fs_truncate(vp, vap->va_size, 0, cred);
+               if (error)
+                       return (error);
+       }
+       ip = VTOI(vp);
+       if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return (EROFS);
+               error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred);
+               if (error)
+                       return (error);
+               if (vap->va_atime.tv_sec != VNOVAL)
+                       if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
+                               ip->i_flag |= IN_ACCESS;
+               if (vap->va_mtime.tv_sec != VNOVAL) {
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (vp->v_mount->mnt_flag & MNT_RELATIME)
+                               ip->i_flag |= IN_ACCESS;
+               }
+               error = ext2fs_update(vp, &vap->va_atime, &vap->va_mtime,
+                       UPDATE_WAIT);
+               if (error)
+                       return (error);
+       }
+       error = 0;
+       if (vap->va_mode != (mode_t)VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                       return (EROFS);
+               error = ext2fs_chmod(vp, (int)vap->va_mode, cred, l);
+       }
+       VN_KNOTE(vp, NOTE_ATTRIB);
+       return (error);
+}
+
+/*
+ * Change the mode on a file.
+ * Inode must be locked before calling.
+ */
+static int
+ext2fs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
+{
+       struct inode *ip = VTOI(vp);
+       int error;
+
+       error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode);
+       if (error)
+               return (error);
+
+       ip->i_e2fs_mode &= ~ALLPERMS;
+       ip->i_e2fs_mode |= (mode & ALLPERMS);
+       ip->i_flag |= IN_CHANGE;
+       return (0);
+}
+
+/*
+ * Perform chown operation on inode ip;
+ * inode must be locked prior to call.
+ */
+static int
+ext2fs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
+               struct lwp *l)
+{
+       struct inode *ip = VTOI(vp);
+       uid_t ouid;
+       gid_t ogid;
+       int error;
+
+       if (uid == (uid_t)VNOVAL)
+               uid = ip->i_uid;
+       if (gid == (gid_t)VNOVAL)
+               gid = ip->i_gid;
+
+       error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid);
+       if (error)
+               return (error);
+
+       ogid = ip->i_gid;
+       ouid = ip->i_uid;
+
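+       /*
+        * ext2 revision 0 stores only 16-bit uids/gids; later revisions
+        * carry the upper 16 bits in separate "high" fields.
+        */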
+       ip->i_e2fs_gid = gid & 0xffff;
+       ip->i_e2fs_uid = uid & 0xffff;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+               ip->i_e2fs_gid_high = (gid >> 16) & 0xffff;
+               ip->i_e2fs_uid_high = (uid >> 16) & 0xffff;
+       } else {
+               ip->i_e2fs_gid_high = 0;
+               ip->i_e2fs_uid_high = 0;
+       }
+       if (ouid != uid || ogid != gid) {
+               ext2fs_set_inode_guid(ip);
+               ip->i_flag |= IN_CHANGE;
+       }
+       if (ouid != uid && kauth_authorize_generic(cred,
+           KAUTH_GENERIC_ISSUSER, NULL) != 0)
+               ip->i_e2fs_mode &= ~ISUID;
+       if (ogid != gid && kauth_authorize_generic(cred,
+           KAUTH_GENERIC_ISSUSER, NULL) != 0)
+               ip->i_e2fs_mode &= ~ISGID;
+       return (0);
+}
+
+int
+ext2fs_remove(void *v)
+{
+       struct vop_remove_args /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct inode *ip;
+       struct vnode *vp = ap->a_vp;
+       struct vnode *dvp = ap->a_dvp;
+       struct ufs_lookup_results *ulr;
+       int error;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       ip = VTOI(vp);
+       if (vp->v_type == VDIR ||
+               (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+               (VTOI(dvp)->i_e2fs_flags & EXT2_APPEND)) {
+               error = EPERM;
+       } else {
+               error = ext2fs_dirremove(dvp, ulr, ap->a_cnp);
+               if (error == 0) {
+                       ip->i_e2fs_nlink--;
+                       ip->i_flag |= IN_CHANGE;
+               }
+       }
+
+       VN_KNOTE(vp, NOTE_DELETE);
+       VN_KNOTE(dvp, NOTE_WRITE);
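+       /*
+        * dvp and vp share a single lock when they are the same vnode,
+        * so drop the second reference with vrele() instead of vput()
+        * to avoid unlocking twice.
+        */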
+       if (dvp == vp)
+               vrele(vp);
+       else
+               vput(vp);
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * ext2fs_link: create hard link.
+ */
+int
+ext2fs_link(void *v)
+{
+       struct vop_link_args /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *dvp = ap->a_dvp;
+       struct vnode *vp = ap->a_vp;
+       struct componentname *cnp = ap->a_cnp;
+       struct inode *ip;
+       int error;
+       struct ufs_lookup_results *ulr;
+
+       KASSERT(dvp != vp);
+       KASSERT(vp->v_type != VDIR);
+       KASSERT(dvp->v_mount == vp->v_mount);
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       error = vn_lock(vp, LK_EXCLUSIVE);
+       if (error) {
+               VOP_ABORTOP(dvp, cnp);
+               goto out2;
+       }
+       ip = VTOI(vp);
+       if ((nlink_t)ip->i_e2fs_nlink >= LINK_MAX) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EMLINK;
+               goto out1;
+       }
+       if (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EPERM;
+               goto out1;
+       }
+       ip->i_e2fs_nlink++;
+       ip->i_flag |= IN_CHANGE;
+       error = ext2fs_update(vp, NULL, NULL, UPDATE_WAIT);
+       if (!error)
+               error = ext2fs_direnter(ip, dvp, ulr, cnp);
+       if (error) {
+               ip->i_e2fs_nlink--;
+               ip->i_flag |= IN_CHANGE;
+       }
+out1:
+       VOP_UNLOCK(vp);
+out2:
+       VN_KNOTE(vp, NOTE_LINK);
+       VN_KNOTE(dvp, NOTE_WRITE);
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * Rename system call.
+ *     rename("foo", "bar");
+ * is essentially
+ *     unlink("bar");
+ *     link("foo", "bar");
+ *     unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensures the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ */
+int
+ext2fs_rename(void *v)
+{
+       struct vop_rename_args  /* {
+               struct vnode *a_fdvp;
+               struct vnode *a_fvp;
+               struct componentname *a_fcnp;
+               struct vnode *a_tdvp;
+               struct vnode *a_tvp;
+               struct componentname *a_tcnp;
+       } */ *ap = v;
+       struct vnode *tvp = ap->a_tvp;
+       struct vnode *tdvp = ap->a_tdvp;
+       struct vnode *fvp = ap->a_fvp;
+       struct vnode *fdvp = ap->a_fdvp;
+       struct componentname *tcnp = ap->a_tcnp;
+       struct componentname *fcnp = ap->a_fcnp;
+       struct inode *ip, *xp, *dp;
+       struct ext2fs_dirtemplate dirbuf;
+       int doingdirectory = 0, oldparent = 0, newparent = 0;
+       int error = 0;
+       u_char namlen;
+
+       /*
+        * Check for cross-device rename.
+        */
+       if ((fvp->v_mount != tdvp->v_mount) ||
+           (tvp && (fvp->v_mount != tvp->v_mount))) {
+               error = EXDEV;
+abortit:
+               VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+               if (tdvp == tvp)
+                       vrele(tdvp);
+               else
+                       vput(tdvp);
+               if (tvp)
+                       vput(tvp);
+               VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+               vrele(fdvp);
+               vrele(fvp);
+               return (error);
+       }
+
+       /*
+        * Check if just deleting a link name.
+        */
+       if (tvp && ((VTOI(tvp)->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+           (VTOI(tdvp)->i_e2fs_flags & EXT2_APPEND))) {
+               error = EPERM;
+               goto abortit;
+       }
+       if (fvp == tvp) {
+               if (fvp->v_type == VDIR) {
+                       error = EINVAL;
+                       goto abortit;
+               }
+
+               /* Release destination completely. */
+               VOP_ABORTOP(tdvp, tcnp);
+               vput(tdvp);
+               vput(tvp);
+
+               /* Delete source. */
+               vrele(fvp);
+               fcnp->cn_flags &= ~(MODMASK);
+               fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+               fcnp->cn_nameiop = DELETE;
+               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+               if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+                       vput(fdvp);
+                       return (error);
+               }
+               return (VOP_REMOVE(fdvp, fvp, fcnp));
+       }
+       if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
+               goto abortit;
+       dp = VTOI(fdvp);
+       ip = VTOI(fvp);
+       if ((nlink_t) ip->i_e2fs_nlink >= LINK_MAX) {
+               VOP_UNLOCK(fvp);
+               error = EMLINK;
+               goto abortit;
+       }
+       if ((ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND)) ||
+               (dp->i_e2fs_flags & EXT2_APPEND)) {
+               VOP_UNLOCK(fvp);
+               error = EPERM;
+               goto abortit;
+       }
+       if ((ip->i_e2fs_mode & IFMT) == IFDIR) {
+               error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+               if (!error && tvp)
+                       error = VOP_ACCESS(tvp, VWRITE, tcnp->cn_cred);
+               if (error) {
+                       VOP_UNLOCK(fvp);
+                       error = EACCES;
+                       goto abortit;
+               }
+               /*
+                * Avoid ".", "..", and aliases of "." for obvious reasons.
+                */
+               if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+                   dp == ip ||
+                   (fcnp->cn_flags & ISDOTDOT) ||
+                   (tcnp->cn_flags & ISDOTDOT) ||
+                   (ip->i_flag & IN_RENAME)) {
+                       VOP_UNLOCK(fvp);
+                       error = EINVAL;
+                       goto abortit;
+               }
+               ip->i_flag |= IN_RENAME;
+               oldparent = dp->i_number;
+               doingdirectory = 1;
+       }
+       VN_KNOTE(fdvp, NOTE_WRITE);             /* XXXLUKEM/XXX: right place? */
+
+       /*
+        * When the target exists, both the directory
+        * and target vnodes are returned locked.
+        */
+       dp = VTOI(tdvp);
+       xp = NULL;
+       if (tvp)
+               xp = VTOI(tvp);
+
+       /*
+        * 1) Bump link count while we're moving stuff
+        *    around.  If we crash somewhere before
+        *    completing our work, the link count
+        *    may be wrong, but correctable.
+        */
+       ip->i_e2fs_nlink++;
+       ip->i_flag |= IN_CHANGE;
+       if ((error = ext2fs_update(fvp, NULL, NULL, UPDATE_WAIT)) != 0) {
+               VOP_UNLOCK(fvp);
+               goto bad;
+       }
+
+       /*
+        * If ".." must be changed (ie the directory gets a new
+        * parent) then the source directory must not be in the
+        * directory hierarchy above the target, as this would
+        * orphan everything below the source directory. Also
+        * the user must have write permission in the source so
+        * as to be able to change "..". We must repeat the call
+        * to namei, as the parent directory is unlocked by the
+        * call to checkpath().
+        */
+       error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+       VOP_UNLOCK(fvp);
+       if (oldparent != dp->i_number)
+               newparent = dp->i_number;
+       if (doingdirectory && newparent) {
+               if (error)      /* write access check above */
+                       goto bad;
+               if (xp != NULL)
+                       vput(tvp);
+               vref(tdvp);     /* compensate for the ref checkpath loses */
+               error = ext2fs_checkpath(ip, dp, tcnp->cn_cred);
+               if (error != 0) {
+                       vrele(tdvp);
+                       goto out;
+               }
+               vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+               if ((error = relookup(tdvp, &tvp, tcnp, 0)) != 0) {
+                       vput(tdvp);
+                       goto out;
+               }
+               dp = VTOI(tdvp);
+               xp = NULL;
+               if (tvp)
+                       xp = VTOI(tvp);
+       }
+       /*
+        * 2) If target doesn't exist, link the target
+        *    to the source and unlink the source.
+        *    Otherwise, rewrite the target directory
+        *    entry to reference the source inode and
+        *    expunge the original entry's existence.
+        */
+       if (xp == NULL) {
+               if (dp->i_dev != ip->i_dev)
+                       panic("rename: EXDEV");
+               /*
+                * Account for ".." in new directory.
+                * When source and destination have the same
+                * parent we don't fool with the link count.
+                */
+               if (doingdirectory && newparent) {
+                       if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) {
+                               error = EMLINK;
+                               goto bad;
+                       }
+                       dp->i_e2fs_nlink++;
+                       dp->i_flag |= IN_CHANGE;
+                       if ((error = ext2fs_update(tdvp, NULL, NULL,
+                           UPDATE_WAIT)) != 0)
+                               goto bad;
+               }
+               error = ext2fs_direnter(ip, tdvp, &VTOI(tdvp)->i_crap, tcnp);
+               if (error != 0) {
+                       if (doingdirectory && newparent) {
+                               dp->i_e2fs_nlink--;
+                               dp->i_flag |= IN_CHANGE;
+                               (void)ext2fs_update(tdvp, NULL, NULL,
+                                   UPDATE_WAIT);
+                       }
+                       goto bad;
+               }
+               VN_KNOTE(tdvp, NOTE_WRITE);
+               vput(tdvp);
+       } else {
+               if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
+                       panic("rename: EXDEV");
+               /*
+                * Short circuit rename(foo, foo).
+                */
+               if (xp->i_number == ip->i_number)
+                       panic("rename: same file");
+               /*
+                * If the parent directory is "sticky", then the user must
+                * own the parent directory, or the destination of the rename,
+                * otherwise the destination may not be changed (except by
+                * root). This implements append-only directories.
+                */
+               if ((dp->i_e2fs_mode & S_ISTXT) &&
+                   kauth_authorize_generic(tcnp->cn_cred,
+                    KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
+                   kauth_cred_geteuid(tcnp->cn_cred) != dp->i_uid &&
+                   xp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+                       error = EPERM;
+                       goto bad;
+               }
+               /*
+                * Target must be empty if a directory and have no links
+                * to it. Also, ensure source and target are compatible
+                * (both directories, or both not directories).
+                */
+               if ((xp->i_e2fs_mode & IFMT) == IFDIR) {
+                       if (!ext2fs_dirempty(xp, dp->i_number, tcnp->cn_cred) ||
+                               xp->i_e2fs_nlink > 2) {
+                               error = ENOTEMPTY;
+                               goto bad;
+                       }
+                       if (!doingdirectory) {
+                               error = ENOTDIR;
+                               goto bad;
+                       }
+                       cache_purge(tdvp);
+               } else if (doingdirectory) {
+                       error = EISDIR;
+                       goto bad;
+               }
+               error = ext2fs_dirrewrite(dp, &dp->i_crap, ip, tcnp);
+               if (error != 0)
+                       goto bad;
+               /*
+                * If the target directory is in the same
+                * directory as the source directory,
+                * decrement the link count on the parent
+                * of the target directory.
+                */
+               if (doingdirectory && !newparent) {
+                       dp->i_e2fs_nlink--;
+                       dp->i_flag |= IN_CHANGE;
+               }
+               /*
+                * Adjust the link count of the target to
+                * reflect the dirrewrite above.  If this is
+                * a directory it is empty and there are
+                * no links to it, so we can squash the inode and
+                * any space associated with it.  We disallowed
+                * renaming over top of a directory with links to
+                * it above, as the remaining link would point to
+                * a directory without "." or ".." entries.
+                */
+               xp->i_e2fs_nlink--;
+               if (doingdirectory) {
+                       if (--xp->i_e2fs_nlink != 0)
+                               panic("rename: linked directory");
+                       error = ext2fs_truncate(tvp, (off_t)0, IO_SYNC,
+                           tcnp->cn_cred);
+               }
+               xp->i_flag |= IN_CHANGE;
+               VN_KNOTE(tdvp, NOTE_WRITE);
+               vput(tdvp);
+               VN_KNOTE(tvp, NOTE_DELETE);
+               vput(tvp);
+               xp = NULL;
+       }
+
+       /*
+        * 3) Unlink the source.
+        */
+       fcnp->cn_flags &= ~(MODMASK);
+       fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+       vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+       if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+               vput(fdvp);
+               vrele(ap->a_fvp);
+               return (error);
+       }
+       if (fvp != NULL) {
+               xp = VTOI(fvp);
+               dp = VTOI(fdvp);
+       } else {
+               /*
+                * From name has disappeared.
+                */
+               if (doingdirectory)
+                       panic("ext2fs_rename: lost dir entry");
+               vrele(ap->a_fvp);
+               return (0);
+       }
+       /*
+        * Ensure that the directory entry still exists and has not
+        * changed while the new name has been entered. If the source is
+        * a file then the entry may have been unlinked or renamed. In
+        * either case there is no further work to be done. If the source
+        * is a directory then it cannot have been rmdir'ed; its link
+        * count of three would cause a rmdir to fail with ENOTEMPTY.
+        * The IRENAME flag ensures that it cannot be moved by another
+        * rename.
+        */
+       if (xp != ip) {
+               if (doingdirectory)
+                       panic("ext2fs_rename: lost dir entry");
+       } else {
+               /*
+                * If the source is a directory with a
+                * new parent, the link count of the old
+                * parent directory must be decremented
+                * and ".." set to point to the new parent.
+                */
+               if (doingdirectory && newparent) {
+                       KASSERT(dp != NULL);
+                       dp->i_e2fs_nlink--;
+                       dp->i_flag |= IN_CHANGE;
+                       error = vn_rdwr(UIO_READ, fvp, (void *)&dirbuf,
+                               sizeof (struct ext2fs_dirtemplate), (off_t)0,
+                               UIO_SYSSPACE, IO_NODELOCKED,
+                               tcnp->cn_cred, (size_t *)0, NULL);
+                       if (error == 0) {
+                               namlen = dirbuf.dotdot_namlen;
+                               if (namlen != 2 ||
+                                   dirbuf.dotdot_name[0] != '.' ||
+                                   dirbuf.dotdot_name[1] != '.') {
+                                       ufs_dirbad(xp, (doff_t)12,
+                                           "ext2fs_rename: mangled dir");
+                               } else {
+                                       dirbuf.dotdot_ino = h2fs32(newparent);
+                                       (void) vn_rdwr(UIO_WRITE, fvp,
+                                           (void *)&dirbuf,
+                                           sizeof (struct ext2fs_dirtemplate),
+                                           (off_t)0, UIO_SYSSPACE,
+                                           IO_NODELOCKED|IO_SYNC,
+                                           tcnp->cn_cred, (size_t *)0,
+                                           NULL);
+                                       cache_purge(fdvp);
+                               }
+                       }
+               }
+               error = ext2fs_dirremove(fdvp, &VTOI(fdvp)->i_crap, fcnp);
+               if (!error) {
+                       xp->i_e2fs_nlink--;
+                       xp->i_flag |= IN_CHANGE;
+               }
+               xp->i_flag &= ~IN_RENAME;
+       }
+       VN_KNOTE(fvp, NOTE_RENAME);
+       if (dp)
+               vput(fdvp);
+       if (xp)
+               vput(fvp);
+       vrele(ap->a_fvp);
+       return (error);
+
+bad:
+       if (xp)
+               vput(ITOV(xp));
+       vput(ITOV(dp));
+out:
+       if (doingdirectory)
+               ip->i_flag &= ~IN_RENAME;
+       if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
+               ip->i_e2fs_nlink--;
+               ip->i_flag |= IN_CHANGE;
+               vput(fvp);
+       } else
+               vrele(fvp);
+       vrele(fdvp);
+       return (error);
+}
+
+/*
+ * Mkdir system call
+ */
+int
+ext2fs_mkdir(void *v)
+{
+       struct vop_mkdir_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       struct vnode            *dvp = ap->a_dvp;
+       struct vattr            *vap = ap->a_vap;
+       struct componentname    *cnp = ap->a_cnp;
+       struct inode            *ip, *dp = VTOI(dvp);
+       struct vnode            *tvp;
+       struct ext2fs_dirtemplate dirtemplate;
+       int                     error, dmode;
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       if ((nlink_t)dp->i_e2fs_nlink >= LINK_MAX) {
+               error = EMLINK;
+               goto out;
+       }
+       dmode = vap->va_mode & ACCESSPERMS;
+       dmode |= IFDIR;
+       /*
+        * Must simulate part of ext2fs_makeinode here to acquire the inode,
+        * but not have it entered in the parent directory. The entry is
+        * made later after writing "." and ".." entries.
+        */
+       if ((error = ext2fs_valloc(dvp, dmode, cnp->cn_cred, &tvp)) != 0)
+               goto out;
+       ip = VTOI(tvp);
+       ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+       ip->i_e2fs_uid = ip->i_uid & 0xffff;
+       ip->i_e2fs_gid = dp->i_e2fs_gid;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+               ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff;
+               ip->i_e2fs_gid_high = dp->i_e2fs_gid_high;
+       } else {
+               ip->i_e2fs_uid_high = 0;
+               ip->i_e2fs_gid_high = 0;
+       }
+       ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16);
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       ip->i_e2fs_mode = dmode;
+       tvp->v_type = VDIR;     /* Rest init'd in getnewvnode(). */
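+       /* A new directory has two links: the parent's entry and its own ".". */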
+       ip->i_e2fs_nlink = 2;
+
+       /*
+        * Bump link count in parent directory
+        * to reflect work done below.  Should
+        * be done before reference is created
+        * so reparation is possible if we crash.
+        */
+       dp->i_e2fs_nlink++;
+       dp->i_flag |= IN_CHANGE;
+       if ((error = ext2fs_update(dvp, NULL, NULL, UPDATE_DIROP)) != 0)
+               goto bad;
+
+       /* Initialize directory with "." and ".." from static template. */
+       memset(&dirtemplate, 0, sizeof(dirtemplate));
+       dirtemplate.dot_ino = h2fs32(ip->i_number);
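+       /*
+        * A minimal ext2 dirent is 12 bytes, which exactly fits ".";
+        * ".." below gets a reclen covering the rest of the block.
+        */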
+       dirtemplate.dot_reclen = h2fs16(12);
+       dirtemplate.dot_namlen = 1;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+               dirtemplate.dot_type = EXT2_FT_DIR;
+       }
+       dirtemplate.dot_name[0] = '.';
+       dirtemplate.dotdot_ino = h2fs32(dp->i_number);
+       dirtemplate.dotdot_reclen = h2fs16(VTOI(dvp)->i_e2fs->e2fs_bsize - 12);
+       dirtemplate.dotdot_namlen = 2;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0 &&
+           (ip->i_e2fs->e2fs.e2fs_features_incompat & EXT2F_INCOMPAT_FTYPE)) {
+               dirtemplate.dotdot_type = EXT2_FT_DIR;
+       }
+       dirtemplate.dotdot_name[0] = dirtemplate.dotdot_name[1] = '.';
+       error = vn_rdwr(UIO_WRITE, tvp, (void *)&dirtemplate,
+           sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE,
+           IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (size_t *)0, NULL);
+       if (error) {
+               dp->i_e2fs_nlink--;
+               dp->i_flag |= IN_CHANGE;
+               goto bad;
+       }
+       if (VTOI(dvp)->i_e2fs->e2fs_bsize > dvp->v_mount->mnt_stat.f_bsize)
+               panic("ext2fs_mkdir: blksize"); /* XXX should grow with balloc() */
+       else {
+               error = ext2fs_setsize(ip, VTOI(dvp)->i_e2fs->e2fs_bsize);
+               if (error) {
+                       dp->i_e2fs_nlink--;
+                       dp->i_flag |= IN_CHANGE;
+                       goto bad;
+               }
+               ip->i_flag |= IN_CHANGE;
+               uvm_vnp_setsize(tvp, ext2fs_size(ip));
+       }
+
+       /* Directory set up, now install its entry in the parent directory. */
+       error = ext2fs_direnter(ip, dvp, ulr, cnp);
+       if (error != 0) {
+               dp->i_e2fs_nlink--;
+               dp->i_flag |= IN_CHANGE;
+       }
+bad:
+       /*
+        * No need to do an explicit ext2fs_truncate here, vrele will do this
+        * for us because we set the link count to 0.
+        */
+       if (error) {
+               ip->i_e2fs_nlink = 0;
+               ip->i_flag |= IN_CHANGE;
+               vput(tvp);
+       } else {
+               VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+               *ap->a_vpp = tvp;
+       }
+out:
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * Rmdir system call.
+ */
+int
+ext2fs_rmdir(void *v)
+{
+       struct vop_rmdir_args /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct vnode *dvp = ap->a_dvp;
+       struct componentname *cnp = ap->a_cnp;
+       struct inode *ip, *dp;
+       int error;
+       struct ufs_lookup_results *ulr;
+
+       ip = VTOI(vp);
+       dp = VTOI(dvp);
+
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+
+       /*
+        * No rmdir "." please.
+        */
+       if (dp == ip) {
+               vrele(dvp);
+               vput(vp);
+               return (EINVAL);
+       }
+       /*
+        * Verify the directory is empty (and valid).
+        * (Rmdir ".." won't be valid since
+        *  ".." will contain a reference to
+        *  the current directory and thus be
+        *  non-empty.)
+        */
+       error = 0;
+       if (ip->i_e2fs_nlink != 2 ||
+           !ext2fs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
+               error = ENOTEMPTY;
+               goto out;
+       }
+       if ((dp->i_e2fs_flags & EXT2_APPEND) ||
+                                (ip->i_e2fs_flags & (EXT2_IMMUTABLE | EXT2_APPEND))) {
+               error = EPERM;
+               goto out;
+       }
+       /*
+        * Delete reference to directory before purging
+        * inode.  If we crash in between, the directory
+        * will be reattached to lost+found,
+        * will be reattached to lost+found.
+       error = ext2fs_dirremove(dvp, ulr, cnp);
+       if (error != 0)
+               goto out;
+       dp->i_e2fs_nlink--;
+       dp->i_flag |= IN_CHANGE;
+       VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+       cache_purge(dvp);
+       vput(dvp);
+       dvp = NULL;
+       /*
+        * Truncate inode.  The only stuff left
+        * in the directory is "." and "..".  The
+        * "." reference is inconsequential since
+        * we're quashing it.  The ".." reference
+        * has already been adjusted above.  We've
+        * removed the "." reference and the reference
+        * in the parent directory, but there may be
+        * other hard links so decrement by 2 and
+        * worry about them later.
+        */
+       ip->i_e2fs_nlink -= 2;
+       error = ext2fs_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
+       cache_purge(ITOV(ip));
+out:
+       VN_KNOTE(vp, NOTE_DELETE);
+       if (dvp)
+               vput(dvp);
+       vput(vp);
+       return (error);
+}
+
+/*
+ * symlink -- make a symbolic link
+ */
+int
+ext2fs_symlink(void *v)
+{
+       struct vop_symlink_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+               char *a_target;
+       } */ *ap = v;
+       struct vnode    *vp, **vpp;
+       struct inode    *ip;
+       int             len, error;
+
+       vpp = ap->a_vpp;
+       error = ext2fs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
+                             vpp, ap->a_cnp);
+       if (error)
+               return (error);
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       vp = *vpp;
+       len = strlen(ap->a_target);
+       ip = VTOI(vp);
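+       /* Short targets become "fast" symlinks stored inside the inode. */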
+       if (len < ip->i_ump->um_maxsymlinklen) {
+               memcpy((char *)ip->i_din.e2fs_din->e2di_shortlink, ap->a_target, len);
+               error = ext2fs_setsize(ip, len);
+               if (error)
+                       goto bad;
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               if (vp->v_mount->mnt_flag & MNT_RELATIME)
+                       ip->i_flag |= IN_ACCESS;
+               uvm_vnp_setsize(vp, len);
+       } else
+               error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
+                   UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred,
+                   (size_t *)0, NULL);
+bad:
+       if (error)
+               vput(vp);
+       return (error);
+}
+
+/*
+ * Return target name of a symbolic link
+ */
+int
+ext2fs_readlink(void *v)
+{
+       struct vop_readlink_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode    *vp = ap->a_vp;
+       struct inode    *ip = VTOI(vp);
+       struct ufsmount *ump = ip->i_ump;
+       int             isize;
+
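+       /*
+        * A "fast" symlink keeps its target inside the inode's block
+        * pointer area; longer targets live in data blocks and are
+        * read like regular file contents.
+        */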
+       isize = ext2fs_size(ip);
+       if (isize < ump->um_maxsymlinklen ||
+           (ump->um_maxsymlinklen == 0 && ip->i_e2fs_nblock == 0)) {
+               uiomove((char *)ip->i_din.e2fs_din->e2di_shortlink, isize, ap->a_uio);
+               return (0);
+       }
+       return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+ext2fs_advlock(void *v)
+{
+       struct vop_advlock_args /* {
+               struct vnode *a_vp;
+               void * a_id;
+               int  a_op;
+               struct flock *a_fl;
+               int  a_flags;
+       } */ *ap = v;
+       struct inode *ip = VTOI(ap->a_vp);
+
+       return lf_advlock(ap, &ip->i_lockf, ext2fs_size(ip));
+}
+
+int
+ext2fs_fsync(void *v)
+{
+       struct vop_fsync_args /* {
+               struct vnode *a_vp;
+               kauth_cred_t a_cred;
+               int a_flags;
+               off_t offlo;
+               off_t offhi;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       int wait;
+       int error;
+
+       wait = (ap->a_flags & FSYNC_WAIT) != 0;
+
+       if (vp->v_type == VBLK)
+               error = spec_fsync(v);
+       else
+               error = vflushbuf(vp, wait);
+       if (error == 0 && (ap->a_flags & FSYNC_DATAONLY) == 0)
+               error = ext2fs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+
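+       /* FSYNC_CACHE: also flush the write cache of the underlying device. */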
+       if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+               int l = 0;
+               error = VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+                   curlwp->l_cred);
+       }
+
+       return error;
+}
+
+/*
+ * Initialize the vnode associated with a new inode, handle aliased
+ * vnodes.
+ */
+int
+ext2fs_vinit(struct mount *mntp, int (**specops)(void *),
+       int (**fifoops)(void *), struct vnode **vpp)
+{
+       struct timeval tv;
+       struct inode *ip;
+       struct vnode *vp;
+
+       vp = *vpp;
+       ip = VTOI(vp);
+       switch(vp->v_type = IFTOVT(ip->i_e2fs_mode)) {
+       case VCHR:
+       case VBLK:
+               vp->v_op = specops;
+               spec_node_init(vp, fs2h32(ip->i_din.e2fs_din->e2di_rdev));
+               break;
+       case VFIFO:
+               vp->v_op = fifoops;
+               break;
+       case VNON:
+       case VBAD:
+       case VSOCK:
+       case VLNK:
+       case VDIR:
+       case VREG:
+               break;
+       }
+       if (ip->i_number == ROOTINO)
+                vp->v_vflag |= VV_ROOT;
+       /*
+        * Initialize modrev times
+        */
+       getmicrouptime(&tv);
+       SETHIGH(ip->i_modrev, tv.tv_sec);
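+       /* 4294 ~= 2^32 / 10^6: scale microseconds to fill the low word. */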
+       SETLOW(ip->i_modrev, tv.tv_usec * 4294);
+       *vpp = vp;
+       return (0);
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+ext2fs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
+               struct componentname *cnp)
+{
+       struct inode *ip, *pdir;
+       struct vnode *tvp;
+       int error, ismember = 0;
+       struct ufs_lookup_results *ulr;
+
+       pdir = VTOI(dvp);
+
+       /* XXX should handle this material another way */
+       ulr = &pdir->i_crap;
+       UFS_CHECK_CRAPCOUNTER(pdir);
+
+       *vpp = NULL;
+       if ((mode & IFMT) == 0)
+               mode |= IFREG;
+
+       if ((error = ext2fs_valloc(dvp, mode, cnp->cn_cred, &tvp)) != 0) {
+               vput(dvp);
+               return (error);
+       }
+       ip = VTOI(tvp);
+       ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+       ip->i_e2fs_uid = ip->i_uid & 0xffff;
+       ip->i_e2fs_gid = pdir->i_e2fs_gid;
+       if (ip->i_e2fs->e2fs.e2fs_rev > E2FS_REV0) {
+               ip->i_e2fs_uid_high = (ip->i_uid >> 16) & 0xffff;
+               ip->i_e2fs_gid_high = pdir->i_e2fs_gid_high;
+       } else {
+               ip->i_e2fs_uid_high = 0;
+               ip->i_e2fs_gid_high = 0;
+       }
+       ip->i_gid = ip->i_e2fs_gid | (ip->i_e2fs_gid_high << 16);
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       ip->i_e2fs_mode = mode;
+       tvp->v_type = IFTOVT(mode);     /* Rest init'd in getnewvnode(). */
+       ip->i_e2fs_nlink = 1;
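+       /*
+        * Clear the set-gid bit unless the creator is a member of the
+        * file's group or is the superuser.
+        */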
+       if ((ip->i_e2fs_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+           ip->i_gid, &ismember) != 0 || !ismember) &&
+           kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL))
+               ip->i_e2fs_mode &= ~ISGID;
+
+       /*
+        * Make sure inode goes to disk before directory entry.
+        */
+       if ((error = ext2fs_update(tvp, NULL, NULL, UPDATE_WAIT)) != 0)
+               goto bad;
+       error = ext2fs_direnter(ip, dvp, ulr, cnp);
+       if (error != 0)
+               goto bad;
+       vput(dvp);
+       *vpp = tvp;
+       return (0);
+
+bad:
+       /*
+        * Write error occurred trying to update the inode
+        * or the directory so must deallocate the inode.
+        */
+       tvp->v_type = VNON;     /* Stop explosion if VBLK */
+       ip->i_e2fs_nlink = 0;
+       ip->i_flag |= IN_CHANGE;
+       vput(tvp);
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ext2fs_reclaim(void *v)
+{
+       struct vop_reclaim_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       int error;
+
+       /*
+        * The inode must be freed and updated before being removed
+        * from its hash chain.  Other threads trying to gain a hold
+        * on the inode will be stalled because it is locked (VI_XLOCK).
+        */
+       if (ip->i_omode == 1 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+               ext2fs_vfree(vp, ip->i_number, ip->i_e2fs_mode);
+       if ((error = ufs_reclaim(vp)) != 0)
+               return (error);
+       if (ip->i_din.e2fs_din != NULL)
+               pool_put(&ext2fs_dinode_pool, ip->i_din.e2fs_din);
+       genfs_node_destroy(vp);
+       pool_put(&ext2fs_inode_pool, vp->v_data);
+       vp->v_data = NULL;
+       return (0);
+}
+
+/* Global vfs data structures for ext2fs. */
+int (**ext2fs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, ext2fs_lookup },            /* lookup */
+       { &vop_create_desc, ext2fs_create },            /* create */
+       { &vop_mknod_desc, ext2fs_mknod },              /* mknod */
+       { &vop_open_desc, ext2fs_open },                /* open */
+       { &vop_close_desc, ufs_close },                 /* close */
+       { &vop_access_desc, ext2fs_access },            /* access */
+       { &vop_getattr_desc, ext2fs_getattr },          /* getattr */
+       { &vop_setattr_desc, ext2fs_setattr },          /* setattr */
+       { &vop_read_desc, ext2fs_read },                /* read */
+       { &vop_write_desc, ext2fs_write },              /* write */
+       { &vop_ioctl_desc, ufs_ioctl },                 /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, ufs_poll },                   /* poll */
+       { &vop_kqfilter_desc, genfs_kqfilter },         /* kqfilter */
+       { &vop_revoke_desc, ufs_revoke },               /* revoke */
+       { &vop_mmap_desc, ufs_mmap },                   /* mmap */
+       { &vop_fsync_desc, ext2fs_fsync },              /* fsync */
+       { &vop_seek_desc, ufs_seek },                   /* seek */
+       { &vop_remove_desc, ext2fs_remove },            /* remove */
+       { &vop_link_desc, ext2fs_link },                /* link */
+       { &vop_rename_desc, ext2fs_rename },            /* rename */
+       { &vop_mkdir_desc, ext2fs_mkdir },              /* mkdir */
+       { &vop_rmdir_desc, ext2fs_rmdir },              /* rmdir */
+       { &vop_symlink_desc, ext2fs_symlink },          /* symlink */
+       { &vop_readdir_desc, ext2fs_readdir },          /* readdir */
+       { &vop_readlink_desc, ext2fs_readlink },        /* readlink */
+       { &vop_abortop_desc, ufs_abortop },             /* abortop */
+       { &vop_inactive_desc, ext2fs_inactive },        /* inactive */
+       { &vop_reclaim_desc, ext2fs_reclaim },          /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, ext2fs_bmap },                /* bmap */
+       { &vop_strategy_desc, ufs_strategy },           /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, ufs_pathconf },           /* pathconf */
+       { &vop_advlock_desc, ext2fs_advlock },          /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_getpages_desc, genfs_getpages },         /* getpages */
+       { &vop_putpages_desc, genfs_putpages },         /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_vnodeop_opv_desc =
+       { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries };
+
+int (**ext2fs_specop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_specop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, spec_lookup },              /* lookup */
+       { &vop_create_desc, spec_create },              /* create */
+       { &vop_mknod_desc, spec_mknod },                /* mknod */
+       { &vop_open_desc, spec_open },                  /* open */
+       { &vop_close_desc, ufsspec_close },             /* close */
+       { &vop_access_desc, ext2fs_access },            /* access */
+       { &vop_getattr_desc, ext2fs_getattr },          /* getattr */
+       { &vop_setattr_desc, ext2fs_setattr },          /* setattr */
+       { &vop_read_desc, ufsspec_read },               /* read */
+       { &vop_write_desc, ufsspec_write },             /* write */
+       { &vop_ioctl_desc, spec_ioctl },                /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, spec_poll },                  /* poll */
+       { &vop_kqfilter_desc, spec_kqfilter },          /* kqfilter */
+       { &vop_revoke_desc, spec_revoke },              /* revoke */
+       { &vop_mmap_desc, spec_mmap },                  /* mmap */
+       { &vop_fsync_desc, ext2fs_fsync },              /* fsync */
+       { &vop_seek_desc, spec_seek },                  /* seek */
+       { &vop_remove_desc, spec_remove },              /* remove */
+       { &vop_link_desc, spec_link },                  /* link */
+       { &vop_rename_desc, spec_rename },              /* rename */
+       { &vop_mkdir_desc, spec_mkdir },                /* mkdir */
+       { &vop_rmdir_desc, spec_rmdir },                /* rmdir */
+       { &vop_symlink_desc, spec_symlink },            /* symlink */
+       { &vop_readdir_desc, spec_readdir },            /* readdir */
+       { &vop_readlink_desc, spec_readlink },          /* readlink */
+       { &vop_abortop_desc, spec_abortop },            /* abortop */
+       { &vop_inactive_desc, ext2fs_inactive },        /* inactive */
+       { &vop_reclaim_desc, ext2fs_reclaim },          /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, spec_bmap },                  /* bmap */
+       { &vop_strategy_desc, spec_strategy },          /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, spec_pathconf },          /* pathconf */
+       { &vop_advlock_desc, spec_advlock },            /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_getpages_desc, spec_getpages },          /* getpages */
+       { &vop_putpages_desc, spec_putpages },          /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_specop_opv_desc =
+       { &ext2fs_specop_p, ext2fs_specop_entries };
+
+int (**ext2fs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, vn_fifo_bypass },           /* lookup */
+       { &vop_create_desc, vn_fifo_bypass },           /* create */
+       { &vop_mknod_desc, vn_fifo_bypass },            /* mknod */
+       { &vop_open_desc, vn_fifo_bypass },             /* open */
+       { &vop_close_desc, ufsfifo_close },             /* close */
+       { &vop_access_desc, ext2fs_access },            /* access */
+       { &vop_getattr_desc, ext2fs_getattr },          /* getattr */
+       { &vop_setattr_desc, ext2fs_setattr },          /* setattr */
+       { &vop_read_desc, ufsfifo_read },               /* read */
+       { &vop_write_desc, ufsfifo_write },             /* write */
+       { &vop_ioctl_desc, vn_fifo_bypass },            /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, vn_fifo_bypass },             /* poll */
+       { &vop_kqfilter_desc, vn_fifo_bypass },         /* kqfilter */
+       { &vop_revoke_desc, vn_fifo_bypass },           /* revoke */
+       { &vop_mmap_desc, vn_fifo_bypass },             /* mmap */
+       { &vop_fsync_desc, ext2fs_fsync },              /* fsync */
+       { &vop_seek_desc, vn_fifo_bypass },             /* seek */
+       { &vop_remove_desc, vn_fifo_bypass },           /* remove */
+       { &vop_link_desc, vn_fifo_bypass },             /* link */
+       { &vop_rename_desc, vn_fifo_bypass },           /* rename */
+       { &vop_mkdir_desc, vn_fifo_bypass },            /* mkdir */
+       { &vop_rmdir_desc, vn_fifo_bypass },            /* rmdir */
+       { &vop_symlink_desc, vn_fifo_bypass },          /* symlink */
+       { &vop_readdir_desc, vn_fifo_bypass },          /* readdir */
+       { &vop_readlink_desc, vn_fifo_bypass },         /* readlink */
+       { &vop_abortop_desc, vn_fifo_bypass },          /* abortop */
+       { &vop_inactive_desc, ext2fs_inactive },        /* inactive */
+       { &vop_reclaim_desc, ext2fs_reclaim },          /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, vn_fifo_bypass },             /* bmap */
+       { &vop_strategy_desc, vn_fifo_bypass },         /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, vn_fifo_bypass },         /* pathconf */
+       { &vop_advlock_desc, vn_fifo_bypass },          /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_putpages_desc, vn_fifo_bypass },         /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ext2fs_fifoop_opv_desc =
+       { &ext2fs_fifoop_p, ext2fs_fifoop_entries };
diff --git a/sys/ufs/ffs/Makefile b/sys/ufs/ffs/Makefile
new file mode 100644 (file)
index 0000000..1f03afc
--- /dev/null
@@ -0,0 +1,7 @@
+#      $NetBSD: Makefile,v 1.1 1998/06/12 23:23:11 cgd Exp $
+
+INCSDIR= /usr/include/ufs/ffs
+
+INCS=  ffs_extern.h fs.h
+
+.include <bsd.kinc.mk>
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
new file mode 100644 (file)
index 0000000..411f1a8
--- /dev/null
@@ -0,0 +1,2030 @@
+/*     $NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $      */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2002 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and Network Associates Laboratories, the Security
+ * Research Division of Network Associates, Inc. under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
+ * research program
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.130 2011/11/28 08:05:07 tls Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_uvm_page_trkown.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/cprng.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/syslog.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#ifdef UVM_PAGE_TRKOWN
+#include <uvm/uvm.h>
+#endif
+
+static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int);
+static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
+static ino_t ffs_dirpref(struct inode *);
+static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
+static void ffs_fserr(struct fs *, u_int, const char *);
+static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
+    daddr_t (*)(struct inode *, int, daddr_t, int, int));
+static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
+static int32_t ffs_mapsearch(struct fs *, struct cg *,
+                                     daddr_t, int);
+static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
+    daddr_t, long, bool);
+static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
+    int, bool);
+
+/* if 1, changes in optimization strategy are logged */
+int ffs_log_changeopt = 0;
+
+/* in ffs_tables.c */
+extern const int inside[], around[];
+extern const u_char * const fragtbl[];
+
+/* Basic consistency check for block allocations */
+static int
+ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
+    long size, dev_t dev, ino_t inum)
+{
+       if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
+           fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
+               printf("dev = 0x%llx, bno = %" PRId64 " bsize = %d, "
+                   "size = %ld, fs = %s\n",
+                   (long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
+               panic("%s: bad size", func);
+       }
+
+       if (bno >= fs->fs_size) {
+               printf("bad block %" PRId64 ", ino %llu\n", bno,
+                   (unsigned long long)inum);
+               ffs_fserr(fs, inum, "bad block");
+               return EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Allocate a block in the file system.
+ *
+ * The size of the requested block is given, which must be some
+ * multiple of fs_fsize and <= fs_bsize.
+ * A preference may be optionally specified. If a preference is given
+ * the following hierarchy is used to allocate a block:
+ *   1) allocate the requested block.
+ *   2) allocate a rotationally optimal block in the same cylinder.
+ *   3) allocate a block in the same cylinder group.
+ *   4) quadratically rehash into other cylinder groups, until an
+ *      available block is located.
+ * If no block preference is given the following hierarchy is used
+ * to allocate a block:
+ *   1) allocate a block in the cylinder group that contains the
+ *      inode for the file.
+ *   2) quadratically rehash into other cylinder groups, until an
+ *      available block is located.
+ *
+ * => called with um_lock held
+ * => releases um_lock before returning
+ */
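+/*
+ * Illustrative example (typical values, not read from any superblock):
+ * with fs_bsize = 8192 and fs_fsize = 1024 (so fs_frag = 8), legal
+ * request sizes are 1024, 2048, ..., 8192; a 4096-byte request claims
+ * four of the eight fragments of a block.
+ */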
+int
+ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags,
+    kauth_cred_t cred, daddr_t *bnp)
+{
+       struct ufsmount *ump;
+       struct fs *fs;
+       daddr_t bno;
+       int cg;
+#if defined(QUOTA) || defined(QUOTA2)
+       int error;
+#endif
+
+       fs = ip->i_fs;
+       ump = ip->i_ump;
+
+       KASSERT(mutex_owned(&ump->um_lock));
+
+#ifdef UVM_PAGE_TRKOWN
+
+       /*
+        * Sanity-check that allocations within the file size
+        * do not allow other threads to read the stale contents
+        * of newly allocated blocks.
+        * Usually pages will exist to cover the new allocation.
+        * There is an optimization in ffs_write() where we skip
+        * creating pages if several conditions are met:
+        *  - the file must not be mapped (in any user address space).
+        *  - the write must cover whole pages and whole blocks.
+        * If those conditions are not met then pages must exist and
+        * be locked by the current thread.
+        */
+
+       if (ITOV(ip)->v_type == VREG &&
+           lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
+               struct vm_page *pg;
+               struct vnode *vp = ITOV(ip);
+               struct uvm_object *uobj = &vp->v_uobj;
+               voff_t off = trunc_page(lblktosize(fs, lbn));
+               voff_t endoff = round_page(lblktosize(fs, lbn) + size);
+
+               mutex_enter(uobj->vmobjlock);
+               while (off < endoff) {
+                       pg = uvm_pagelookup(uobj, off);
+                       KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == 0 &&
+                                (size & PAGE_MASK) == 0 && 
+                                blkoff(fs, size) == 0) ||
+                               (pg != NULL && pg->owner == curproc->p_pid &&
+                                pg->lowner == curlwp->l_lid));
+                       off += PAGE_SIZE;
+               }
+               mutex_exit(uobj->vmobjlock);
+       }
+#endif
+
+       *bnp = 0;
+#ifdef DIAGNOSTIC
+       if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
+               printf("dev = 0x%llx, bsize = %d, size = %d, fs = %s\n",
+                   (unsigned long long)ip->i_dev, fs->fs_bsize, size,
+                   fs->fs_fsmnt);
+               panic("ffs_alloc: bad size");
+       }
+       if (cred == NOCRED)
+               panic("ffs_alloc: missing credential");
+#endif /* DIAGNOSTIC */
+       if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
+               goto nospace;
+       if (freespace(fs, fs->fs_minfree) <= 0 &&
+           kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+           NULL, NULL) != 0)
+               goto nospace;
+#if defined(QUOTA) || defined(QUOTA2)
+       mutex_exit(&ump->um_lock);
+       if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
+               return (error);
+       mutex_enter(&ump->um_lock);
+#endif
+
+       if (bpref >= fs->fs_size)
+               bpref = 0;
+       if (bpref == 0)
+               cg = ino_to_cg(fs, ip->i_number);
+       else
+               cg = dtog(fs, bpref);
+       bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg);
+       if (bno > 0) {
+               DIP_ADD(ip, blocks, btodb(size));
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               *bnp = bno;
+               return (0);
+       }
+#if defined(QUOTA) || defined(QUOTA2)
+       /*
+        * Restore user's disk quota because allocation failed.
+        */
+       (void) chkdq(ip, -btodb(size), cred, FORCE);
+#endif
+       if (flags & B_CONTIG) {
+               /*
+                * XXX ump->um_lock handling is "suspect" at best.
+                * For the case where ffs_hashalloc() fails early
+                * in the B_CONTIG case we reach here with um_lock
+                * already unlocked, so we can't release it again
+                * like in the normal error path.  See kern/39206.
+                *
+                * Fail silently - it's up to our caller to report
+                * errors.
+                */
+               return (ENOSPC);
+       }
+nospace:
+       mutex_exit(&ump->um_lock);
+       ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+       uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
+       return (ENOSPC);
+}
+
+/*
+ * Reallocate a fragment to a bigger size
+ *
+ * The number and size of the old block are given, and a preference
+ * and new size is also specified. The allocator attempts to extend
+ * the original block. Failing that, the regular block allocator is
+ * invoked to get an appropriate block.
+ *
+ * => called with um_lock held
+ * => return with um_lock released
+ */
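+/*
+ * Illustrative example (hypothetical sizes): growing the last block of
+ * a file from osize = 2048 to nsize = 4096 first asks ffs_fragextend()
+ * for the two fragments directly after bprev; only if those are taken
+ * does the allocator pick a new location and free the old fragments.
+ */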
+int
+ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
+    int nsize, kauth_cred_t cred, struct buf **bpp, daddr_t *blknop)
+{
+       struct ufsmount *ump;
+       struct fs *fs;
+       struct buf *bp;
+       int cg, request, error;
+       daddr_t bprev, bno;
+
+       fs = ip->i_fs;
+       ump = ip->i_ump;
+
+       KASSERT(mutex_owned(&ump->um_lock));
+
+#ifdef UVM_PAGE_TRKOWN
+
+       /*
+        * Sanity-check that allocations within the file size
+        * do not allow other threads to read the stale contents
+        * of newly allocated blocks.
+        * Unlike in ffs_alloc(), here pages must always exist
+        * for such allocations, because only the last block of a file
+        * can be a fragment and ffs_write() will reallocate the
+        * fragment to the new size using ufs_balloc_range(),
+        * which always creates pages to cover blocks it allocates.
+        */
+
+       if (ITOV(ip)->v_type == VREG) {
+               struct vm_page *pg;
+               struct uvm_object *uobj = &ITOV(ip)->v_uobj;
+               voff_t off = trunc_page(lblktosize(fs, lbprev));
+               voff_t endoff = round_page(lblktosize(fs, lbprev) + osize);
+
+               mutex_enter(uobj->vmobjlock);
+               while (off < endoff) {
+                       pg = uvm_pagelookup(uobj, off);
+                       KASSERT(pg->owner == curproc->p_pid &&
+                               pg->lowner == curlwp->l_lid);
+                       off += PAGE_SIZE;
+               }
+               mutex_exit(uobj->vmobjlock);
+       }
+#endif
+
+#ifdef DIAGNOSTIC
+       if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
+           (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
+               printf(
+                   "dev = 0x%llx, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
+                   (unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
+                   fs->fs_fsmnt);
+               panic("ffs_realloccg: bad size");
+       }
+       if (cred == NOCRED)
+               panic("ffs_realloccg: missing credential");
+#endif /* DIAGNOSTIC */
+       if (freespace(fs, fs->fs_minfree) <= 0 &&
+           kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
+           NULL, NULL) != 0) {
+               mutex_exit(&ump->um_lock);
+               goto nospace;
+       }
+       if (fs->fs_magic == FS_UFS2_MAGIC)
+               bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
+       else
+               bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));
+
+       if (bprev == 0) {
+               printf("dev = 0x%llx, bsize = %d, bprev = %" PRId64 ", fs = %s\n",
+                   (unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
+                   fs->fs_fsmnt);
+               panic("ffs_realloccg: bad bprev");
+       }
+       mutex_exit(&ump->um_lock);
+
+       /*
+        * Allocate the extra space in the buffer.
+        */
+       if (bpp != NULL &&
+           (error = bread(ITOV(ip), lbprev, osize, NOCRED, 0, &bp)) != 0) {
+               brelse(bp, 0);
+               return (error);
+       }
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
+               if (bpp != NULL) {
+                       brelse(bp, 0);
+               }
+               return (error);
+       }
+#endif
+       /*
+        * Check for extension in the existing location.
+        */
+       cg = dtog(fs, bprev);
+       mutex_enter(&ump->um_lock);
+       if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
+               DIP_ADD(ip, blocks, btodb(nsize - osize));
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+               if (bpp != NULL) {
+                       if (bp->b_blkno != fsbtodb(fs, bno))
+                               panic("bad blockno");
+                       allocbuf(bp, nsize, 1);
+                       memset((char *)bp->b_data + osize, 0, nsize - osize);
+                       mutex_enter(bp->b_objlock);
+                       KASSERT(!cv_has_waiters(&bp->b_done));
+                       bp->b_oflags |= BO_DONE;
+                       mutex_exit(bp->b_objlock);
+                       *bpp = bp;
+               }
+               if (blknop != NULL) {
+                       *blknop = bno;
+               }
+               return (0);
+       }
+       /*
+        * Allocate a new disk location.
+        */
+       if (bpref >= fs->fs_size)
+               bpref = 0;
+       switch ((int)fs->fs_optim) {
+       case FS_OPTSPACE:
+               /*
+                * Allocate an exact sized fragment. Although this makes
+                * best use of space, we will waste time relocating it if
+                * the file continues to grow. If the fragmentation is
+                * less than half of the minimum free reserve, we choose
+                * to begin optimizing for time.
+                */
+               request = nsize;
+               if (fs->fs_minfree < 5 ||
+                   fs->fs_cstotal.cs_nffree >
+                   fs->fs_dsize * fs->fs_minfree / (2 * 100))
+                       break;
+
+               if (ffs_log_changeopt) {
+                       log(LOG_NOTICE,
+                               "%s: optimization changed from SPACE to TIME\n",
+                               fs->fs_fsmnt);
+               }
+
+               fs->fs_optim = FS_OPTTIME;
+               break;
+       case FS_OPTTIME:
+               /*
+                * At this point we have discovered a file that is trying to
+                * grow a small fragment to a larger fragment. To save time,
+                * we allocate a full sized block, then free the unused portion.
+                * If the file continues to grow, the `ffs_fragextend' call
+                * above will be able to grow it in place without further
+                * copying. If aberrant programs cause disk fragmentation to
+                * grow within 2% of the free reserve, we choose to begin
+                * optimizing for space.
+                */
+               request = fs->fs_bsize;
+               if (fs->fs_cstotal.cs_nffree <
+                   fs->fs_dsize * (fs->fs_minfree - 2) / 100)
+                       break;
+
+               if (ffs_log_changeopt) {
+                       log(LOG_NOTICE,
+                               "%s: optimization changed from TIME to SPACE\n",
+                               fs->fs_fsmnt);
+               }
+
+               fs->fs_optim = FS_OPTSPACE;
+               break;
+       default:
+               printf("dev = 0x%llx, optim = %d, fs = %s\n",
+                   (unsigned long long)ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
+               panic("ffs_realloccg: bad optim");
+               /* NOTREACHED */
+       }
+       bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg);
+       if (bno > 0) {
+               if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+                   (ITOV(ip)->v_type != VREG)) {
+                       UFS_WAPBL_REGISTER_DEALLOCATION(
+                           ip->i_ump->um_mountp, fsbtodb(fs, bprev),
+                           osize);
+               } else {
+                       ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
+                           ip->i_number);
+               }
+               if (nsize < request) {
+                       if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+                           (ITOV(ip)->v_type != VREG)) {
+                               UFS_WAPBL_REGISTER_DEALLOCATION(
+                                   ip->i_ump->um_mountp,
+                                   fsbtodb(fs, (bno + numfrags(fs, nsize))),
+                                   request - nsize);
+                       } else
+                               ffs_blkfree(fs, ip->i_devvp,
+                                   bno + numfrags(fs, nsize),
+                                   (long)(request - nsize), ip->i_number);
+               }
+               DIP_ADD(ip, blocks, btodb(nsize - osize));
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               if (bpp != NULL) {
+                       bp->b_blkno = fsbtodb(fs, bno);
+                       allocbuf(bp, nsize, 1);
+                       memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
+                       mutex_enter(bp->b_objlock);
+                       KASSERT(!cv_has_waiters(&bp->b_done));
+                       bp->b_oflags |= BO_DONE;
+                       mutex_exit(bp->b_objlock);
+                       *bpp = bp;
+               }
+               if (blknop != NULL) {
+                       *blknop = bno;
+               }
+               return (0);
+       }
+       mutex_exit(&ump->um_lock);
+
+#if defined(QUOTA) || defined(QUOTA2)
+       /*
+        * Restore user's disk quota because allocation failed.
+        */
+       (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
+#endif
+       if (bpp != NULL) {
+               brelse(bp, 0);
+       }
+
+nospace:
+       /*
+        * no space available
+        */
+       ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
+       uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
+       return (ENOSPC);
+}
+
+/*
+ * Allocate an inode in the file system.
+ *
+ * If allocating a directory, use ffs_dirpref to select the inode.
+ * If allocating in a directory, the following hierarchy is followed:
+ *   1) allocate the preferred inode.
+ *   2) allocate an inode in the same cylinder group.
+ *   3) quadratically rehash into other cylinder groups, until an
+ *      available inode is located.
+ * If no inode preference is given the following hierarchy is used
+ * to allocate an inode:
+ *   1) allocate an inode in cylinder group 0.
+ *   2) quadratically rehash into other cylinder groups, until an
+ *      available inode is located.
+ *
+ * => um_lock not held upon entry or return
+ */
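+/*
+ * Illustrative example: a regular file created in a directory whose
+ * inode lives in cg 3 takes ipref = pip->i_number, so the search
+ * starts in cg 3; a new directory instead takes its preference from
+ * ffs_dirpref() below.
+ */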
+int
+ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+    struct vnode **vpp)
+{
+       struct ufsmount *ump;
+       struct inode *pip;
+       struct fs *fs;
+       struct inode *ip;
+       struct timespec ts;
+       ino_t ino, ipref;
+       int cg, error;
+
+       UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
+
+       *vpp = NULL;
+       pip = VTOI(pvp);
+       fs = pip->i_fs;
+       ump = pip->i_ump;
+
+       error = UFS_WAPBL_BEGIN(pvp->v_mount);
+       if (error) {
+               return error;
+       }
+       mutex_enter(&ump->um_lock);
+       if (fs->fs_cstotal.cs_nifree == 0)
+               goto noinodes;
+
+       if ((mode & IFMT) == IFDIR)
+               ipref = ffs_dirpref(pip);
+       else
+               ipref = pip->i_number;
+       if (ipref >= fs->fs_ncg * fs->fs_ipg)
+               ipref = 0;
+       cg = ino_to_cg(fs, ipref);
+       /*
+        * Track the number of directories created one after another
+        * in the same cg without intervening file creations.
+        */
+       if ((mode & IFMT) == IFDIR) {
+               if (fs->fs_contigdirs[cg] < 255)
+                       fs->fs_contigdirs[cg]++;
+       } else {
+               if (fs->fs_contigdirs[cg] > 0)
+                       fs->fs_contigdirs[cg]--;
+       }
+       ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg);
+       if (ino == 0)
+               goto noinodes;
+       UFS_WAPBL_END(pvp->v_mount);
+       error = VFS_VGET(pvp->v_mount, ino, vpp);
+       if (error) {
+               int err;
+               err = UFS_WAPBL_BEGIN(pvp->v_mount);
+               if (err == 0)
+                       ffs_vfree(pvp, ino, mode);
+               if (err == 0)
+                       UFS_WAPBL_END(pvp->v_mount);
+               return (error);
+       }
+       KASSERT((*vpp)->v_type == VNON);
+       ip = VTOI(*vpp);
+       if (ip->i_mode) {
+#if 0
+               printf("mode = 0%o, inum = %d, fs = %s\n",
+                   ip->i_mode, ip->i_number, fs->fs_fsmnt);
+#else
+               printf("dmode %x mode %x dgen %x gen %x\n",
+                   DIP(ip, mode), ip->i_mode,
+                   DIP(ip, gen), ip->i_gen);
+               printf("size %llx blocks %llx\n",
+                   (long long)DIP(ip, size), (long long)DIP(ip, blocks));
+               printf("ino %llu ipref %llu\n", (unsigned long long)ino,
+                   (unsigned long long)ipref);
+#if 0
+               error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+                   (int)fs->fs_bsize, NOCRED, 0, &bp);
+#endif
+
+#endif
+               panic("ffs_valloc: dup alloc");
+       }
+       if (DIP(ip, blocks)) {                          /* XXX */
+               printf("free inode %s/%llu had %" PRId64 " blocks\n",
+                   fs->fs_fsmnt, (unsigned long long)ino, DIP(ip, blocks));
+               DIP_ASSIGN(ip, blocks, 0);
+       }
+       ip->i_flag &= ~IN_SPACECOUNTED;
+       ip->i_flags = 0;
+       DIP_ASSIGN(ip, flags, 0);
+       /*
+        * Set up a new generation number for this inode.
+        */
+       ip->i_gen++;
+       DIP_ASSIGN(ip, gen, ip->i_gen);
+       if (fs->fs_magic == FS_UFS2_MAGIC) {
+               vfs_timestamp(&ts);
+               ip->i_ffs2_birthtime = ts.tv_sec;
+               ip->i_ffs2_birthnsec = ts.tv_nsec;
+       }
+       return (0);
+noinodes:
+       mutex_exit(&ump->um_lock);
+       UFS_WAPBL_END(pvp->v_mount);
+       ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
+       uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
+       return (ENOSPC);
+}
+
+/*
+ * Find a cylinder group in which to place a directory.
+ *
+ * The policy implemented by this algorithm is to allocate a
+ * directory inode in the same cylinder group as its parent
+ * directory, but also to reserve space for its files' inodes
+ * and data. Restrict the number of directories which may be
+ * allocated one after another in the same cylinder group
+ * without intervening allocation of files.
+ *
+ * If we allocate a first level directory then force allocation
+ * in another cylinder group.
+ */
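+/*
+ * Illustrative example (hypothetical numbers): with an estimated
+ * directory size of dirsize = 2 MB and cgsize = 128 MB per cylinder
+ * group, maxcontigdirs = 64, so at most 64 directories in a row are
+ * packed into one cg (before the fs_ipg / fs_avgfpdir clamp) and
+ * fs_contigdirs[] then forces the search to move on.
+ */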
+static ino_t
+ffs_dirpref(struct inode *pip)
+{
+       register struct fs *fs;
+       int cg, prefcg;
+       int64_t dirsize, cgsize, curdsz;
+       int avgifree, avgbfree, avgndir;
+       int minifree, minbfree, maxndir;
+       int mincg, minndir;
+       int maxcontigdirs;
+
+       KASSERT(mutex_owned(&pip->i_ump->um_lock));
+
+       fs = pip->i_fs;
+
+       avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
+       avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+       avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
+
+       /*
+        * Force allocation in another cg if creating a first level dir.
+        */
+       if (ITOV(pip)->v_vflag & VV_ROOT) {
+               prefcg = random() % fs->fs_ncg;
+               mincg = prefcg;
+               minndir = fs->fs_ipg;
+               for (cg = prefcg; cg < fs->fs_ncg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+                           fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+                           fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               mincg = cg;
+                               minndir = fs->fs_cs(fs, cg).cs_ndir;
+                       }
+               for (cg = 0; cg < prefcg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
+                           fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
+                           fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               mincg = cg;
+                               minndir = fs->fs_cs(fs, cg).cs_ndir;
+                       }
+               return ((ino_t)(fs->fs_ipg * mincg));
+       }
+
+       /*
+        * Compute various limits used for the
+        * optimal allocation of a directory inode.
+        */
+       maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
+       minifree = avgifree - fs->fs_ipg / 4;
+       if (minifree < 0)
+               minifree = 0;
+       minbfree = avgbfree - fragstoblks(fs, fs->fs_fpg) / 4;
+       if (minbfree < 0)
+               minbfree = 0;
+       cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
+       dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
+       if (avgndir != 0) {
+               curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
+               if (dirsize < curdsz)
+                       dirsize = curdsz;
+       }
+       if (cgsize < dirsize * 255)
+               maxcontigdirs = cgsize / dirsize;
+       else
+               maxcontigdirs = 255;
+       if (fs->fs_avgfpdir > 0)
+               maxcontigdirs = min(maxcontigdirs,
+                                   fs->fs_ipg / fs->fs_avgfpdir);
+       if (maxcontigdirs == 0)
+               maxcontigdirs = 1;
+
+       /*
+        * Limit number of dirs in one cg and reserve space for
+        * regular files, but only if we have no deficit in
+        * inodes or space.
+        */
+       prefcg = ino_to_cg(fs, pip->i_number);
+       for (cg = prefcg; cg < fs->fs_ncg; cg++)
+               if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+                   fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+                   fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+                       if (fs->fs_contigdirs[cg] < maxcontigdirs)
+                               return ((ino_t)(fs->fs_ipg * cg));
+               }
+       for (cg = 0; cg < prefcg; cg++)
+               if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
+                   fs->fs_cs(fs, cg).cs_nifree >= minifree &&
+                   fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
+                       if (fs->fs_contigdirs[cg] < maxcontigdirs)
+                               return ((ino_t)(fs->fs_ipg * cg));
+               }
+       /*
+        * This is a backstop when we are deficient in space.
+        */
+       for (cg = prefcg; cg < fs->fs_ncg; cg++)
+               if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+                       return ((ino_t)(fs->fs_ipg * cg));
+       for (cg = 0; cg < prefcg; cg++)
+               if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
+                       break;
+       return ((ino_t)(fs->fs_ipg * cg));
+}
+
+/*
+ * Select the desired position for the next block in a file.  The file is
+ * logically divided into sections. The first section is composed of the
+ * direct blocks. Each additional section contains fs_maxbpg blocks.
+ *
+ * If no blocks have been allocated in the first section, the policy is to
+ * request a block in the same cylinder group as the inode that describes
+ * the file. If no blocks have been allocated in any other section, the
+ * policy is to place the section in a cylinder group with a greater than
+ * average number of free blocks.  An appropriate cylinder group is found
+ * by using a rotor that sweeps the cylinder groups. When a new group of
+ * blocks is needed, the sweep begins in the cylinder group following the
+ * cylinder group from which the previous allocation was made. The sweep
+ * continues until a cylinder group with greater than the average number
+ * of free blocks is found. If the allocation is for the first block in an
+ * indirect block, the information on the previous allocation is unavailable;
+ * here a best guess is made based upon the logical block number being
+ * allocated.
+ *
+ * If a section is already partially allocated, the policy is to
+ * contiguously allocate fs_maxcontig blocks.  The end of one of these
+ * contiguous blocks and the beginning of the next is laid out
+ * contiguously if possible.
+ *
+ * => um_lock held on entry and exit
+ */
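+/*
+ * Illustrative example: with fs_maxbpg = 2048, the direct blocks form
+ * the first section and every further run of 2048 blocks is a new
+ * section; the first block of a section is steered to a cg with at
+ * least the average number of free blocks, while every other block
+ * simply requests bap[indx - 1] + fs_frag, the next contiguous slot.
+ */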
+daddr_t
+ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
+    int32_t *bap /* XXX ondisk32 */)
+{
+       struct fs *fs;
+       int cg;
+       int avgbfree, startcg;
+
+       KASSERT(mutex_owned(&ip->i_ump->um_lock));
+
+       fs = ip->i_fs;
+
+       /*
+        * If allocating a contiguous file with B_CONTIG, use the hints
+        * in the inode extensions to return the desired block.
+        *
+        * For metadata (indirect blocks) return the address of where
+        * the first indirect block resides - we'll scan for the next
+        * available slot if we need to allocate more than one indirect
+        * block.  For data, return the address of the actual block
+        * relative to the address of the first data block.
+        */
+       if (flags & B_CONTIG) {
+               KASSERT(ip->i_ffs_first_data_blk != 0);
+               KASSERT(ip->i_ffs_first_indir_blk != 0);
+               if (flags & B_METAONLY)
+                       return ip->i_ffs_first_indir_blk;
+               else
+                       return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+       }
+
+       if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
+               if (lbn < NDADDR + NINDIR(fs)) {
+                       cg = ino_to_cg(fs, ip->i_number);
+                       return (cgbase(fs, cg) + fs->fs_frag);
+               }
+               /*
+                * Find a cylinder with greater than average number of
+                * unused data blocks.
+                */
+               if (indx == 0 || bap[indx - 1] == 0)
+                       startcg =
+                           ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+               else
+                       startcg = dtog(fs,
+                               ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
+               startcg %= fs->fs_ncg;
+               avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+               for (cg = startcg; cg < fs->fs_ncg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               return (cgbase(fs, cg) + fs->fs_frag);
+                       }
+               for (cg = 0; cg < startcg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               return (cgbase(fs, cg) + fs->fs_frag);
+                       }
+               return (0);
+       }
+       /*
+        * We just always try to lay things out contiguously.
+        */
+       return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
+}
+
+daddr_t
+ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
+    int64_t *bap)
+{
+       struct fs *fs;
+       int cg;
+       int avgbfree, startcg;
+
+       KASSERT(mutex_owned(&ip->i_ump->um_lock));
+
+       fs = ip->i_fs;
+
+       /*
+        * If allocating a contiguous file with B_CONTIG, use the hints
+        * in the inode extensions to return the desired block.
+        *
+        * For metadata (indirect blocks) return the address of where
+        * the first indirect block resides - we'll scan for the next
+        * available slot if we need to allocate more than one indirect
+        * block.  For data, return the address of the actual block
+        * relative to the address of the first data block.
+        */
+       if (flags & B_CONTIG) {
+               KASSERT(ip->i_ffs_first_data_blk != 0);
+               KASSERT(ip->i_ffs_first_indir_blk != 0);
+               if (flags & B_METAONLY)
+                       return ip->i_ffs_first_indir_blk;
+               else
+                       return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+       }
+
+       if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
+               if (lbn < NDADDR + NINDIR(fs)) {
+                       cg = ino_to_cg(fs, ip->i_number);
+                       return (cgbase(fs, cg) + fs->fs_frag);
+               }
+               /*
+                * Find a cylinder with greater than average number of
+                * unused data blocks.
+                */
+               if (indx == 0 || bap[indx - 1] == 0)
+                       startcg =
+                           ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+               else
+                       startcg = dtog(fs,
+                               ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
+               startcg %= fs->fs_ncg;
+               avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
+               for (cg = startcg; cg < fs->fs_ncg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               return (cgbase(fs, cg) + fs->fs_frag);
+                       }
+               for (cg = 0; cg < startcg; cg++)
+                       if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
+                               return (cgbase(fs, cg) + fs->fs_frag);
+                       }
+               return (0);
+       }
+       /*
+        * We just always try to lay things out contiguously.
+        */
+       return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
+}
+
+
+/*
+ * Implement the cylinder overflow algorithm.
+ *
+ * The policy implemented by this algorithm is:
+ *   1) allocate the block in its requested cylinder group.
+ *   2) quadratically rehash on the cylinder group number.
+ *   3) brute force search for a free block.
+ *
+ * => called with um_lock held
+ * => returns with um_lock released on success, held on failure
+ *    (*allocator releases lock on success, retains lock on failure)
+ */
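+/*
+ * Illustrative probe order: with fs_ncg = 16 and a preferred group of
+ * 5, step 1 tries cg 5; the quadratic rehash (i = 1, 2, 4, 8) tries
+ * cgs 6, 8, 12 and 4; the brute force pass then starts at
+ * (5 + 2) % 16 = 7 and walks the remaining groups in order.
+ */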
+/*VARARGS5*/
+static daddr_t
+ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
+    int size /* size for data blocks, mode for inodes */,
+    int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int))
+{
+       struct fs *fs;
+       daddr_t result;
+       int i, icg = cg;
+
+       fs = ip->i_fs;
+       /*
+        * 1: preferred cylinder group
+        */
+       result = (*allocator)(ip, cg, pref, size, flags);
+       if (result)
+               return (result);
+
+       if (flags & B_CONTIG)
+               return (result);
+       /*
+        * 2: quadratic rehash
+        */
+       for (i = 1; i < fs->fs_ncg; i *= 2) {
+               cg += i;
+               if (cg >= fs->fs_ncg)
+                       cg -= fs->fs_ncg;
+               result = (*allocator)(ip, cg, 0, size, flags);
+               if (result)
+                       return (result);
+       }
+       /*
+        * 3: brute force search
+        * Note that we start at i == 2, since 0 was checked initially,
+        * and 1 is always checked in the quadratic rehash.
+        */
+       cg = (icg + 2) % fs->fs_ncg;
+       for (i = 2; i < fs->fs_ncg; i++) {
+               result = (*allocator)(ip, cg, 0, size, flags);
+               if (result)
+                       return (result);
+               cg++;
+               if (cg == fs->fs_ncg)
+                       cg = 0;
+       }
+       return (0);
+}
+
+/*
+ * Determine whether a fragment can be extended.
+ *
+ * Check to see if the necessary fragments are available, and
+ * if they are, allocate them.
+ *
+ * => called with um_lock held
+ * => returns with um_lock released on success, held on failure
+ */
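+/*
+ * Illustrative example: extending a 2-fragment piece to 4 fragments
+ * succeeds only if the two fragments immediately after bprev are still
+ * free in the block map and the enlarged piece stays inside one block
+ * (the bbase check below).
+ */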
+static daddr_t
+ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
+{
+       struct ufsmount *ump;
+       struct fs *fs;
+       struct cg *cgp;
+       struct buf *bp;
+       daddr_t bno;
+       int frags, bbase;
+       int i, error;
+       u_int8_t *blksfree;
+
+       fs = ip->i_fs;
+       ump = ip->i_ump;
+
+       KASSERT(mutex_owned(&ump->um_lock));
+
+       if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
+               return (0);
+       frags = numfrags(fs, nsize);
+       bbase = fragnum(fs, bprev);
+       if (bbase > fragnum(fs, (bprev + frags - 1))) {
+               /* cannot extend across a block boundary */
+               return (0);
+       }
+       mutex_exit(&ump->um_lock);
+       error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+               (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+       if (error)
+               goto fail;
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
+               goto fail;
+       cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
+       bno = dtogd(fs, bprev);
+       blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
+       for (i = numfrags(fs, osize); i < frags; i++)
+               if (isclr(blksfree, bno + i))
+                       goto fail;
+       /*
+        * The current fragment can be extended:
+        * deduct the count of the free run being extended into,
+        * increase the count of the remaining run (if any),
+        * and allocate the extended piece.
+        */
+       for (i = frags; i < fs->fs_frag - bbase; i++)
+               if (isclr(blksfree, bno + i))
+                       break;
+       ufs_add32(cgp->cg_frsum[i - numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
+       if (i != frags)
+               ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
+       mutex_enter(&ump->um_lock);
+       for (i = numfrags(fs, osize); i < frags; i++) {
+               clrbit(blksfree, bno + i);
+               ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
+               fs->fs_cstotal.cs_nffree--;
+               fs->fs_cs(fs, cg).cs_nffree--;
+       }
+       fs->fs_fmod = 1;
+       ACTIVECG_CLR(fs, cg);
+       mutex_exit(&ump->um_lock);
+       bdwrite(bp);
+       return (bprev);
+
+ fail:
+       brelse(bp, 0);
+       mutex_enter(&ump->um_lock);
+       return (0);
+}
+
+/*
+ * Determine whether a block can be allocated.
+ *
+ * Check to see if a block of the appropriate size is available,
+ * and if it is, allocate it.
+ */
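+/*
+ * Illustrative example of the fragment path: for a 3-fragment request,
+ * cg_frsum[] is scanned from index 3 upward; if only a 5-fragment run
+ * exists, three fragments are taken from it and the 2-fragment
+ * remainder is re-counted in cg_frsum[2].
+ */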
+static daddr_t
+ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags)
+{
+       struct ufsmount *ump;
+       struct fs *fs = ip->i_fs;
+       struct cg *cgp;
+       struct buf *bp;
+       int32_t bno;
+       daddr_t blkno;
+       int error, frags, allocsiz, i;
+       u_int8_t *blksfree;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       ump = ip->i_ump;
+
+       KASSERT(mutex_owned(&ump->um_lock));
+
+       if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
+               return (0);
+       mutex_exit(&ump->um_lock);
+       error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+               (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+       if (error)
+               goto fail;
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap) ||
+           (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
+               goto fail;
+       cgp->cg_old_time = ufs_rw32(time_second, needswap);
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               cgp->cg_time = ufs_rw64(time_second, needswap);
+       if (size == fs->fs_bsize) {
+               mutex_enter(&ump->um_lock);
+               blkno = ffs_alloccgblk(ip, bp, bpref, flags);
+               ACTIVECG_CLR(fs, cg);
+               mutex_exit(&ump->um_lock);
+               bdwrite(bp);
+               return (blkno);
+       }
+       /*
+        * check to see if any fragments are already available
+        * allocsiz is the size which will be allocated, hacking
+        * it down to a smaller size if necessary
+        */
+       blksfree = cg_blksfree(cgp, needswap);
+       frags = numfrags(fs, size);
+       for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
+               if (cgp->cg_frsum[allocsiz] != 0)
+                       break;
+       if (allocsiz == fs->fs_frag) {
+               /*
+                * no fragments were available, so a block will be
+                * allocated, and hacked up
+                */
+               if (cgp->cg_cs.cs_nbfree == 0)
+                       goto fail;
+               mutex_enter(&ump->um_lock);
+               blkno = ffs_alloccgblk(ip, bp, bpref, flags);
+               bno = dtogd(fs, blkno);
+               for (i = frags; i < fs->fs_frag; i++)
+                       setbit(blksfree, bno + i);
+               i = fs->fs_frag - frags;
+               ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
+               fs->fs_cstotal.cs_nffree += i;
+               fs->fs_cs(fs, cg).cs_nffree += i;
+               fs->fs_fmod = 1;
+               ufs_add32(cgp->cg_frsum[i], 1, needswap);
+               ACTIVECG_CLR(fs, cg);
+               mutex_exit(&ump->um_lock);
+               bdwrite(bp);
+               return (blkno);
+       }
+       bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
+#if 0
+       /*
+        * XXX fvdl mapsearch will panic, and never return -1
+        *          also: returning NULL as daddr_t ?
+        */
+       if (bno < 0)
+               goto fail;
+#endif
+       for (i = 0; i < frags; i++)
+               clrbit(blksfree, bno + i);
+       mutex_enter(&ump->um_lock);
+       ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
+       fs->fs_cstotal.cs_nffree -= frags;
+       fs->fs_cs(fs, cg).cs_nffree -= frags;
+       fs->fs_fmod = 1;
+       ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
+       if (frags != allocsiz)
+               ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
+       blkno = cgbase(fs, cg) + bno;
+       ACTIVECG_CLR(fs, cg);
+       mutex_exit(&ump->um_lock);
+       bdwrite(bp);
+       return blkno;
+
+ fail:
+       brelse(bp, 0);
+       mutex_enter(&ump->um_lock);
+       return (0);
+}
+
+/*
+ * Allocate a block in a cylinder group.
+ *
+ * This algorithm implements the following policy:
+ *   1) allocate the requested block.
+ *   2) allocate a rotationally optimal block in the same cylinder.
+ *   3) allocate the next available block on the block rotor for the
+ *      specified cylinder group.
+ * Note that this routine only allocates fs_bsize blocks; these
+ * blocks may be fragmented by the routine that allocates them.
+ */
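+/*
+ * Illustrative example: a bpref inside this cg is rounded down to a
+ * block boundary with blknum(); if that block is free it is taken
+ * directly, otherwise ffs_mapsearch() scans onward from there.  With
+ * no usable preference the scan starts at cg_rotor, the point of the
+ * previous allocation.
+ */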
+static daddr_t
+ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags)
+{
+       struct ufsmount *ump;
+       struct fs *fs = ip->i_fs;
+       struct cg *cgp;
+       int cg;
+       daddr_t blkno;
+       int32_t bno;
+       u_int8_t *blksfree;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       ump = ip->i_ump;
+
+       KASSERT(mutex_owned(&ump->um_lock));
+
+       cgp = (struct cg *)bp->b_data;
+       blksfree = cg_blksfree(cgp, needswap);
+       if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
+               bpref = ufs_rw32(cgp->cg_rotor, needswap);
+       } else {
+               bpref = blknum(fs, bpref);
+               bno = dtogd(fs, bpref);
+               /*
+                * if the requested block is available, use it
+                */
+               if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
+                       goto gotit;
+               /*
+                * if the requested data block isn't available and we are
+                * trying to allocate a contiguous file, return an error.
+                */
+               if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
+                       return (0);
+       }
+
+       /*
+        * Take the next available block in this cylinder group.
+        */
+       bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
+       if (bno < 0)
+               return (0);
+       cgp->cg_rotor = ufs_rw32(bno, needswap);
+gotit:
+       blkno = fragstoblks(fs, bno);
+       ffs_clrblock(fs, blksfree, blkno);
+       ffs_clusteracct(fs, cgp, blkno, -1);
+       ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+       fs->fs_cstotal.cs_nbfree--;
+       fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
+       if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+           ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+               int cylno;
+               cylno = old_cbtocylno(fs, bno);
+               KASSERT(cylno >= 0);
+               KASSERT(cylno < fs->fs_old_ncyl);
+               KASSERT(old_cbtorpos(fs, bno) >= 0);
+               KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
+               ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
+                   needswap);
+               ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
+       }
+       fs->fs_fmod = 1;
+       cg = ufs_rw32(cgp->cg_cgx, needswap);
+       blkno = cgbase(fs, cg) + bno;
+       return (blkno);
+}
+
+/*
+ * Determine whether an inode can be allocated.
+ *
+ * Check to see if an inode is available, and if it is,
+ * allocate it using the following policy:
+ *   1) allocate the requested inode.
+ *   2) allocate the next available inode after the requested
+ *      inode in the specified cylinder group.
+ */
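+/*
+ * Illustrative example: if the requested inode (ipref % fs_ipg) is
+ * taken, skpc() skips the fully used 0xff bytes of the inosused bitmap
+ * from cg_irotor onward, and the first clear bit of the first non-full
+ * byte becomes the inode that is handed out.
+ */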
+static daddr_t
+ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags)
+{
+       struct ufsmount *ump = ip->i_ump;
+       struct fs *fs = ip->i_fs;
+       struct cg *cgp;
+       struct buf *bp, *ibp;
+       u_int8_t *inosused;
+       int error, start, len, loc, map, i;
+       int32_t initediblk;
+       daddr_t nalloc;
+       struct ufs2_dinode *dp2;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       KASSERT(mutex_owned(&ump->um_lock));
+       UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
+
+       if (fs->fs_cs(fs, cg).cs_nifree == 0)
+               return (0);
+       mutex_exit(&ump->um_lock);
+       ibp = NULL;
+       initediblk = -1;
+retry:
+       error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+               (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+       if (error)
+               goto fail;
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
+               goto fail;
+
+       if (ibp != NULL &&
+           initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
+               /* Another thread allocated more inodes so we retry the test. */
+               brelse(ibp, 0);
+               ibp = NULL;
+       }
+       /*
+        * Check to see if we need to initialize more inodes.
+        */
+       if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
+               initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
+               nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
+               if (nalloc + INOPB(fs) > initediblk &&
+                   initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
+                       /*
+                        * We have to release the cg buffer here to prevent
+                        * a deadlock: reading the inode block may run
+                        * a copy-on-write that uses this cg.
+                        */
+                       brelse(bp, 0);
+                       bp = NULL;
+                       error = ffs_getblk(ip->i_devvp, fsbtodb(fs,
+                           ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
+                           FFS_NOBLK, fs->fs_bsize, false, &ibp);
+                       if (error)
+                               goto fail;
+                       goto retry;
+               }
+       }
+
+       cgp->cg_old_time = ufs_rw32(time_second, needswap);
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               cgp->cg_time = ufs_rw64(time_second, needswap);
+       inosused = cg_inosused(cgp, needswap);
+       if (ipref) {
+               ipref %= fs->fs_ipg;
+               if (isclr(inosused, ipref))
+                       goto gotit;
+       }
+       start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
+       len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
+               NBBY);
+       loc = skpc(0xff, len, &inosused[start]);
+       if (loc == 0) {
+               len = start + 1;
+               start = 0;
+               loc = skpc(0xff, len, &inosused[0]);
+               if (loc == 0) {
+                       printf("cg = %d, irotor = %d, fs = %s\n",
+                           cg, ufs_rw32(cgp->cg_irotor, needswap),
+                               fs->fs_fsmnt);
+                       panic("ffs_nodealloccg: map corrupted");
+                       /* NOTREACHED */
+               }
+       }
+       i = start + len - loc;
+       map = inosused[i] ^ 0xff;
+       if (map == 0) {
+               printf("fs = %s\n", fs->fs_fsmnt);
+               panic("ffs_nodealloccg: block not in map");
+       }
+       ipref = i * NBBY + ffs(map) - 1;
+       cgp->cg_irotor = ufs_rw32(ipref, needswap);
+gotit:
+       UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
+           mode);
+       /*
+        * Check to see if we need to initialize more inodes.
+        */
+       if (ibp != NULL) {
+               KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
+               memset(ibp->b_data, 0, fs->fs_bsize);
+               dp2 = (struct ufs2_dinode *)(ibp->b_data);
+               for (i = 0; i < INOPB(fs); i++) {
+                       /*
+                        * Don't bother to swap, it's supposed to be
+                        * random, after all.
+                        */
+                       dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
+                       dp2++;
+               }
+               initediblk += INOPB(fs);
+               cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
+       }
+
+       mutex_enter(&ump->um_lock);
+       ACTIVECG_CLR(fs, cg);
+       setbit(inosused, ipref);
+       ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
+       fs->fs_cstotal.cs_nifree--;
+       fs->fs_cs(fs, cg).cs_nifree--;
+       fs->fs_fmod = 1;
+       if ((mode & IFMT) == IFDIR) {
+               ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
+               fs->fs_cstotal.cs_ndir++;
+               fs->fs_cs(fs, cg).cs_ndir++;
+       }
+       mutex_exit(&ump->um_lock);
+       if (ibp != NULL) {
+               bwrite(bp);
+               bawrite(ibp);
+       } else
+               bdwrite(bp);
+       return (cg * fs->fs_ipg + ipref);
+ fail:
+       if (bp != NULL)
+               brelse(bp, 0);
+       if (ibp != NULL)
+               brelse(ibp, 0);
+       mutex_enter(&ump->um_lock);
+       return (0);
+}
+
+/*
+ * Allocate a block or fragment.
+ *
+ * The specified block or fragment is removed from the
+ * free map, possibly fragmenting a block in the process.
+ *
+ * This implementation should mirror ffs_blkfree()
+ *
+ * => um_lock not held on entry or exit
+ */
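+/*
+ * Illustrative example: claiming a 2-fragment piece out of a fully
+ * free block first accounts the block as split (fs_frag new free
+ * fragments, one fewer free block) and then marks the two requested
+ * fragments as used.
+ */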
+int
+ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
+{
+       int error;
+
+       error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
+           ip->i_dev, ip->i_uid);
+       if (error)
+               return error;
+
+       return ffs_blkalloc_ump(ip->i_ump, bno, size);
+}
+
+int
+ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
+{
+       struct fs *fs = ump->um_fs;
+       struct cg *cgp;
+       struct buf *bp;
+       int32_t fragno, cgbno;
+       int i, error, cg, blk, frags, bbase;
+       u_int8_t *blksfree;
+       const int needswap = UFS_FSNEEDSWAP(fs);
+
+       KASSERT((u_int)size <= fs->fs_bsize && fragoff(fs, size) == 0 &&
+           fragnum(fs, bno) + numfrags(fs, size) <= fs->fs_frag);
+       KASSERT(bno < fs->fs_size);
+
+       cg = dtog(fs, bno);
+       error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)),
+               (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return error;
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap)) {
+               brelse(bp, 0);
+               return EIO;
+       }
+       cgp->cg_old_time = ufs_rw32(time_second, needswap);
+       cgp->cg_time = ufs_rw64(time_second, needswap);
+       cgbno = dtogd(fs, bno);
+       blksfree = cg_blksfree(cgp, needswap);
+
+       mutex_enter(&ump->um_lock);
+       if (size == fs->fs_bsize) {
+               fragno = fragstoblks(fs, cgbno);
+               if (!ffs_isblock(fs, blksfree, fragno)) {
+                       mutex_exit(&ump->um_lock);
+                       brelse(bp, 0);
+                       return EBUSY;
+               }
+               ffs_clrblock(fs, blksfree, fragno);
+               ffs_clusteracct(fs, cgp, fragno, -1);
+               ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+               fs->fs_cstotal.cs_nbfree--;
+               fs->fs_cs(fs, cg).cs_nbfree--;
+       } else {
+               bbase = cgbno - fragnum(fs, cgbno);
+
+               frags = numfrags(fs, size);
+               for (i = 0; i < frags; i++) {
+                       if (isclr(blksfree, cgbno + i)) {
+                               mutex_exit(&ump->um_lock);
+                               brelse(bp, 0);
+                               return EBUSY;
+                       }
+               }
+               /*
+                * if a complete block is being split, account for it
+                */
+               fragno = fragstoblks(fs, bbase);
+               if (ffs_isblock(fs, blksfree, fragno)) {
+                       ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
+                       fs->fs_cstotal.cs_nffree += fs->fs_frag;
+                       fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
+                       ffs_clusteracct(fs, cgp, fragno, -1);
+                       ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+                       fs->fs_cstotal.cs_nbfree--;
+                       fs->fs_cs(fs, cg).cs_nbfree--;
+               }
+               /*
+                * decrement the counts associated with the old frags
+                */
+               blk = blkmap(fs, blksfree, bbase);
+               ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
+               /*
+                * allocate the fragment
+                */
+               for (i = 0; i < frags; i++) {
+                       clrbit(blksfree, cgbno + i);
+               }
+               ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
+               fs->fs_cstotal.cs_nffree -= i;
+               fs->fs_cs(fs, cg).cs_nffree -= i;
+               /*
+                * add back in counts associated with the new frags
+                */
+               blk = blkmap(fs, blksfree, bbase);
+               ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
+       }
+       fs->fs_fmod = 1;
+       ACTIVECG_CLR(fs, cg);
+       mutex_exit(&ump->um_lock);
+       bdwrite(bp);
+       return 0;
+}
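+
+/*
+ * Worked sketch of the fragment bookkeeping above, assuming
+ * fs_frag == 8 and a fully free block at bbase from which 3 frags
+ * are allocated:
+ *
+ *     the run of 8 is first converted to fragments
+ *             (cs_nbfree--, cs_nffree += 8, cluster count down),
+ *     the old run is removed from cg_frsum
+ *             (ffs_fragacct(..., -1, ...)),
+ *     3 bits are cleared in blksfree (cs_nffree -= 3), and
+ *     the remaining run of 5 is re-entered
+ *             (ffs_fragacct(..., 1, ...)),
+ *
+ * leaving a net cs_nffree change of +5 for the split block.
+ */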
+
+/*
+ * Free a block or fragment.
+ *
+ * The specified block or fragment is placed back in the
+ * free map. If a fragment is deallocated, a possible
+ * block reassembly is checked.
+ *
+ * => um_lock not held on entry or exit
+ */
+void
+ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
+    ino_t inum)
+{
+       struct cg *cgp;
+       struct buf *bp;
+       struct ufsmount *ump;
+       daddr_t cgblkno;
+       int error, cg;
+       dev_t dev;
+       const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       KASSERT(!devvp_is_snapshot);
+
+       cg = dtog(fs, bno);
+       dev = devvp->v_rdev;
+       ump = VFSTOUFS(devvp->v_specmountpoint);
+       KASSERT(fs == ump->um_fs);
+       cgblkno = fsbtodb(fs, cgtod(fs, cg));
+       if (ffs_snapblkfree(fs, devvp, bno, size, inum))
+               return;
+
+       error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
+       if (error)
+               return;
+
+       error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
+           NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return;
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap)) {
+               brelse(bp, 0);
+               return;
+       }
+
+       ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
+
+       bdwrite(bp);
+}
+
+/*
+ * Free a block or fragment from a snapshot cg copy.
+ *
+ * The specified block or fragment is placed back in the
+ * free map. If a fragment is deallocated, a possible
+ * block reassembly is checked.
+ *
+ * => um_lock not held on entry or exit
+ */
+void
+ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
+    ino_t inum)
+{
+       struct cg *cgp;
+       struct buf *bp;
+       struct ufsmount *ump;
+       daddr_t cgblkno;
+       int error, cg;
+       dev_t dev;
+       const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       KASSERT(devvp_is_snapshot);
+
+       cg = dtog(fs, bno);
+       dev = VTOI(devvp)->i_devvp->v_rdev;
+       ump = VFSTOUFS(devvp->v_mount);
+       cgblkno = fragstoblks(fs, cgtod(fs, cg));
+
+       error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
+       if (error)
+               return;
+
+       error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
+           NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return;
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap)) {
+               brelse(bp, 0);
+               return;
+       }
+
+       ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
+
+       bdwrite(bp);
+}
+
+static void
+ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
+    struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
+{
+       struct cg *cgp;
+       int32_t fragno, cgbno;
+       int i, cg, blk, frags, bbase;
+       u_int8_t *blksfree;
+       const int needswap = UFS_FSNEEDSWAP(fs);
+
+       cg = dtog(fs, bno);
+       cgp = (struct cg *)bp->b_data;
+       cgp->cg_old_time = ufs_rw32(time_second, needswap);
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               cgp->cg_time = ufs_rw64(time_second, needswap);
+       cgbno = dtogd(fs, bno);
+       blksfree = cg_blksfree(cgp, needswap);
+       mutex_enter(&ump->um_lock);
+       if (size == fs->fs_bsize) {
+               fragno = fragstoblks(fs, cgbno);
+               if (!ffs_isfreeblock(fs, blksfree, fragno)) {
+                       if (devvp_is_snapshot) {
+                               mutex_exit(&ump->um_lock);
+                               return;
+                       }
+                       printf("dev = 0x%llx, block = %" PRId64 ", fs = %s\n",
+                           (unsigned long long)dev, bno, fs->fs_fsmnt);
+                       panic("blkfree: freeing free block");
+               }
+               ffs_setblock(fs, blksfree, fragno);
+               ffs_clusteracct(fs, cgp, fragno, 1);
+               ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
+               fs->fs_cstotal.cs_nbfree++;
+               fs->fs_cs(fs, cg).cs_nbfree++;
+               if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+                   ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+                       i = old_cbtocylno(fs, cgbno);
+                       KASSERT(i >= 0);
+                       KASSERT(i < fs->fs_old_ncyl);
+                       KASSERT(old_cbtorpos(fs, cgbno) >= 0);
+                       KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
+                       ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
+                           needswap);
+                       ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
+               }
+       } else {
+               bbase = cgbno - fragnum(fs, cgbno);
+               /*
+                * decrement the counts associated with the old frags
+                */
+               blk = blkmap(fs, blksfree, bbase);
+               ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
+               /*
+                * deallocate the fragment
+                */
+               frags = numfrags(fs, size);
+               for (i = 0; i < frags; i++) {
+                       if (isset(blksfree, cgbno + i)) {
+                               printf("dev = 0x%llx, block = %" PRId64
+                                      ", fs = %s\n",
+                                   (unsigned long long)dev, bno + i,
+                                   fs->fs_fsmnt);
+                               panic("blkfree: freeing free frag");
+                       }
+                       setbit(blksfree, cgbno + i);
+               }
+               ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
+               fs->fs_cstotal.cs_nffree += i;
+               fs->fs_cs(fs, cg).cs_nffree += i;
+               /*
+                * add back in counts associated with the new frags
+                */
+               blk = blkmap(fs, blksfree, bbase);
+               ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
+               /*
+                * if a complete block has been reassembled, account for it
+                */
+               fragno = fragstoblks(fs, bbase);
+               if (ffs_isblock(fs, blksfree, fragno)) {
+                       ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
+                       fs->fs_cstotal.cs_nffree -= fs->fs_frag;
+                       fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
+                       ffs_clusteracct(fs, cgp, fragno, 1);
+                       ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
+                       fs->fs_cstotal.cs_nbfree++;
+                       fs->fs_cs(fs, cg).cs_nbfree++;
+                       if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+                           ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
+                               i = old_cbtocylno(fs, bbase);
+                               KASSERT(i >= 0);
+                               KASSERT(i < fs->fs_old_ncyl);
+                               KASSERT(old_cbtorpos(fs, bbase) >= 0);
+                               KASSERT(fs->fs_old_nrpos == 0 || old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
+                               ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
+                                   bbase)], 1, needswap);
+                               ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
+                       }
+               }
+       }
+       fs->fs_fmod = 1;
+       ACTIVECG_CLR(fs, cg);
+       mutex_exit(&ump->um_lock);
+}
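+
+/*
+ * Illustrative calls into the two wrappers above; they differ only in
+ * how the cg copy is addressed (fsbtodb() on the device vs.
+ * fragstoblks() within a snapshot file) before both land here:
+ *
+ *     ffs_blkfree(fs, ip->i_devvp, bno, size, ip->i_number);
+ *     ffs_blkfree_snap(fs, snapvp, bno, size, inum);
+ *
+ * where snapvp stands for a snapshot vnode (illustrative name).
+ * um_lock is taken and released internally in both cases.
+ */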
+
+/*
+ * Free an inode.
+ */
+int
+ffs_vfree(struct vnode *vp, ino_t ino, int mode)
+{
+
+       return ffs_freefile(vp->v_mount, ino, mode);
+}
+
+/*
+ * Do the actual free operation.
+ * The specified inode is placed back in the free map.
+ *
+ * => um_lock not held on entry or exit
+ */
+int
+ffs_freefile(struct mount *mp, ino_t ino, int mode)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       struct vnode *devvp;
+       struct cg *cgp;
+       struct buf *bp;
+       int error, cg;
+       daddr_t cgbno;
+       dev_t dev;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       cg = ino_to_cg(fs, ino);
+       devvp = ump->um_devvp;
+       dev = devvp->v_rdev;
+       cgbno = fsbtodb(fs, cgtod(fs, cg));
+
+       if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+               panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+                   (unsigned long long)dev, (unsigned long long)ino,
+                   fs->fs_fsmnt);
+       error = bread(devvp, cgbno, (int)fs->fs_cgsize,
+           NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap)) {
+               brelse(bp, 0);
+               return (0);
+       }
+
+       ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
+
+       bdwrite(bp);
+
+       return 0;
+}
+
+int
+ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
+{
+       struct ufsmount *ump;
+       struct cg *cgp;
+       struct buf *bp;
+       int error, cg;
+       daddr_t cgbno;
+       dev_t dev;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       KASSERT(devvp->v_type != VBLK);
+
+       cg = ino_to_cg(fs, ino);
+       dev = VTOI(devvp)->i_devvp->v_rdev;
+       ump = VFSTOUFS(devvp->v_mount);
+       cgbno = fragstoblks(fs, cgtod(fs, cg));
+       if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+               panic("ifree: range: dev = 0x%llx, ino = %llu, fs = %s",
+                   (unsigned long long)dev, (unsigned long long)ino,
+                   fs->fs_fsmnt);
+       error = bread(devvp, cgbno, (int)fs->fs_cgsize,
+           NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, needswap)) {
+               brelse(bp, 0);
+               return (0);
+       }
+       ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
+
+       bdwrite(bp);
+
+       return 0;
+}
+
+static void
+ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
+    struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
+{
+       int cg;
+       struct cg *cgp;
+       u_int8_t *inosused;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       cg = ino_to_cg(fs, ino);
+       cgp = (struct cg *)bp->b_data;
+       cgp->cg_old_time = ufs_rw32(time_second, needswap);
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               cgp->cg_time = ufs_rw64(time_second, needswap);
+       inosused = cg_inosused(cgp, needswap);
+       ino %= fs->fs_ipg;
+       if (isclr(inosused, ino)) {
+               printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
+                   (unsigned long long)dev, (unsigned long long)ino +
+                   cg * fs->fs_ipg, fs->fs_fsmnt);
+               if (fs->fs_ronly == 0)
+                       panic("ifree: freeing free inode");
+       }
+       clrbit(inosused, ino);
+       if (!devvp_is_snapshot)
+               UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
+                   ino + cg * fs->fs_ipg, mode);
+       if (ino < ufs_rw32(cgp->cg_irotor, needswap))
+               cgp->cg_irotor = ufs_rw32(ino, needswap);
+       ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
+       mutex_enter(&ump->um_lock);
+       fs->fs_cstotal.cs_nifree++;
+       fs->fs_cs(fs, cg).cs_nifree++;
+       if ((mode & IFMT) == IFDIR) {
+               ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
+               fs->fs_cstotal.cs_ndir--;
+               fs->fs_cs(fs, cg).cs_ndir--;
+       }
+       fs->fs_fmod = 1;
+       ACTIVECG_CLR(fs, cg);
+       mutex_exit(&ump->um_lock);
+}
+
+/*
+ * Check to see if a file is free.
+ */
+int
+ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
+{
+       struct cg *cgp;
+       struct buf *bp;
+       daddr_t cgbno;
+       int ret, cg;
+       u_int8_t *inosused;
+       const bool devvp_is_snapshot = (devvp->v_type != VBLK);
+
+       KASSERT(devvp_is_snapshot);
+
+       cg = ino_to_cg(fs, ino);
+       if (devvp_is_snapshot)
+               cgbno = fragstoblks(fs, cgtod(fs, cg));
+       else
+               cgbno = fsbtodb(fs, cgtod(fs, cg));
+       if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
+               return 1;
+       if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, 0, &bp)) {
+               brelse(bp, 0);
+               return 1;
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
+               brelse(bp, 0);
+               return 1;
+       }
+       inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
+       ino %= fs->fs_ipg;
+       ret = isclr(inosused, ino);
+       brelse(bp, 0);
+       return ret;
+}
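+
+/*
+ * Illustrative call (the KASSERT above restricts this to snapshot
+ * vnodes): ask a snapshot's cg copy whether `ino' was free when the
+ * snapshot was taken; read or range failures are reported as free:
+ *
+ *     if (ffs_checkfreefile(fs, snapvp, ino))
+ *             (inode was not in use in this snapshot)
+ */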
+
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ *
+ * It is a panic if a request is made to find a block if none are
+ * available.
+ */
+static int32_t
+ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
+{
+       int32_t bno;
+       int start, len, loc, i;
+       int blk, field, subfield, pos;
+       int ostart, olen;
+       u_int8_t *blksfree;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       /* KASSERT(mutex_owned(&ump->um_lock)); */
+
+       /*
+        * find the fragment by searching through the free block
+        * map for an appropriate bit pattern
+        */
+       if (bpref)
+               start = dtogd(fs, bpref) / NBBY;
+       else
+               start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
+       blksfree = cg_blksfree(cgp, needswap);
+       len = howmany(fs->fs_fpg, NBBY) - start;
+       ostart = start;
+       olen = len;
+       loc = scanc((u_int)len,
+               (const u_char *)&blksfree[start],
+               (const u_char *)fragtbl[fs->fs_frag],
+               (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
+       if (loc == 0) {
+               len = start + 1;
+               start = 0;
+               loc = scanc((u_int)len,
+                       (const u_char *)&blksfree[0],
+                       (const u_char *)fragtbl[fs->fs_frag],
+                       (1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
+               if (loc == 0) {
+                       printf("start = %d, len = %d, fs = %s\n",
+                           ostart, olen, fs->fs_fsmnt);
+                       printf("offset=%d %ld\n",
+                               ufs_rw32(cgp->cg_freeoff, needswap),
+                               (long)blksfree - (long)cgp);
+                       printf("cg %d\n", cgp->cg_cgx);
+                       panic("ffs_alloccg: map corrupted");
+                       /* NOTREACHED */
+               }
+       }
+       bno = (start + len - loc) * NBBY;
+       cgp->cg_frotor = ufs_rw32(bno, needswap);
+       /*
+        * found the byte in the map
+        * sift through the bits to find the selected frag
+        */
+       for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
+               blk = blkmap(fs, blksfree, bno);
+               blk <<= 1;
+               field = around[allocsiz];
+               subfield = inside[allocsiz];
+               for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
+                       if ((blk & field) == subfield)
+                               return (bno + pos);
+                       field <<= 1;
+                       subfield <<= 1;
+               }
+       }
+       printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
+       panic("ffs_alloccg: block not in map");
+       /* return (-1); */
+}
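+
+/*
+ * Sketch of the two-level search above: scanc() finds a free-map byte
+ * whose fragtbl[] summary flags a suitable free-fragment run; the
+ * inner loop then slides the (field, subfield) window across each
+ * block in that byte and returns bno + pos at the first match.  For
+ * example, with fs_frag == 8 and allocsiz == 4 the scanc() mask is
+ *
+ *     1 << (4 - 1 + (8 & (NBBY - 1))) == 1 << 3
+ *
+ * selecting bytes whose summary records a run of four free frags.
+ */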
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ *     fs: error message
+ */
+static void
+ffs_fserr(struct fs *fs, u_int uid, const char *cp)
+{
+
+       log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
+           uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
+}
diff --git a/sys/ufs/ffs/ffs_appleufs.c b/sys/ufs/ffs/ffs_appleufs.c
new file mode 100644 (file)
index 0000000..0067d40
--- /dev/null
@@ -0,0 +1,154 @@
+/*     $NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $    */
+
+/*
+ * Copyright (c) 2002 Darrin B. Jewell
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_appleufs.c,v 1.12 2011/11/19 22:51:31 tls Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#if defined(_KERNEL)
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/cprng.h>
+#endif
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#if !defined(_KERNEL) && !defined(STANDALONE)
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#define KASSERT(x) assert(x)
+#endif
+
+/*
+ * this is the same calculation as in_cksum
+ */
+u_int16_t
+ffs_appleufs_cksum(const struct appleufslabel *appleufs)
+{
+       const u_int16_t *p = (const u_int16_t *)appleufs;
+       int len = APPLEUFS_LABEL_SIZE; /* sizeof(struct appleufslabel) */
+       long res = 0;
+       while (len > 1)  {
+               res += *p++;
+               len -= 2;
+       }
+#if 0 /* APPLEUFS_LABEL_SIZE is guaranteed to be even */
+       if (len == 1)
+               res += htobe16(*(u_char *)p<<8);
+#endif
+       res = (res >> 16) + (res & 0xffff);
+       res += (res >> 16);
+       return (~res);
+}
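+
+/*
+ * Worked example of the fold above: summing the 16-bit words 0x1234
+ * and 0xf00d gives res = 0x10241; the first fold yields
+ * 0x0001 + 0x0241 = 0x0242, the second adds a zero carry, and the
+ * ones' complement result is (u_int16_t)~0x0242 == 0xfdbd.
+ */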
+
+/* copies o to n, validating and byteswapping along the way
+ * returns 0 if ok, EINVAL if not valid
+ */
+int
+ffs_appleufs_validate(const char *name, const struct appleufslabel *o,
+    struct appleufslabel *n)
+{
+       struct appleufslabel tmp;
+       if (!n) n = &tmp;
+
+       if (o->ul_magic != be32toh(APPLEUFS_LABEL_MAGIC)) {
+               return EINVAL;
+       }
+       *n = *o;
+       n->ul_checksum = 0;
+       n->ul_checksum = ffs_appleufs_cksum(n);
+       if (n->ul_checksum != o->ul_checksum) {
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+               printf("%s: invalid APPLE UFS checksum. found 0x%x, expecting 0x%x",
+                       name,o->ul_checksum,n->ul_checksum);
+#endif
+               return EINVAL;
+       }
+       n->ul_magic = be32toh(o->ul_magic);
+       n->ul_version = be32toh(o->ul_version);
+       n->ul_time = be32toh(o->ul_time);
+       n->ul_namelen = be16toh(o->ul_namelen);
+
+       if (n->ul_namelen > APPLEUFS_MAX_LABEL_NAME) {
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+               printf("%s: APPLE UFS label name too long, truncated.\n",
+                               name);
+#endif
+               n->ul_namelen = APPLEUFS_MAX_LABEL_NAME;
+       }
+       /* if len is max, will set ul_unused1 */
+       n->ul_name[n->ul_namelen] = '\0';
+
+#ifdef DEBUG
+       printf("%s: found APPLE UFS label v%d: \"%s\"\n",
+           name, n->ul_version, n->ul_name);
+#endif
+       n->ul_uuid = be64toh(o->ul_uuid);
+
+       return 0;
+}
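+
+/*
+ * Minimal usage sketch (hypothetical caller and device name): validate
+ * a raw label read from disk into a host-order copy before trusting
+ * any of its fields:
+ *
+ *     struct appleufslabel raw, lab;
+ *     (read APPLEUFS_LABEL_SIZE bytes into raw, e.g. from "sd0a")
+ *     if (ffs_appleufs_validate("sd0a", &raw, &lab) == 0)
+ *             printf("label: %s\n", lab.ul_name);
+ */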
+
+void
+ffs_appleufs_set(struct appleufslabel *appleufs, const char *name, time_t t,
+    uint64_t uuid)
+{
+       size_t namelen;
+       if (!name) name = "untitled";
+       if (t == ((time_t)-1)) {
+#if defined(_KERNEL)
+               t = time_second;
+#elif defined(STANDALONE)
+               t = 0;
+#else
+               (void)time(&t);
+#endif
+       }
+       if (uuid == 0) {
+#if defined(_KERNEL) && !defined(STANDALONE)
+               uuid = cprng_fast64();
+#endif
+       }
+       namelen = strlen(name);
+       if (namelen > APPLEUFS_MAX_LABEL_NAME)
+               namelen = APPLEUFS_MAX_LABEL_NAME;
+       memset(appleufs, 0, APPLEUFS_LABEL_SIZE);
+       appleufs->ul_magic   = htobe32(APPLEUFS_LABEL_MAGIC);
+       appleufs->ul_version = htobe32(APPLEUFS_LABEL_VERSION);
+       appleufs->ul_time    = htobe32((u_int32_t)t);
+       appleufs->ul_namelen = htobe16(namelen);
+       strncpy(appleufs->ul_name, name, namelen);
+       appleufs->ul_uuid    = htobe64(uuid);
+       appleufs->ul_checksum = ffs_appleufs_cksum(appleufs);
+}
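+
+/*
+ * Minimal usage sketch (hypothetical newfs-style caller): passing
+ * t == (time_t)-1 and uuid == 0 picks up the defaults above
+ * (current time, random uuid in the kernel):
+ *
+ *     struct appleufslabel lab;
+ *     ffs_appleufs_set(&lab, "MyDisk", (time_t)-1, 0);
+ */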
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
new file mode 100644 (file)
index 0000000..3683cbb
--- /dev/null
@@ -0,0 +1,1051 @@
+/*     $NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $  */
+
+/*
+ * Copyright (c) 2002 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project by Marshall
+ * Kirk McKusick and Network Associates Laboratories, the Security
+ * Research Division of Network Associates, Inc. under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
+ * research program
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_balloc.c        8.8 (Berkeley) 6/16/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.54 2011/04/23 07:36:02 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/fstrans.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+static int ffs_balloc_ufs1(struct vnode *, off_t, int, kauth_cred_t, int,
+    struct buf **);
+static int ffs_balloc_ufs2(struct vnode *, off_t, int, kauth_cred_t, int,
+    struct buf **);
+
+/*
+ * Balloc defines the structure of file system storage
+ * by allocating the physical blocks on a device given
+ * the inode and the logical block number in a file.
+ */
+
+int
+ffs_balloc(struct vnode *vp, off_t off, int size, kauth_cred_t cred, int flags,
+    struct buf **bpp)
+{
+       int error;
+
+       if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC)
+               error = ffs_balloc_ufs2(vp, off, size, cred, flags, bpp);
+       else
+               error = ffs_balloc_ufs1(vp, off, size, cred, flags, bpp);
+
+       if (error == 0 && bpp != NULL && (error = fscow_run(*bpp, false)) != 0)
+               brelse(*bpp, 0);
+
+       return error;
+}
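+
+/*
+ * Illustrative call (hypothetical write path): allocate backing store
+ * for `len' bytes at offset `off' and get the buffer back:
+ *
+ *     error = ffs_balloc(vp, off, len, cred, B_CLRBUF, &bp);
+ *     if (error == 0)
+ *             bdwrite(bp);
+ *
+ * A NULL bpp allocates the blocks without returning a buffer.
+ */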
+
+static int
+ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
+    int flags, struct buf **bpp)
+{
+       daddr_t lbn, lastlbn;
+       struct buf *bp, *nbp;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct ufsmount *ump = ip->i_ump;
+       struct indir indirs[NIADDR + 2];
+       daddr_t newb, pref, nb;
+       int32_t *bap;   /* XXX ondisk32 */
+       int deallocated, osize, nsize, num, i, error;
+       int32_t *blkp, *allocblk, allociblk[NIADDR + 1];
+       int32_t *allocib;
+       int unwindidx = -1;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+       UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
+
+       lbn = lblkno(fs, off);
+       size = blkoff(fs, off) + size;
+       if (size > fs->fs_bsize)
+               panic("ffs_balloc: blk too big");
+       if (bpp != NULL) {
+               *bpp = NULL;
+       }
+       UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size, 0);
+
+       if (lbn < 0)
+               return (EFBIG);
+
+       /*
+        * If the next write will extend the file into a new block,
+        * and the file is currently composed of a fragment
+        * this fragment has to be extended to be a full block.
+        */
+
+       lastlbn = lblkno(fs, ip->i_size);
+       if (lastlbn < NDADDR && lastlbn < lbn) {
+               nb = lastlbn;
+               osize = blksize(fs, ip, nb);
+               if (osize < fs->fs_bsize && osize > 0) {
+                       mutex_enter(&ump->um_lock);
+                       error = ffs_realloccg(ip, nb,
+                                   ffs_blkpref_ufs1(ip, lastlbn, nb, flags,
+                                       &ip->i_ffs1_db[0]),
+                                   osize, (int)fs->fs_bsize, cred, bpp, &newb);
+                       if (error)
+                               return (error);
+                       ip->i_size = lblktosize(fs, nb + 1);
+                       ip->i_ffs1_size = ip->i_size;
+                       uvm_vnp_setsize(vp, ip->i_ffs1_size);
+                       ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap);
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (bpp && *bpp) {
+                               if (flags & B_SYNC)
+                                       bwrite(*bpp);
+                               else
+                                       bawrite(*bpp);
+                       }
+               }
+       }
+
+       /*
+        * The first NDADDR blocks are direct blocks
+        */
+
+       if (lbn < NDADDR) {
+               nb = ufs_rw32(ip->i_ffs1_db[lbn], needswap);
+               if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
+
+                       /*
+                        * The block is an already-allocated direct block
+                        * and the file already extends past this block,
+                        * thus this must be a whole block.
+                        * Just read the block (if requested).
+                        */
+
+                       if (bpp != NULL) {
+                               error = bread(vp, lbn, fs->fs_bsize, NOCRED,
+                                             B_MODIFY, bpp);
+                               if (error) {
+                                       brelse(*bpp, 0);
+                                       return (error);
+                               }
+                       }
+                       return (0);
+               }
+               if (nb != 0) {
+
+                       /*
+                        * Consider need to reallocate a fragment.
+                        */
+
+                       osize = fragroundup(fs, blkoff(fs, ip->i_size));
+                       nsize = fragroundup(fs, size);
+                       if (nsize <= osize) {
+
+                               /*
+                                * The existing block is already
+                                * at least as big as we want.
+                                * Just read the block (if requested).
+                                */
+
+                               if (bpp != NULL) {
+                                       error = bread(vp, lbn, osize, NOCRED,
+                                                     B_MODIFY, bpp);
+                                       if (error) {
+                                               brelse(*bpp, 0);
+                                               return (error);
+                                       }
+                               }
+                               return 0;
+                       } else {
+
+                               /*
+                                * The existing block is smaller than we want,
+                                * grow it.
+                                */
+                               mutex_enter(&ump->um_lock);
+                               error = ffs_realloccg(ip, lbn,
+                                   ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
+                                       &ip->i_ffs1_db[0]),
+                                   osize, nsize, cred, bpp, &newb);
+                               if (error)
+                                       return (error);
+                       }
+               } else {
+
+                       /*
+                        * the block was not previously allocated,
+                        * allocate a new block or fragment.
+                        */
+
+                       if (ip->i_size < lblktosize(fs, lbn + 1))
+                               nsize = fragroundup(fs, size);
+                       else
+                               nsize = fs->fs_bsize;
+                       mutex_enter(&ump->um_lock);
+                       error = ffs_alloc(ip, lbn,
+                           ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
+                               &ip->i_ffs1_db[0]),
+                           nsize, flags, cred, &newb);
+                       if (error)
+                               return (error);
+                       if (bpp != NULL) {
+                               error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
+                                   nsize, (flags & B_CLRBUF) != 0, bpp);
+                               if (error)
+                                       return error;
+                       }
+               }
+               ip->i_ffs1_db[lbn] = ufs_rw32((u_int32_t)newb, needswap);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (0);
+       }
+
+       /*
+        * Determine the number of levels of indirection.
+        */
+
+       pref = 0;
+       if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+               return (error);
+
+       /*
+        * Fetch the first indirect block allocating if necessary.
+        */
+
+       --num;
+       nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap);
+       allocib = NULL;
+       allocblk = allociblk;
+       if (nb == 0) {
+               mutex_enter(&ump->um_lock);
+               pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | B_METAONLY, cred, &newb);
+               if (error)
+                       goto fail;
+               nb = newb;
+               *allocblk++ = nb;
+               error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
+                   fs->fs_bsize, true, &bp);
+               if (error)
+                       goto fail;
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(bp)) != 0)
+                       goto fail;
+               unwindidx = 0;
+               allocib = &ip->i_ffs1_ib[indirs[0].in_off];
+               *allocib = ufs_rw32(nb, needswap);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+
+       /*
+        * Fetch through the indirect blocks, allocating as necessary.
+        */
+
+       for (i = 1;;) {
+               error = bread(vp,
+                   indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               bap = (int32_t *)bp->b_data;    /* XXX ondisk32 */
+               nb = ufs_rw32(bap[indirs[i].in_off], needswap);
+               if (i == num)
+                       break;
+               i++;
+               if (nb != 0) {
+                       brelse(bp, 0);
+                       continue;
+               }
+               if (fscow_run(bp, true) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               mutex_enter(&ump->um_lock);
+               /* Try to keep snapshot indirect blocks contiguous. */
+               if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
+                       pref = ffs_blkpref_ufs1(ip, lbn, indirs[i-1].in_off,
+                           flags | B_METAONLY, &bap[0]);
+               if (pref == 0)
+                       pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY,
+                           NULL);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | B_METAONLY, cred, &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
+                   fs->fs_bsize, true, &nbp);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(nbp)) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               if (unwindidx < 0)
+                       unwindidx = i - 1;
+               bap[indirs[i - 1].in_off] = ufs_rw32(nb, needswap);
+
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+       }
+
+       if (flags & B_METAONLY) {
+               KASSERT(bpp != NULL);
+               *bpp = bp;
+               return (0);
+       }
+
+       /*
+        * Get the data block, allocating if necessary.
+        */
+
+       if (nb == 0) {
+               if (fscow_run(bp, true) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               mutex_enter(&ump->um_lock);
+               pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags,
+                   &bap[0]);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
+                   &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               if (bpp != NULL) {
+                       error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+                           fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
+                       if (error) {
+                               brelse(bp, 0);
+                               goto fail;
+                       }
+               }
+               bap[indirs[num].in_off] = ufs_rw32(nb, needswap);
+               if (allocib == NULL && unwindidx < 0) {
+                       unwindidx = i - 1;
+               }
+
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+               return (0);
+       }
+       brelse(bp, 0);
+       if (bpp != NULL) {
+               if (flags & B_CLRBUF) {
+                       error = bread(vp, lbn, (int)fs->fs_bsize,
+                           NOCRED, B_MODIFY, &nbp);
+                       if (error) {
+                               brelse(nbp, 0);
+                               goto fail;
+                       }
+               } else {
+                       error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+                           fs->fs_bsize, true, &nbp);
+                       if (error)
+                               goto fail;
+               }
+               *bpp = nbp;
+       }
+       return (0);
+
+fail:
+       /*
+        * If we have failed part way through block allocation, we
+        * have to deallocate any indirect blocks that we have allocated.
+        */
+
+       if (unwindidx >= 0) {
+
+               /*
+                * First write out any buffers we've created to resolve their
+                * softdeps.  This must be done in reverse order of creation
+                * so that we resolve the dependencies in one pass.
+                * Write the cylinder group buffers for these buffers too.
+                */
+
+               for (i = num; i >= unwindidx; i--) {
+                       if (i == 0) {
+                               break;
+                       }
+                       if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+                           fs->fs_bsize, false, &bp) != 0)
+                               continue;
+                       if (bp->b_oflags & BO_DELWRI) {
+                               nb = fsbtodb(fs, cgtod(fs, dtog(fs,
+                                   dbtofsb(fs, bp->b_blkno))));
+                               bwrite(bp);
+                               if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
+                                   fs->fs_cgsize, false, &bp) != 0)
+                                       continue;
+                               if (bp->b_oflags & BO_DELWRI) {
+                                       bwrite(bp);
+                               } else {
+                                       brelse(bp, BC_INVAL);
+                               }
+                       } else {
+                               brelse(bp, BC_INVAL);
+                       }
+               }
+
+               /*
+                * Undo the partial allocation.
+                */
+               if (unwindidx == 0) {
+                       *allocib = 0;
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               } else {
+                       int r;
+
+                       r = bread(vp, indirs[unwindidx].in_lbn,
+                           (int)fs->fs_bsize, NOCRED, 0, &bp);
+                       if (r) {
+                               panic("Could not unwind indirect block, error %d", r);
+                               brelse(bp, 0);
+                       } else {
+                               bap = (int32_t *)bp->b_data; /* XXX ondisk32 */
+                               bap[indirs[unwindidx].in_off] = 0;
+                               bwrite(bp);
+                       }
+               }
+               for (i = unwindidx + 1; i <= num; i++) {
+                       if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+                           fs->fs_bsize, false, &bp) == 0)
+                               brelse(bp, BC_INVAL);
+               }
+       }
+       for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+               ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
+               deallocated += fs->fs_bsize;
+       }
+       if (deallocated) {
+#if defined(QUOTA) || defined(QUOTA2)
+               /*
+                * Restore user's disk quota because allocation failed.
+                */
+               (void)chkdq(ip, -btodb(deallocated), cred, FORCE);
+#endif
+               ip->i_ffs1_blocks -= btodb(deallocated);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+       return (error);
+}
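+
+/*
+ * Sketch of the indirs[] chain driving the loops above, for a lbn
+ * two levels deep:
+ *
+ *     indirs[0].in_off  slot in ip->i_ffs1_ib[]
+ *     indirs[1].in_lbn  logical block of the first indirect block,
+ *                       .in_off the slot within it
+ *     indirs[2].in_lbn  block holding the data block pointer
+ *
+ * ufs_getlbns() fills the array; after the --num above, num is the
+ * indirection depth, and the fail path walks the chain backwards.
+ */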
+
+static int
+ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
+    int flags, struct buf **bpp)
+{
+       daddr_t lbn, lastlbn;
+       struct buf *bp, *nbp;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct ufsmount *ump = ip->i_ump;
+       struct indir indirs[NIADDR + 2];
+       daddr_t newb, pref, nb;
+       int64_t *bap;
+       int deallocated, osize, nsize, num, i, error;
+       daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
+       int64_t *allocib;
+       int unwindidx = -1;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+       UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);
+
+       lbn = lblkno(fs, off);
+       size = blkoff(fs, off) + size;
+       if (size > fs->fs_bsize)
+               panic("ffs_balloc: blk too big");
+       if (bpp != NULL) {
+               *bpp = NULL;
+       }
+       UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size, 0);
+
+       if (lbn < 0)
+               return (EFBIG);
+
+#ifdef notyet
+       /*
+        * Check for allocating external data.
+        */
+       if (flags & IO_EXT) {
+               if (lbn >= NXADDR)
+                       return (EFBIG);
+               /*
+                * If the next write will extend the data into a new block,
+                * and the data is currently composed of a fragment
+                * this fragment has to be extended to be a full block.
+                */
+               lastlbn = lblkno(fs, dp->di_extsize);
+               if (lastlbn < lbn) {
+                       nb = lastlbn;
+                       osize = sblksize(fs, dp->di_extsize, nb);
+                       if (osize < fs->fs_bsize && osize > 0) {
+                               mutex_enter(&ump->um_lock);
+                               error = ffs_realloccg(ip, -1 - nb,
+                                   dp->di_extb[nb],
+                                   ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
+                                       flags, &dp->di_extb[0]),
+                                   osize,
+                                   (int)fs->fs_bsize, cred, &bp);
+                               if (error)
+                                       return (error);
+                               dp->di_extsize = smalllblktosize(fs, nb + 1);
+                               dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
+                               bp->b_xflags |= BX_ALTDATA;
+                               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                               if (flags & IO_SYNC)
+                                       bwrite(bp);
+                               else
+                                       bawrite(bp);
+                       }
+               }
+               /*
+                * All blocks are direct blocks
+                */
+               if (flags & BA_METAONLY)
+                       panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
+               nb = dp->di_extb[lbn];
+               if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
+                       error = bread(vp, -1 - lbn, fs->fs_bsize,
+                           NOCRED, 0, &bp);
+                       if (error) {
+                               brelse(bp, 0);
+                               return (error);
+                       }
+                       mutex_enter(&bp->b_interlock);
+                       bp->b_blkno = fsbtodb(fs, nb);
+                       bp->b_xflags |= BX_ALTDATA;
+                       mutex_exit(&bp->b_interlock);
+                       *bpp = bp;
+                       return (0);
+               }
+               if (nb != 0) {
+                       /*
+                        * Consider need to reallocate a fragment.
+                        */
+                       osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
+                       nsize = fragroundup(fs, size);
+                       if (nsize <= osize) {
+                               error = bread(vp, -1 - lbn, osize,
+                                   NOCRED, 0, &bp);
+                               if (error) {
+                                       brelse(bp, 0);
+                                       return (error);
+                               }
+                               mutex_enter(&bp->b_interlock);
+                               bp->b_blkno = fsbtodb(fs, nb);
+                               bp->b_xflags |= BX_ALTDATA;
+                               mutex_exit(&bp->b_interlock);
+                       } else {
+                               mutex_enter(&ump->um_lock);
+                               error = ffs_realloccg(ip, -1 - lbn,
+                                   dp->di_extb[lbn],
+                                   ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+                                       &dp->di_extb[0]),
+                                   osize, nsize, cred, &bp);
+                               if (error)
+                                       return (error);
+                               bp->b_xflags |= BX_ALTDATA;
+                       }
+               } else {
+                       if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
+                               nsize = fragroundup(fs, size);
+                       else
+                               nsize = fs->fs_bsize;
+                       mutex_enter(&ump->um_lock);
+                       error = ffs_alloc(ip, lbn,
+                          ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+                              &dp->di_extb[0]),
+                          nsize, flags, cred, &newb);
+                       if (error)
+                               return (error);
+                       error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb),
+                           nsize, (flags & BA_CLRBUF) != 0, &bp);
+                       if (error)
+                               return error;
+                       bp->b_xflags |= BX_ALTDATA;
+               }
+               dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               *bpp = bp;
+               return (0);
+       }
+#endif
+       /*
+        * If the next write will extend the file into a new block,
+        * and the file is currently composed of a fragment
+        * this fragment has to be extended to be a full block.
+        */
+
+       lastlbn = lblkno(fs, ip->i_size);
+       if (lastlbn < NDADDR && lastlbn < lbn) {
+               nb = lastlbn;
+               osize = blksize(fs, ip, nb);
+               if (osize < fs->fs_bsize && osize > 0) {
+                       mutex_enter(&ump->um_lock);
+                       error = ffs_realloccg(ip, nb,
+                                   ffs_blkpref_ufs2(ip, lastlbn, nb, flags,
+                                       &ip->i_ffs2_db[0]),
+                                   osize, (int)fs->fs_bsize, cred, bpp, &newb);
+                       if (error)
+                               return (error);
+                       ip->i_size = lblktosize(fs, nb + 1);
+                       ip->i_ffs2_size = ip->i_size;
+                       uvm_vnp_setsize(vp, ip->i_size);
+                       ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap);
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (bpp) {
+                               if (flags & B_SYNC)
+                                       bwrite(*bpp);
+                               else
+                                       bawrite(*bpp);
+                       }
+               }
+       }
+
+       /*
+        * The first NDADDR blocks are direct blocks
+        */
+
+       if (lbn < NDADDR) {
+               nb = ufs_rw64(ip->i_ffs2_db[lbn], needswap);
+               if (nb != 0 && ip->i_size >= lblktosize(fs, lbn + 1)) {
+
+                       /*
+                        * The block is an already-allocated direct block
+                        * and the file already extends past this block,
+                        * thus this must be a whole block.
+                        * Just read the block (if requested).
+                        */
+
+                       if (bpp != NULL) {
+                               error = bread(vp, lbn, fs->fs_bsize, NOCRED,
+                                             B_MODIFY, bpp);
+                               if (error) {
+                                       brelse(*bpp, 0);
+                                       return (error);
+                               }
+                       }
+                       return (0);
+               }
+               if (nb != 0) {
+
+                       /*
+                        * Consider need to reallocate a fragment.
+                        */
+
+                       osize = fragroundup(fs, blkoff(fs, ip->i_size));
+                       nsize = fragroundup(fs, size);
+                       if (nsize <= osize) {
+
+                               /*
+                                * The existing block is already
+                                * at least as big as we want.
+                                * Just read the block (if requested).
+                                */
+
+                               if (bpp != NULL) {
+                                       error = bread(vp, lbn, osize, NOCRED,
+                                                     B_MODIFY, bpp);
+                                       if (error) {
+                                               brelse(*bpp, 0);
+                                               return (error);
+                                       }
+                               }
+                               return 0;
+                       } else {
+
+                               /*
+                                * The existing block is smaller than we want,
+                                * grow it.
+                                */
+                               mutex_enter(&ump->um_lock);
+                               error = ffs_realloccg(ip, lbn,
+                                   ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+                                       &ip->i_ffs2_db[0]),
+                                   osize, nsize, cred, bpp, &newb);
+                               if (error)
+                                       return (error);
+                       }
+               } else {
+
+                       /*
+                        * the block was not previously allocated,
+                        * allocate a new block or fragment.
+                        */
+
+                       if (ip->i_size < lblktosize(fs, lbn + 1))
+                               nsize = fragroundup(fs, size);
+                       else
+                               nsize = fs->fs_bsize;
+                       mutex_enter(&ump->um_lock);
+                       error = ffs_alloc(ip, lbn,
+                           ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+                               &ip->i_ffs2_db[0]),
+                           nsize, flags, cred, &newb);
+                       if (error)
+                               return (error);
+                       if (bpp != NULL) {
+                               error = ffs_getblk(vp, lbn, fsbtodb(fs, newb),
+                                   nsize, (flags & B_CLRBUF) != 0, bpp);
+                               if (error)
+                                       return error;
+                       }
+               }
+               ip->i_ffs2_db[lbn] = ufs_rw64(newb, needswap);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (0);
+       }
+
+       /*
+        * Determine the number of levels of indirection.
+        */
+
+       pref = 0;
+       if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+               return (error);
+
+       /*
+        * Fetch the first indirect block allocating if necessary.
+        */
+
+       --num;
+       nb = ufs_rw64(ip->i_ffs2_ib[indirs[0].in_off], needswap);
+       allocib = NULL;
+       allocblk = allociblk;
+       if (nb == 0) {
+               mutex_enter(&ump->um_lock);
+               pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | B_METAONLY, cred, &newb);
+               if (error)
+                       goto fail;
+               nb = newb;
+               *allocblk++ = nb;
+               error = ffs_getblk(vp, indirs[1].in_lbn, fsbtodb(fs, nb),
+                   fs->fs_bsize, true, &bp);
+               if (error)
+                       goto fail;
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(bp)) != 0)
+                       goto fail;
+               unwindidx = 0;
+               allocib = &ip->i_ffs2_ib[indirs[0].in_off];
+               *allocib = ufs_rw64(nb, needswap);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+
+       /*
+        * Fetch through the indirect blocks, allocating as necessary.
+        */
+
+       for (i = 1;;) {
+               error = bread(vp,
+                   indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               bap = (int64_t *)bp->b_data;
+               nb = ufs_rw64(bap[indirs[i].in_off], needswap);
+               if (i == num)
+                       break;
+               i++;
+               if (nb != 0) {
+                       brelse(bp, 0);
+                       continue;
+               }
+               if (fscow_run(bp, true) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               mutex_enter(&ump->um_lock);
+               /* Try to keep snapshot indirect blocks contiguous. */
+               if (i == num && (ip->i_flags & SF_SNAPSHOT) != 0)
+                       pref = ffs_blkpref_ufs2(ip, lbn, indirs[i-1].in_off,
+                           flags | B_METAONLY, &bap[0]);
+               if (pref == 0)
+                       pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY,
+                           NULL);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+                   flags | B_METAONLY, cred, &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               error = ffs_getblk(vp, indirs[i].in_lbn, fsbtodb(fs, nb),
+                   fs->fs_bsize, true, &nbp);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               /*
+                * Write synchronously so that indirect blocks
+                * never point at garbage.
+                */
+               if ((error = bwrite(nbp)) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               if (unwindidx < 0)
+                       unwindidx = i - 1;
+               bap[indirs[i - 1].in_off] = ufs_rw64(nb, needswap);
+
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+       }
+
+       if (flags & B_METAONLY) {
+               KASSERT(bpp != NULL);
+               *bpp = bp;
+               return (0);
+       }
+
+       /*
+        * Get the data block, allocating if necessary.
+        */
+
+       if (nb == 0) {
+               if (fscow_run(bp, true) != 0) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               mutex_enter(&ump->um_lock);
+               pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags,
+                   &bap[0]);
+               error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
+                   &newb);
+               if (error) {
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               nb = newb;
+               *allocblk++ = nb;
+               if (bpp != NULL) {
+                       error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+                           fs->fs_bsize, (flags & B_CLRBUF) != 0, bpp);
+                       if (error) {
+                               brelse(bp, 0);
+                               goto fail;
+                       }
+               }
+               bap[indirs[num].in_off] = ufs_rw64(nb, needswap);
+               if (allocib == NULL && unwindidx < 0) {
+                       unwindidx = i - 1;
+               }
+
+               /*
+                * If required, write synchronously, otherwise use
+                * delayed write.
+                */
+
+               if (flags & B_SYNC) {
+                       bwrite(bp);
+               } else {
+                       bdwrite(bp);
+               }
+               return (0);
+       }
+       brelse(bp, 0);
+       if (bpp != NULL) {
+               if (flags & B_CLRBUF) {
+                       error = bread(vp, lbn, (int)fs->fs_bsize,
+                           NOCRED, B_MODIFY, &nbp);
+                       if (error) {
+                               brelse(nbp, 0);
+                               goto fail;
+                       }
+               } else {
+                       error = ffs_getblk(vp, lbn, fsbtodb(fs, nb),
+                           fs->fs_bsize, true, &nbp);
+                       if (error)
+                               goto fail;
+               }
+               *bpp = nbp;
+       }
+       return (0);
+
+fail:
+       /*
+        * If we have failed part way through block allocation, we
+        * have to deallocate any indirect blocks that we have allocated.
+        */
+
+       if (unwindidx >= 0) {
+
+               /*
+                * First write out any buffers we've created to resolve their
+                * softdeps.  This must be done in reverse order of creation
+                * so that we resolve the dependencies in one pass.
+                * Write the cylinder group buffers for these buffers too.
+                */
+
+               for (i = num; i >= unwindidx; i--) {
+                       if (i == 0) {
+                               break;
+                       }
+                       if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+                           fs->fs_bsize, false, &bp) != 0)
+                               continue;
+                       if (bp->b_oflags & BO_DELWRI) {
+                               nb = fsbtodb(fs, cgtod(fs, dtog(fs,
+                                   dbtofsb(fs, bp->b_blkno))));
+                               bwrite(bp);
+                               if (ffs_getblk(ip->i_devvp, nb, FFS_NOBLK,
+                                   fs->fs_cgsize, false, &bp) != 0)
+                                       continue;
+                               if (bp->b_oflags & BO_DELWRI) {
+                                       bwrite(bp);
+                               } else {
+                                       brelse(bp, BC_INVAL);
+                               }
+                       } else {
+                               brelse(bp, BC_INVAL);
+                       }
+               }
+
+               /*
+                * Now that any dependencies that we created have been
+                * resolved, we can undo the partial allocation.
+                */
+
+               if (unwindidx == 0) {
+                       *allocib = 0;
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               } else {
+                       int r;
+
+                       r = bread(vp, indirs[unwindidx].in_lbn,
+                           (int)fs->fs_bsize, NOCRED, 0, &bp);
+                       if (r) {
+                               brelse(bp, 0);
+                               panic("Could not unwind indirect block, error %d", r);
+                       } else {
+                               bap = (int64_t *)bp->b_data;
+                               bap[indirs[unwindidx].in_off] = 0;
+                               bwrite(bp);
+                       }
+               }
+               for (i = unwindidx + 1; i <= num; i++) {
+                       if (ffs_getblk(vp, indirs[i].in_lbn, FFS_NOBLK,
+                           fs->fs_bsize, false, &bp) == 0)
+                               brelse(bp, BC_INVAL);
+               }
+       }
+       for (deallocated = 0, blkp = allociblk; blkp < allocblk; blkp++) {
+               ffs_blkfree(fs, ip->i_devvp, *blkp, fs->fs_bsize, ip->i_number);
+               deallocated += fs->fs_bsize;
+       }
+       if (deallocated) {
+#if defined(QUOTA) || defined(QUOTA2)
+               /*
+                * Restore user's disk quota because allocation failed.
+                */
+               (void)chkdq(ip, -btodb(deallocated), cred, FORCE);
+#endif
+               ip->i_ffs2_blocks -= btodb(deallocated);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       }
+
+       return (error);
+}
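
The tail of ffs_balloc_ufs2() above handles the direct blocks first, then walks up
to three levels of indirect blocks, allocating any that are missing and unwinding
the partial allocation on failure. As a reading aid only (it is not part of the
import), the standalone sketch below reproduces the indirection-level arithmetic
that ufs_getlbns() supplies to this routine; NDADDR and NINDIR are illustrative
constants, not values read from a real superblock.

    #include <stdio.h>

    #define NDADDR  12              /* direct blocks per inode */
    #define NINDIR  1024            /* 64-bit pointers per 8 KB indirect block */

    /* Number of indirect levels needed to reach logical block lbn. */
    static int
    indir_levels(long lbn)
    {
            long span = NINDIR;     /* blocks reachable at the next level */
            int level;

            if (lbn < NDADDR)
                    return 0;       /* direct block, resolved before ufs_getlbns */
            lbn -= NDADDR;
            for (level = 1; level <= 3; level++) {
                    if (lbn < span)
                            return level;
                    lbn -= span;
                    span *= NINDIR;
            }
            return -1;              /* beyond the triple indirect tree: EFBIG */
    }

    int
    main(void)
    {
            long lbns[] = { 0, 11, 12, 1035, 1036, 1049612 };
            size_t i;

            for (i = 0; i < sizeof(lbns) / sizeof(lbns[0]); i++)
                    printf("lbn %ld -> %d indirect level(s)\n",
                        lbns[i], indir_levels(lbns[i]));
            return 0;
    }

With these constants, lbn 12 is the first block reached through the single
indirect tree and lbn 1049612 the first reached through the triple indirect tree.
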
diff --git a/sys/ufs/ffs/ffs_bswap.c b/sys/ufs/ffs/ffs_bswap.c
new file mode 100644 (file)
index 0000000..ddac30d
--- /dev/null
+++ b/sys/ufs/ffs/ffs_bswap.c
@@ -0,0 +1,271 @@
+/*     $NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $    */
+
+/*
+ * Copyright (c) 1998 Manuel Bouyer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_bswap.c,v 1.35 2011/03/06 17:08:38 bouyer Exp $");
+
+#include <sys/param.h>
+#if defined(_KERNEL)
+#include <sys/systm.h>
+#endif
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#if !defined(_KERNEL)
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
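+/* Userland tool builds have no kernel panic(); emulate it with printf/abort. */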
+#define panic(x)       printf("%s\n", (x)), abort()
+#endif
+
+void
+ffs_sb_swap(struct fs *o, struct fs *n)
+{
+       size_t i;
+       u_int32_t *o32, *n32;
+
+       /*
+        * The first 52 fields of the superblock, up to fs_fmod, are all
+        * u_int32_t, so rather than converting them one at a time we swap
+        * them in a single loop.
+        */
+       o32 = (u_int32_t *)o;
+       n32 = (u_int32_t *)n;
+       for (i = 0; i < offsetof(struct fs, fs_fmod) / sizeof(u_int32_t); i++)
+               n32[i] = bswap32(o32[i]);
+
+       n->fs_swuid = bswap64(o->fs_swuid);
+       n->fs_cgrotor = bswap32(o->fs_cgrotor); /* Unused */
+       n->fs_old_cpc = bswap32(o->fs_old_cpc);
+
+       /* These fields overlap with a possible location for the
+        * historic FS_DYNAMICPOSTBLFMT postbl table, and with the
+        * first half of the historic FS_42POSTBLFMT postbl table.
+        */
+       n->fs_maxbsize = bswap32(o->fs_maxbsize);
+       /* XXX journal */
+       n->fs_quota_magic = bswap32(o->fs_quota_magic);
+       for (i = 0; i < MAXQUOTAS; i++)
+               n->fs_quotafile[i] = bswap64(o->fs_quotafile[i]);
+       n->fs_sblockloc = bswap64(o->fs_sblockloc);
+       ffs_csumtotal_swap(&o->fs_cstotal, &n->fs_cstotal);
+       n->fs_time = bswap64(o->fs_time);
+       n->fs_size = bswap64(o->fs_size);
+       n->fs_dsize = bswap64(o->fs_dsize);
+       n->fs_csaddr = bswap64(o->fs_csaddr);
+       n->fs_pendingblocks = bswap64(o->fs_pendingblocks);
+       n->fs_pendinginodes = bswap32(o->fs_pendinginodes);
+
+       /* These fields overlap with the second half of the
+        * historic FS_42POSTBLFMT postbl table
+        */
+       for (i = 0; i < FSMAXSNAP; i++)
+               n->fs_snapinum[i] = bswap32(o->fs_snapinum[i]);
+       n->fs_avgfilesize = bswap32(o->fs_avgfilesize);
+       n->fs_avgfpdir = bswap32(o->fs_avgfpdir);
+       /* fs_sparecon[28] - ignore for now */
+       n->fs_flags = bswap32(o->fs_flags);
+       n->fs_contigsumsize = bswap32(o->fs_contigsumsize);
+       n->fs_maxsymlinklen = bswap32(o->fs_maxsymlinklen);
+       n->fs_old_inodefmt = bswap32(o->fs_old_inodefmt);
+       n->fs_maxfilesize = bswap64(o->fs_maxfilesize);
+       n->fs_qbmask = bswap64(o->fs_qbmask);
+       n->fs_qfmask = bswap64(o->fs_qfmask);
+       n->fs_state = bswap32(o->fs_state);
+       n->fs_old_postblformat = bswap32(o->fs_old_postblformat);
+       n->fs_old_nrpos = bswap32(o->fs_old_nrpos);
+       n->fs_old_postbloff = bswap32(o->fs_old_postbloff);
+       n->fs_old_rotbloff = bswap32(o->fs_old_rotbloff);
+
+       n->fs_magic = bswap32(o->fs_magic);
+}
+
+void
+ffs_dinode1_swap(struct ufs1_dinode *o, struct ufs1_dinode *n)
+{
+
+       n->di_mode = bswap16(o->di_mode);
+       n->di_nlink = bswap16(o->di_nlink);
+       n->di_u.oldids[0] = bswap16(o->di_u.oldids[0]);
+       n->di_u.oldids[1] = bswap16(o->di_u.oldids[1]);
+       n->di_size = bswap64(o->di_size);
+       n->di_atime = bswap32(o->di_atime);
+       n->di_atimensec = bswap32(o->di_atimensec);
+       n->di_mtime = bswap32(o->di_mtime);
+       n->di_mtimensec = bswap32(o->di_mtimensec);
+       n->di_ctime = bswap32(o->di_ctime);
+       n->di_ctimensec = bswap32(o->di_ctimensec);
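+       /*
+        * The block pointer arrays are deliberately copied unswapped; the
+        * file system code byte-swaps each pointer at the point of use via
+        * ufs_rw32()/ufs_rw64().
+        */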
+       memcpy(n->di_db, o->di_db, (NDADDR + NIADDR) * sizeof(u_int32_t));
+       n->di_flags = bswap32(o->di_flags);
+       n->di_blocks = bswap32(o->di_blocks);
+       n->di_gen = bswap32(o->di_gen);
+       n->di_uid = bswap32(o->di_uid);
+       n->di_gid = bswap32(o->di_gid);
+}
+
+void
+ffs_dinode2_swap(struct ufs2_dinode *o, struct ufs2_dinode *n)
+{
+       n->di_mode = bswap16(o->di_mode);
+       n->di_nlink = bswap16(o->di_nlink);
+       n->di_uid = bswap32(o->di_uid);
+       n->di_gid = bswap32(o->di_gid);
+       n->di_blksize = bswap32(o->di_blksize);
+       n->di_size = bswap64(o->di_size);
+       n->di_blocks = bswap64(o->di_blocks);
+       n->di_atime = bswap64(o->di_atime);
+       n->di_atimensec = bswap32(o->di_atimensec);
+       n->di_mtime = bswap64(o->di_mtime);
+       n->di_mtimensec = bswap32(o->di_mtimensec);
+       n->di_ctime = bswap64(o->di_ctime);
+       n->di_ctimensec = bswap32(o->di_ctimensec);
+       n->di_birthtime = bswap64(o->di_birthtime);
+       n->di_birthnsec = bswap32(o->di_birthnsec);
+       n->di_gen = bswap32(o->di_gen);
+       n->di_kernflags = bswap32(o->di_kernflags);
+       n->di_flags = bswap32(o->di_flags);
+       n->di_extsize = bswap32(o->di_extsize);
+       memcpy(n->di_extb, o->di_extb, (NXADDR + NDADDR + NIADDR) * 8);
+}
+
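+/*
+ * Byte-swap a cylinder group summary array of `size' bytes, treated as
+ * packed 32-bit counters.
+ */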
+void
+ffs_csum_swap(struct csum *o, struct csum *n, int size)
+{
+       size_t i;
+       u_int32_t *oint, *nint;
+
+       oint = (u_int32_t*)o;
+       nint = (u_int32_t*)n;
+
+       for (i = 0; i < size / sizeof(u_int32_t); i++)
+               nint[i] = bswap32(oint[i]);
+}
+
+void
+ffs_csumtotal_swap(struct csum_total *o, struct csum_total *n)
+{
+       n->cs_ndir = bswap64(o->cs_ndir);
+       n->cs_nbfree = bswap64(o->cs_nbfree);
+       n->cs_nifree = bswap64(o->cs_nifree);
+       n->cs_nffree = bswap64(o->cs_nffree);
+}
+
+/*
+ * Note that ffs_cg_swap may be called with o == n.
+ */
+void
+ffs_cg_swap(struct cg *o, struct cg *n, struct fs *fs)
+{
+       int i;
+       u_int32_t *n32, *o32;
+       u_int16_t *n16, *o16;
+       int32_t btotoff, boff, clustersumoff;
+
+       n->cg_firstfield = bswap32(o->cg_firstfield);
+       n->cg_magic = bswap32(o->cg_magic);
+       n->cg_old_time = bswap32(o->cg_old_time);
+       n->cg_cgx = bswap32(o->cg_cgx);
+       n->cg_old_ncyl = bswap16(o->cg_old_ncyl);
+       n->cg_old_niblk = bswap16(o->cg_old_niblk);
+       n->cg_ndblk = bswap32(o->cg_ndblk);
+       n->cg_cs.cs_ndir = bswap32(o->cg_cs.cs_ndir);
+       n->cg_cs.cs_nbfree = bswap32(o->cg_cs.cs_nbfree);
+       n->cg_cs.cs_nifree = bswap32(o->cg_cs.cs_nifree);
+       n->cg_cs.cs_nffree = bswap32(o->cg_cs.cs_nffree);
+       n->cg_rotor = bswap32(o->cg_rotor);
+       n->cg_frotor = bswap32(o->cg_frotor);
+       n->cg_irotor = bswap32(o->cg_irotor);
+       for (i = 0; i < MAXFRAG; i++)
+               n->cg_frsum[i] = bswap32(o->cg_frsum[i]);
+
+       if ((fs->fs_magic != FS_UFS2_MAGIC) &&
+                       (fs->fs_old_postblformat == FS_42POSTBLFMT)) { /* old format */
+               struct ocg *on, *oo;
+               int j;
+               on = (struct ocg *)n;
+               oo = (struct ocg *)o;
+
+               for (i = 0; i < 32; i++) {
+                       on->cg_btot[i] = bswap32(oo->cg_btot[i]);
+                       for (j = 0; j < 8; j++)
+                               on->cg_b[i][j] = bswap16(oo->cg_b[i][j]);
+               }
+               memmove(on->cg_iused, oo->cg_iused, 256);
+               on->cg_magic = bswap32(oo->cg_magic);
+       } else {  /* new format */
+
+               n->cg_old_btotoff = bswap32(o->cg_old_btotoff);
+               n->cg_old_boff = bswap32(o->cg_old_boff);
+               n->cg_iusedoff = bswap32(o->cg_iusedoff);
+               n->cg_freeoff = bswap32(o->cg_freeoff);
+               n->cg_nextfreeoff = bswap32(o->cg_nextfreeoff);
+               n->cg_clustersumoff = bswap32(o->cg_clustersumoff);
+               n->cg_clusteroff = bswap32(o->cg_clusteroff);
+               n->cg_nclusterblks = bswap32(o->cg_nclusterblks);
+               n->cg_niblk = bswap32(o->cg_niblk);
+               n->cg_initediblk = bswap32(o->cg_initediblk);
+               n->cg_time = bswap64(o->cg_time);
+
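+               /*
+                * If the swapped magic is valid, n now holds native-endian
+                * values and the offsets may be used directly; otherwise we
+                * are converting to foreign byte order and must swap the
+                * offsets back before using them to address the arrays below.
+                */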
+               if (n->cg_magic == CG_MAGIC) {
+                       btotoff = n->cg_old_btotoff;
+                       boff = n->cg_old_boff;
+                       clustersumoff = n->cg_clustersumoff;
+               } else {
+                       btotoff = bswap32(n->cg_old_btotoff);
+                       boff = bswap32(n->cg_old_boff);
+                       clustersumoff = bswap32(n->cg_clustersumoff);
+               }
+
+               n32 = (u_int32_t *)((u_int8_t *)n + clustersumoff);
+               o32 = (u_int32_t *)((u_int8_t *)o + clustersumoff);
+               for (i = 1; i < fs->fs_contigsumsize + 1; i++)
+                       n32[i] = bswap32(o32[i]);
+
+               if (fs->fs_magic == FS_UFS2_MAGIC)
+                       return;
+
+               n32 = (u_int32_t *)((u_int8_t *)n + btotoff);
+               o32 = (u_int32_t *)((u_int8_t *)o + btotoff);
+               n16 = (u_int16_t *)((u_int8_t *)n + boff);
+               o16 = (u_int16_t *)((u_int8_t *)o + boff);
+
+               for (i = 0; i < fs->fs_old_cpg; i++)
+                       n32[i] = bswap32(o32[i]);
+
+               for (i = 0; i < fs->fs_old_cpg * fs->fs_old_nrpos; i++)
+                       n16[i] = bswap16(o16[i]);
+       }
+}
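
Every conversion in ffs_bswap.c above reduces to the same primitive: a fixed-width
byte reversal that is its own inverse, which is also why ffs_cg_swap() can safely
be called with o == n. The standalone illustration below is an assumed portable
stand-in, not the kernel's own bswap32(); the magic number is FS_UFS2_MAGIC.

    #include <stdint.h>
    #include <stdio.h>

    /* Portable stand-in for the kernel's bswap32(). */
    static uint32_t
    bswap32(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0x0000ff00) |
                   ((x << 8) & 0x00ff0000) | (x << 24);
    }

    int
    main(void)
    {
            uint32_t magic = 0x19540119;    /* FS_UFS2_MAGIC */

            printf("native  %#010x\n", magic);
            printf("swapped %#010x\n", bswap32(magic));
            /* Swapping twice restores the original value. */
            printf("twice   %#010x\n", bswap32(bswap32(magic)));
            return 0;
    }
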
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
new file mode 100644 (file)
index 0000000..0f6edcb
--- /dev/null
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -0,0 +1,725 @@
+/*     $NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $   */
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.108 2011/11/23 19:42:10 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int,
+                         int64_t *);
+
+/*
+ * Update the access, modified, and inode change times as specified
+ * by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.
+ * The IN_MODIFIED flag is used to specify that the inode needs to be
+ * updated but that the times have already been set. The access
+ * and modified times are taken from the second and third parameters;
+ * the inode change time is always taken from the current time. If
+ * the UPDATE_WAIT or UPDATE_DIROP flag is set, wait for the disk
+ * write of the inode to complete.
+ */
+
+int
+ffs_update(struct vnode *vp, const struct timespec *acc,
+    const struct timespec *mod, int updflags)
+{
+       struct fs *fs;
+       struct buf *bp;
+       struct inode *ip;
+       int error;
+       void *cp;
+       int waitfor, flags;
+
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+               return (0);
+       ip = VTOI(vp);
+       FFS_ITIMES(ip, acc, mod, NULL);
+       if (updflags & UPDATE_CLOSE)
+               flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
+       else
+               flags = ip->i_flag & IN_MODIFIED;
+       if (flags == 0)
+               return (0);
+       fs = ip->i_fs;
+
+       if ((flags & IN_MODIFIED) != 0 &&
+           (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
+               waitfor = updflags & UPDATE_WAIT;
+               if ((updflags & UPDATE_DIROP) != 0)
+                       waitfor |= UPDATE_WAIT;
+       } else
+               waitfor = 0;
+
+       /*
+        * Ensure that uid and gid are correct. This is a temporary
+        * fix until fsck has been changed to do the update.
+        */
+       if (fs->fs_magic == FS_UFS1_MAGIC &&                    /* XXX */
+           fs->fs_old_inodefmt < FS_44INODEFMT) {              /* XXX */
+               ip->i_ffs1_ouid = ip->i_uid;    /* XXX */
+               ip->i_ffs1_ogid = ip->i_gid;    /* XXX */
+       }                                                       /* XXX */
+       error = bread(ip->i_devvp,
+                     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+                     (int)fs->fs_bsize, NOCRED, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
+       /* Keep unlinked inode list up to date */
+       KDASSERT(DIP(ip, nlink) == ip->i_nlink);
+       if (ip->i_mode) {
+               if (ip->i_nlink > 0) {
+                       UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
+                           ip->i_number, ip->i_mode);
+               } else {
+                       UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
+                           ip->i_number, ip->i_mode);
+               }
+       }
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+               cp = (char *)bp->b_data +
+                   (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
+#ifdef FFS_EI
+               if (UFS_FSNEEDSWAP(fs))
+                       ffs_dinode1_swap(ip->i_din.ffs1_din,
+                           (struct ufs1_dinode *)cp);
+               else
+#endif
+                       memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE);
+       } else {
+               cp = (char *)bp->b_data +
+                   (ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE);
+#ifdef FFS_EI
+               if (UFS_FSNEEDSWAP(fs))
+                       ffs_dinode2_swap(ip->i_din.ffs2_din,
+                           (struct ufs2_dinode *)cp);
+               else
+#endif
+                       memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE);
+       }
+       if (waitfor) {
+               return (bwrite(bp));
+       } else {
+               bdwrite(bp);
+               return (0);
+       }
+}
+
+#define        SINGLE  0       /* index of single indirect block */
+#define        DOUBLE  1       /* index of double indirect block */
+#define        TRIPLE  2       /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+int
+ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
+{
+       daddr_t lastblock;
+       struct inode *oip = VTOI(ovp);
+       daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
+       daddr_t blks[NDADDR + NIADDR];
+       struct fs *fs;
+       int offset, pgoffset, level;
+       int64_t count, blocksreleased = 0;
+       int i, aflag, nblocks;
+       int error, allerror = 0;
+       off_t osize;
+       int sync;
+       struct ufsmount *ump = oip->i_ump;
+
+       if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+           ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+               KASSERT(oip->i_size == 0);
+               return 0;
+       }
+
+       if (length < 0)
+               return (EINVAL);
+
+       if (ovp->v_type == VLNK &&
+           (oip->i_size < ump->um_maxsymlinklen ||
+            (ump->um_maxsymlinklen == 0 && DIP(oip, blocks) == 0))) {
+               KDASSERT(length == 0);
+               memset(SHORTLINK(oip), 0, (size_t)oip->i_size);
+               oip->i_size = 0;
+               DIP_ASSIGN(oip, size, 0);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (ffs_update(ovp, NULL, NULL, 0));
+       }
+       if (oip->i_size == length) {
+               /* still do a uvm_vnp_setsize() as writesize may be larger */
+               uvm_vnp_setsize(ovp, length);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (ffs_update(ovp, NULL, NULL, 0));
+       }
+       fs = oip->i_fs;
+       if (length > ump->um_maxfilesize)
+               return (EFBIG);
+
+       if ((oip->i_flags & SF_SNAPSHOT) != 0)
+               ffs_snapremove(ovp);
+
+       osize = oip->i_size;
+       aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+
+       /*
+        * Lengthen the size of the file. We must ensure that the
+        * last byte of the file is allocated. Since the smallest
+        * value of osize is 0, length will be at least 1.
+        */
+
+       if (osize < length) {
+               if (lblkno(fs, osize) < NDADDR &&
+                   lblkno(fs, osize) != lblkno(fs, length) &&
+                   blkroundup(fs, osize) != osize) {
+                       off_t eob;
+
+                       eob = blkroundup(fs, osize);
+                       uvm_vnp_setwritesize(ovp, eob);
+                       error = ufs_balloc_range(ovp, osize, eob - osize,
+                           cred, aflag);
+                       if (error) {
+                               (void) ffs_truncate(ovp, osize,
+                                   ioflag & IO_SYNC, cred);
+                               return error;
+                       }
+                       if (ioflag & IO_SYNC) {
+                               mutex_enter(ovp->v_interlock);
+                               VOP_PUTPAGES(ovp,
+                                   trunc_page(osize & fs->fs_bmask),
+                                   round_page(eob), PGO_CLEANIT | PGO_SYNCIO |
+                                   PGO_JOURNALLOCKED);
+                       }
+               }
+               uvm_vnp_setwritesize(ovp, length);
+               error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag);
+               if (error) {
+                       (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred);
+                       return (error);
+               }
+               uvm_vnp_setsize(ovp, length);
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               KASSERT(ovp->v_size == oip->i_size);
+               return (ffs_update(ovp, NULL, NULL, 0));
+       }
+
+       /*
+        * When truncating a regular file down to a non-block-aligned size,
+        * we must zero the part of last block which is past the new EOF.
+        * We must synchronously flush the zeroed pages to disk
+        * since the new pages will be invalidated as soon as we
+        * inform the VM system of the new, smaller size.
+        * We must do this before acquiring the GLOCK, since fetching
+        * the pages will acquire the GLOCK internally.
+        * So there is a window where another thread could see a whole
+        * zeroed page past EOF, but that's life.
+        */
+
+       offset = blkoff(fs, length);
+       pgoffset = length & PAGE_MASK;
+       if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) &&
+           osize > length) {
+               daddr_t lbn;
+               voff_t eoz;
+               int size;
+
+               if (offset != 0) {
+                       error = ufs_balloc_range(ovp, length - 1, 1, cred,
+                           aflag);
+                       if (error)
+                               return error;
+               }
+               lbn = lblkno(fs, length);
+               size = blksize(fs, oip, lbn);
+               eoz = MIN(MAX(lblktosize(fs, lbn) + size, round_page(pgoffset)),
+                   osize);
+               ubc_zerorange(&ovp->v_uobj, length, eoz - length,
+                   UBC_UNMAP_FLAG(ovp));
+               if (round_page(eoz) > round_page(length)) {
+                       mutex_enter(ovp->v_interlock);
+                       error = VOP_PUTPAGES(ovp, round_page(length),
+                           round_page(eoz),
+                           PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED |
+                           ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
+                       if (error)
+                               return error;
+               }
+       }
+
+       genfs_node_wrlock(ovp);
+       oip->i_size = length;
+       DIP_ASSIGN(oip, size, length);
+       uvm_vnp_setsize(ovp, length);
+       /*
+        * Calculate index into inode's block list of
+        * last direct and indirect blocks (if any)
+        * which we want to keep.  Lastblock is -1 when
+        * the file is truncated to 0.
+        */
+       lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
+       lastiblock[SINGLE] = lastblock - NDADDR;
+       lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+       lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
+       nblocks = btodb(fs->fs_bsize);
+       /*
+        * Update file and block pointers on disk before we start freeing
+        * blocks.  If we crash before free'ing blocks below, the blocks
+        * will be returned to the free list.  lastiblock values are also
+        * normalized to -1 for calls to ffs_indirtrunc below.
+        */
+       sync = 0;
+       for (level = TRIPLE; level >= SINGLE; level--) {
+               blks[NDADDR + level] = DIP(oip, ib[level]);
+               if (lastiblock[level] < 0 && blks[NDADDR + level] != 0) {
+                       sync = 1;
+                       DIP_ASSIGN(oip, ib[level], 0);
+                       lastiblock[level] = -1;
+               }
+       }
+       for (i = 0; i < NDADDR; i++) {
+               blks[i] = DIP(oip, db[i]);
+               if (i > lastblock && blks[i] != 0) {
+                       sync = 1;
+                       DIP_ASSIGN(oip, db[i], 0);
+               }
+       }
+       oip->i_flag |= IN_CHANGE | IN_UPDATE;
+       if (sync) {
+               error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT);
+               if (error && !allerror)
+                       allerror = error;
+       }
+
+       /*
+        * Having written the new inode to disk, save its new configuration
+        * and put back the old block pointers long enough to process them.
+        * Note that we save the new block configuration so we can check it
+        * when we are done.
+        */
+       for (i = 0; i < NDADDR; i++) {
+               bn = DIP(oip, db[i]);
+               DIP_ASSIGN(oip, db[i], blks[i]);
+               blks[i] = bn;
+       }
+       for (i = 0; i < NIADDR; i++) {
+               bn = DIP(oip, ib[i]);
+               DIP_ASSIGN(oip, ib[i], blks[NDADDR + i]);
+               blks[NDADDR + i] = bn;
+       }
+
+       oip->i_size = osize;
+       DIP_ASSIGN(oip, size, osize);
+       error = vtruncbuf(ovp, lastblock + 1, 0, 0);
+       if (error && !allerror)
+               allerror = error;
+
+       /*
+        * Indirect blocks first.
+        */
+       indir_lbn[SINGLE] = -NDADDR;
+       indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
+       indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
+       for (level = TRIPLE; level >= SINGLE; level--) {
+               if (oip->i_ump->um_fstype == UFS1)
+                       bn = ufs_rw32(oip->i_ffs1_ib[level], UFS_FSNEEDSWAP(fs));
+               else
+                       bn = ufs_rw64(oip->i_ffs2_ib[level], UFS_FSNEEDSWAP(fs));
+               if (bn != 0) {
+                       error = ffs_indirtrunc(oip, indir_lbn[level],
+                           fsbtodb(fs, bn), lastiblock[level], level, &count);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += count;
+                       if (lastiblock[level] < 0) {
+                               DIP_ASSIGN(oip, ib[level], 0);
+                               if (oip->i_ump->um_mountp->mnt_wapbl) {
+                                       UFS_WAPBL_REGISTER_DEALLOCATION(
+                                           oip->i_ump->um_mountp,
+                                           fsbtodb(fs, bn), fs->fs_bsize);
+                               } else
+                                       ffs_blkfree(fs, oip->i_devvp, bn,
+                                           fs->fs_bsize, oip->i_number);
+                               blocksreleased += nblocks;
+                       }
+               }
+               if (lastiblock[level] >= 0)
+                       goto done;
+       }
+
+       /*
+        * All whole direct blocks or frags.
+        */
+       for (i = NDADDR - 1; i > lastblock; i--) {
+               long bsize;
+
+               if (oip->i_ump->um_fstype == UFS1)
+                       bn = ufs_rw32(oip->i_ffs1_db[i], UFS_FSNEEDSWAP(fs));
+               else
+                       bn = ufs_rw64(oip->i_ffs2_db[i], UFS_FSNEEDSWAP(fs));
+               if (bn == 0)
+                       continue;
+               DIP_ASSIGN(oip, db[i], 0);
+               bsize = blksize(fs, oip, i);
+               if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+                   (ovp->v_type != VREG)) {
+                       UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp,
+                           fsbtodb(fs, bn), bsize);
+               } else
+                       ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
+               blocksreleased += btodb(bsize);
+       }
+       if (lastblock < 0)
+               goto done;
+
+       /*
+        * Finally, look for a change in size of the
+        * last direct block; release any frags.
+        */
+       if (oip->i_ump->um_fstype == UFS1)
+               bn = ufs_rw32(oip->i_ffs1_db[lastblock], UFS_FSNEEDSWAP(fs));
+       else
+               bn = ufs_rw64(oip->i_ffs2_db[lastblock], UFS_FSNEEDSWAP(fs));
+       if (bn != 0) {
+               long oldspace, newspace;
+
+               /*
+                * Calculate amount of space we're giving
+                * back as old block size minus new block size.
+                */
+               oldspace = blksize(fs, oip, lastblock);
+               oip->i_size = length;
+               DIP_ASSIGN(oip, size, length);
+               newspace = blksize(fs, oip, lastblock);
+               if (newspace == 0)
+                       panic("itrunc: newspace");
+               if (oldspace - newspace > 0) {
+                       /*
+                        * Block number of space to be free'd is
+                        * the old block # plus the number of frags
+                        * required for the storage we're keeping.
+                        */
+                       bn += numfrags(fs, newspace);
+                       if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+                           (ovp->v_type != VREG)) {
+                               UFS_WAPBL_REGISTER_DEALLOCATION(
+                                   oip->i_ump->um_mountp, fsbtodb(fs, bn),
+                                   oldspace - newspace);
+                       } else
+                               ffs_blkfree(fs, oip->i_devvp, bn,
+                                   oldspace - newspace, oip->i_number);
+                       blocksreleased += btodb(oldspace - newspace);
+               }
+       }
+
+done:
+#ifdef DIAGNOSTIC
+       for (level = SINGLE; level <= TRIPLE; level++)
+               if (blks[NDADDR + level] != DIP(oip, ib[level]))
+                       panic("itrunc1");
+       for (i = 0; i < NDADDR; i++)
+               if (blks[i] != DIP(oip, db[i]))
+                       panic("itrunc2");
+       if (length == 0 &&
+           (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+               panic("itrunc3");
+#endif /* DIAGNOSTIC */
+       /*
+        * Put back the real size.
+        */
+       oip->i_size = length;
+       DIP_ASSIGN(oip, size, length);
+       DIP_ADD(oip, blocks, -blocksreleased);
+       genfs_node_unlock(ovp);
+       oip->i_flag |= IN_CHANGE;
+       UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
+#if defined(QUOTA) || defined(QUOTA2)
+       (void) chkdq(oip, -blocksreleased, NOCRED, 0);
+#endif
+       KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size);
+       return (allerror);
+}
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn.  Blocks are free'd in LIFO order up to (but not including)
+ * lastbn.  If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
+static int
+ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
+    int level, int64_t *countp)
+{
+       int i;
+       struct buf *bp;
+       struct fs *fs = ip->i_fs;
+       int32_t *bap1 = NULL;
+       int64_t *bap2 = NULL;
+       struct vnode *vp;
+       daddr_t nb, nlbn, last;
+       char *copy = NULL;
+       int64_t blkcount, factor, blocksreleased = 0;
+       int nblocks;
+       int error = 0, allerror = 0;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
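+/*
+ * RBAP reads entry i of the indirect block's pointer array, byte-swapped
+ * as needed; BAP_ASSIGN stores a raw value into it.  Both handle the
+ * UFS1 32-bit and UFS2 64-bit pointer layouts.
+ */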
+#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \
+           ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap))
+#define BAP_ASSIGN(ip, i, value)                                       \
+       do {                                                            \
+               if ((ip)->i_ump->um_fstype == UFS1)                     \
+                       bap1[i] = (value);                              \
+               else                                                    \
+                       bap2[i] = (value);                              \
+       } while(0)
+
+       /*
+        * Calculate index in current block of last
+        * block to be kept.  -1 indicates the entire
+        * block so we need not calculate the index.
+        */
+       factor = 1;
+       for (i = SINGLE; i < level; i++)
+               factor *= NINDIR(fs);
+       last = lastbn;
+       if (lastbn > 0)
+               last /= factor;
+       nblocks = btodb(fs->fs_bsize);
+       /*
+        * Get buffer of block pointers, zero those entries corresponding
+        * to blocks to be free'd, and update the on-disk copy first.  Since
+        * a double (triple) indirect block is freed before the single
+        * (double) indirect blocks it points at, calls to bmap on those
+        * blocks will fail.  However, we already have
+        * the on disk address, so we have to set the b_blkno field
+        * explicitly instead of letting bread do everything for us.
+        */
+       vp = ITOV(ip);
+       error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp);
+       if (error) {
+               *countp = 0;
+               return error;
+       }
+       if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+               /* Braces must be here in case trace evaluates to nothing. */
+               trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
+       } else {
+               trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
+               curlwp->l_ru.ru_inblock++;      /* pay for read */
+               bp->b_flags |= B_READ;
+               bp->b_flags &= ~B_COWDONE;      /* we change blkno below */
+               if (bp->b_bcount > bp->b_bufsize)
+                       panic("ffs_indirtrunc: bad buffer size");
+               bp->b_blkno = dbn;
+               BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+               VOP_STRATEGY(vp, bp);
+               error = biowait(bp);
+               if (error == 0)
+                       error = fscow_run(bp, true);
+       }
+       if (error) {
+               brelse(bp, 0);
+               *countp = 0;
+               return (error);
+       }
+
+       if (ip->i_ump->um_fstype == UFS1)
+               bap1 = (int32_t *)bp->b_data;
+       else
+               bap2 = (int64_t *)bp->b_data;
+       if (lastbn >= 0) {
+               copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK);
+               memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize);
+               for (i = last + 1; i < NINDIR(fs); i++)
+                       BAP_ASSIGN(ip, i, 0);
+               error = bwrite(bp);
+               if (error)
+                       allerror = error;
+               if (ip->i_ump->um_fstype == UFS1)
+                       bap1 = (int32_t *)copy;
+               else
+                       bap2 = (int64_t *)copy;
+       }
+
+       /*
+        * Recursively free totally unused blocks.
+        */
+       for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
+           i--, nlbn += factor) {
+               nb = RBAP(ip, i);
+               if (nb == 0)
+                       continue;
+               if (level > SINGLE) {
+                       error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+                                              (daddr_t)-1, level - 1,
+                                              &blkcount);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += blkcount;
+               }
+               if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+                   ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
+                       UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp,
+                           fsbtodb(fs, nb), fs->fs_bsize);
+               } else
+                       ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
+                           ip->i_number);
+               blocksreleased += nblocks;
+       }
+
+       /*
+        * Recursively free last partial block.
+        */
+       if (level > SINGLE && lastbn >= 0) {
+               last = lastbn % factor;
+               nb = RBAP(ip, i);
+               if (nb != 0) {
+                       error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb),
+                                              last, level - 1, &blkcount);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += blkcount;
+               }
+       }
+
+       if (copy != NULL) {
+               free(copy, M_TEMP);
+       } else {
+               brelse(bp, BC_INVAL);
+       }
+
+       *countp = blocksreleased;
+       return (allerror);
+}
+
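+/*
+ * Apply the inode's pending IN_ACCESS/IN_UPDATE/IN_CHANGE timestamp
+ * updates; NULL time arguments default to the current time.
+ */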
+void
+ffs_itimes(struct inode *ip, const struct timespec *acc,
+    const struct timespec *mod, const struct timespec *cre)
+{
+       struct timespec now;
+
+       if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
+               return;
+       }
+
+       vfs_timestamp(&now);
+       if (ip->i_flag & IN_ACCESS) {
+               if (acc == NULL)
+                       acc = &now;
+               DIP_ASSIGN(ip, atime, acc->tv_sec);
+               DIP_ASSIGN(ip, atimensec, acc->tv_nsec);
+       }
+       if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+               if ((ip->i_flags & SF_SNAPSHOT) == 0) {
+                       if (mod == NULL)
+                               mod = &now;
+                       DIP_ASSIGN(ip, mtime, mod->tv_sec);
+                       DIP_ASSIGN(ip, mtimensec, mod->tv_nsec);
+               }
+               ip->i_modrev++;
+       }
+       if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+               if (cre == NULL)
+                       cre = &now;
+               DIP_ASSIGN(ip, ctime, cre->tv_sec);
+               DIP_ASSIGN(ip, ctimensec, cre->tv_nsec);
+       }
+       if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
+               ip->i_flag |= IN_ACCESSED;
+       if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
+               ip->i_flag |= IN_MODIFIED;
+       ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
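
ffs_truncate() above decides which pointers survive by computing lastblock plus one
lastiblock[] index per indirection level; an index that goes negative means the
entire tree at that level is released. The arithmetic is easy to check in
isolation. The sketch below assumes an 8 KB-block UFS2 layout; the constants are
illustrative, not read from a mounted file system.

    #include <stdio.h>

    #define BSIZE   8192            /* fs_bsize */
    #define NDADDR  12              /* direct blocks per inode */
    #define NINDIR  (BSIZE / 8)     /* 64-bit pointers per indirect block */

    int
    main(void)
    {
            long long length = 1000000;     /* new file size in bytes */
            long lastblock = (long)((length + BSIZE - 1) / BSIZE) - 1;
            long single = lastblock - NDADDR;
            long dbl = single - NINDIR;
            long triple = dbl - (long)NINDIR * NINDIR;

            printf("lastblock %ld\n", lastblock);
            printf("lastiblock: single %ld, double %ld, triple %ld\n",
                single, dbl, triple);
            /* A negative index means that whole indirect tree is freed. */
            return 0;
    }

For length 1000000 this prints lastblock 122 with a non-negative single index, so
only the double and triple indirect trees are freed outright, matching the
lastiblock[level] < 0 tests in ffs_truncate().
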
diff --git a/sys/ufs/ffs/ffs_quota2.c b/sys/ufs/ffs/ffs_quota2.c
new file mode 100644 (file)
index 0000000..b3d45b3
--- /dev/null
+++ b/sys/ufs/ffs/ffs_quota2.c
@@ -0,0 +1,118 @@
+/* $NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_quota2.c,v 1.4 2011/06/12 03:36:00 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota2.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ffs/fs.h>
+
+
+int
+ffs_quota2_mount(struct mount *mp)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       int error = 0;
+       struct vnode *vp;
+       struct lwp *l = curlwp;
+
+       if ((fs->fs_flags & FS_DOQUOTA2) == 0)
+               return 0;
+
+       ump->um_flags |= UFS_QUOTA2;
+       ump->umq2_bsize = fs->fs_bsize;
+       ump->umq2_bmask = fs->fs_qbmask;
+       if (fs->fs_quota_magic != Q2_HEAD_MAGIC) {
+               printf("%s: Invalid quota magic number\n",
+                   mp->mnt_stat.f_mntonname);
+               return EINVAL;
+       }
+       if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA)) &&
+           fs->fs_quotafile[USRQUOTA] == 0) {
+               printf("%s: no user quota inode\n",
+                   mp->mnt_stat.f_mntonname);
+               error = EINVAL;
+       }
+       if ((fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA)) &&
+           fs->fs_quotafile[GRPQUOTA] == 0) {
+               printf("%s: no group quota inode\n",
+                   mp->mnt_stat.f_mntonname);
+               error = EINVAL;
+       }
+       if (error)
+               return error;
+
+       if (fs->fs_quota_flags & FS_Q2_DO_TYPE(USRQUOTA) &&
+           ump->um_quotas[USRQUOTA] == NULLVP) {
+               error = VFS_VGET(mp, fs->fs_quotafile[USRQUOTA], &vp);
+               if (error) {
+                       printf("%s: can't vget() user quota inode: %d\n",
+                           mp->mnt_stat.f_mntonname, error);
+                       return error;
+               }
+               ump->um_quotas[USRQUOTA] = vp;
+               ump->um_cred[USRQUOTA] = l->l_cred;
+               mutex_enter(vp->v_interlock);
+               vp->v_writecount++;
+               mutex_exit(vp->v_interlock);
+               VOP_UNLOCK(vp);
+       }
+       if (fs->fs_quota_flags & FS_Q2_DO_TYPE(GRPQUOTA) &&
+           ump->um_quotas[GRPQUOTA] == NULLVP) {
+               error = VFS_VGET(mp, fs->fs_quotafile[GRPQUOTA], &vp);
+               if (error) {
+                       vn_close(ump->um_quotas[USRQUOTA],
+                           FREAD|FWRITE, l->l_cred);
+                       printf("%s: can't vget() group quota inode: %d\n",
+                           mp->mnt_stat.f_mntonname, error);
+                       return error;
+               }
+               ump->um_quotas[GRPQUOTA] = vp;
+               ump->um_cred[GRPQUOTA] = l->l_cred;
+               mutex_enter(vp->v_interlock);
+               vp->v_vflag |= VV_SYSTEM;
+               vp->v_writecount++;
+               mutex_exit(vp->v_interlock);
+               VOP_UNLOCK(vp);
+       }
+       mp->mnt_flag |= MNT_QUOTA;
+       return 0;
+}
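
ffs_quota2_mount() above enables a quota type only when its FS_Q2_DO_TYPE() bit is
set in fs_quota_flags and the superblock records a quota inode for that type. The
sketch below shows just the flag test; the definition of FS_Q2_DO_TYPE(type) as
(1 << (type)) is an assumption matching NetBSD's quota2.h, not part of this diff.

    #include <stdio.h>

    #define USRQUOTA        0
    #define GRPQUOTA        1
    #define FS_Q2_DO_TYPE(type)     (1 << (type))   /* assumed definition */

    int
    main(void)
    {
            /* A superblock with only user quotas enabled. */
            int fs_quota_flags = FS_Q2_DO_TYPE(USRQUOTA);

            printf("user quota:  %s\n", (fs_quota_flags &
                FS_Q2_DO_TYPE(USRQUOTA)) ? "enabled" : "disabled");
            printf("group quota: %s\n", (fs_quota_flags &
                FS_Q2_DO_TYPE(GRPQUOTA)) ? "enabled" : "disabled");
            return 0;
    }
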
diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c
new file mode 100644 (file)
index 0000000..b1e07c1
--- /dev/null
+++ b/sys/ufs/ffs/ffs_snapshot.c
@@ -0,0 +1,2331 @@
+/*     $NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $       */
+
+/*
+ * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
+ *
+ * Further information about snapshots can be obtained from:
+ *
+ *     Marshall Kirk McKusick          http://www.mckusick.com/softdep/
+ *     1614 Oxford Street              mckusick@mckusick.com
+ *     Berkeley, CA 94709-1608         +1-510-843-9542
+ *     USA
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_snapshot.c      8.11 (McKusick) 7/23/00
+ *
+ *     from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.118 2011/10/07 09:35:06 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/sched.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/vnode.h>
+#include <sys/kauth.h>
+#include <sys/fstrans.h>
+#include <sys/wapbl.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+struct snap_info {
+       kmutex_t si_lock;                       /* Lock this snapinfo */
+       kmutex_t si_snaplock;                   /* Snapshot vnode common lock */
+       lwp_t *si_owner;                        /* Snaplock owner */
+       TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
+       daddr_t *si_snapblklist;                /* Snapshot block hints list */
+       uint32_t si_gen;                        /* Incremented on change */
+};
+
+#if !defined(FFS_NO_SNAPSHOT)
+typedef int (*acctfunc_t)
+    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);
+
+static int snapshot_setup(struct mount *, struct vnode *);
+static int snapshot_copyfs(struct mount *, struct vnode *, void **);
+static int snapshot_expunge(struct mount *, struct vnode *,
+    struct fs *, daddr_t *, daddr_t **);
+static int snapshot_expunge_snap(struct mount *, struct vnode *,
+    struct fs *, daddr_t);
+static int snapshot_writefs(struct mount *, struct vnode *, void *);
+static int cgaccount(struct vnode *, int, int *);
+static int cgaccount1(int, struct vnode *, void *, int);
+static int expunge(struct vnode *, struct inode *, struct fs *,
+    acctfunc_t, int);
+static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
+    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
+static int fullacct(struct vnode *, void *, int, int, struct fs *,
+    daddr_t, int);
+static int snapacct(struct vnode *, void *, int, int, struct fs *,
+    daddr_t, int);
+static int mapacct(struct vnode *, void *, int, int, struct fs *,
+    daddr_t, int);
+#endif /* !defined(FFS_NO_SNAPSHOT) */
+
+static int ffs_copyonwrite(void *, struct buf *, bool);
+static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
+static int rwfsblk(struct vnode *, int, void *, daddr_t);
+static int syncsnap(struct vnode *);
+static int wrsnapblk(struct vnode *, void *, daddr_t);
+#if !defined(FFS_NO_SNAPSHOT)
+static int blocks_in_journal(struct fs *);
+#endif
+
+static inline bool is_active_snapshot(struct snap_info *, struct inode *);
+static inline daddr_t db_get(struct inode *, int);
+static inline void db_assign(struct inode *, int, daddr_t);
+static inline daddr_t ib_get(struct inode *, int);
+static inline void ib_assign(struct inode *, int, daddr_t);
+static inline daddr_t idb_get(struct inode *, void *, int);
+static inline void idb_assign(struct inode *, void *, int, daddr_t);
+
+#ifdef DEBUG
+static int snapdebug = 0;
+#endif
+
+int
+ffs_snapshot_init(struct ufsmount *ump)
+{
+       struct snap_info *si;
+
+       si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
+       if (si == NULL)
+               return ENOMEM;
+
+       TAILQ_INIT(&si->si_snapshots);
+       mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
+       si->si_owner = NULL;
+       si->si_gen = 0;
+       si->si_snapblklist = NULL;
+
+       return 0;
+}
+
+void
+ffs_snapshot_fini(struct ufsmount *ump)
+{
+       struct snap_info *si;
+
+       si = ump->um_snapinfo;
+       ump->um_snapinfo = NULL;
+
+       KASSERT(TAILQ_EMPTY(&si->si_snapshots));
+       mutex_destroy(&si->si_lock);
+       mutex_destroy(&si->si_snaplock);
+       KASSERT(si->si_snapblklist == NULL);
+       kmem_free(si, sizeof(*si));
+}
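+
+/*
+ * Sketch of the expected lifetime pairing of the two hooks above
+ * (illustrative only; the caller names are the usual ffs mount paths):
+ *
+ *     mount:          if ((error = ffs_snapshot_init(ump)) != 0)
+ *                             fail the mount;
+ *     unmount:        ffs_snapshot_fini(ump);
+ *
+ * Every snapshot must have been removed before the fini call; the
+ * KASSERT on si_snapshots above enforces this.
+ */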
+
+/*
+ * Create a snapshot file and initialize it for the filesystem.
+ * Vnode is locked on entry and return.
+ */
+int
+ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
+{
+#if defined(FFS_NO_SNAPSHOT)
+       return EOPNOTSUPP;
+}
+#else /* defined(FFS_NO_SNAPSHOT) */
+       bool suspended = false;
+       int error, redo = 0, snaploc;
+       void *sbbuf = NULL;
+       daddr_t *snaplist = NULL, snaplistsize = 0;
+       struct buf *bp, *nbp;
+       struct fs *copy_fs = NULL;
+       struct fs *fs = VFSTOUFS(mp)->um_fs;
+       struct inode *ip = VTOI(vp);
+       struct lwp *l = curlwp;
+       struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
+       struct timespec ts;
+       struct timeval starttime;
+#ifdef DEBUG
+       struct timeval endtime;
+#endif
+       struct vnode *devvp = ip->i_devvp;
+
+       /*
+        * If the vnode is already a snapshot, return.
+        */
+       if ((VTOI(vp)->i_flags & SF_SNAPSHOT)) {
+               if ((VTOI(vp)->i_flags & SF_SNAPINVAL))
+                       return EINVAL;
+               if (ctime) {
+                       ctime->tv_sec = DIP(VTOI(vp), mtime);
+                       ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
+               }
+               return 0;
+       }
+       /*
+        * Check for a free snapshot slot in the superblock.
+        */
+       for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+               if (fs->fs_snapinum[snaploc] == 0)
+                       break;
+       if (snaploc == FSMAXSNAP)
+               return (ENOSPC);
+       /*
+        * Prepare the vnode to become a snapshot.
+        */
+       error = snapshot_setup(mp, vp);
+       if (error)
+               goto out;
+
+       /*
+        * Copy all the cylinder group maps. Although the
+        * filesystem is still active, we hope that only a few
+        * cylinder groups will change between now and when we
+        * suspend operations. Thus, we will be able to quickly
+        * touch up the few cylinder groups that changed during
+        * the suspension period.
+        */
+       error = cgaccount(vp, 1, NULL);
+       if (error)
+               goto out;
+
+       /*
+        * snapshot is now valid
+        */
+       ip->i_flags &= ~SF_SNAPINVAL;
+       DIP_ASSIGN(ip, flags, ip->i_flags);
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+       /*
+        * Ensure that the snapshot is completely on disk.
+        * Since we have marked it as a snapshot it is safe to
+        * unlock it as no process will be allowed to write to it.
+        */
+       error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
+       if (error)
+               goto out;
+       VOP_UNLOCK(vp);
+       /*
+        * All allocations are done, so we can now suspend the filesystem.
+        */
+       error = vfs_suspend(vp->v_mount, 0);
+       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+       if (error)
+               goto out;
+       suspended = true;
+       getmicrotime(&starttime);
+       /*
+        * First, copy all the cylinder group maps that have changed.
+        */
+       error = cgaccount(vp, 2, &redo);
+       if (error)
+               goto out;
+       /*
+        * Create a copy of the superblock and its summary information.
+        */
+       error = snapshot_copyfs(mp, vp, &sbbuf);
+       copy_fs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
+       if (error)
+               goto out;
+       /*
+        * Expunge unlinked files from our view.
+        */
+       error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
+       if (error)
+               goto out;
+       /*
+        * Record snapshot inode. Since this is the newest snapshot,
+        * it must be placed at the end of the list.
+        */
+       if (ip->i_nlink > 0)
+               fs->fs_snapinum[snaploc] = ip->i_number;
+
+       mutex_enter(&si->si_lock);
+       if (is_active_snapshot(si, ip))
+               panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
+       TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
+       if (TAILQ_FIRST(&si->si_snapshots) == ip) {
+               /*
+                * If this is the first snapshot on this filesystem, put the
+                * preliminary list in place and establish the cow handler.
+                */
+               si->si_snapblklist = snaplist;
+               fscow_establish(mp, ffs_copyonwrite, devvp);
+       }
+       si->si_gen++;
+       mutex_exit(&si->si_lock);
+
+       vp->v_vflag |= VV_SYSTEM;
+       /*
+        * Set the mtime to the time the snapshot has been taken.
+        */
+       TIMEVAL_TO_TIMESPEC(&starttime, &ts);
+       if (ctime)
+               *ctime = ts;
+       DIP_ASSIGN(ip, mtime, ts.tv_sec);
+       DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       /*
+        * Copy allocation information from all snapshots and then
+        * expunge them from our view.
+        */
+       error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
+       if (error)
+               goto out;
+       /*
+        * Write the superblock and its summary information to the snapshot.
+        */
+       error = snapshot_writefs(mp, vp, sbbuf);
+       if (error)
+               goto out;
+       /*
+        * We're nearly done; ensure that the snapshot is completely on disk.
+        */
+       error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
+       if (error)
+               goto out;
+       /*
+        * Invalidate and free all pages on the snapshot vnode.
+        * We will read and write through the buffercache.
+        */
+       mutex_enter(vp->v_interlock);
+       error = VOP_PUTPAGES(vp, 0, 0,
+                   PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
+       if (error)
+               goto out;
+       /*
+        * Invalidate short ( < fs_bsize ) buffers.  We will always read
+        * full size buffers later.
+        */
+       mutex_enter(&bufcache_lock);
+       KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+       for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+               nbp = LIST_NEXT(bp, b_vnbufs);
+               KASSERT((bp->b_cflags & BC_BUSY) == 0);
+               if (bp->b_bcount < fs->fs_bsize) {
+                       bp->b_cflags |= BC_BUSY;
+                       brelsel(bp, BC_INVAL | BC_VFLUSH);
+               }
+       }
+       mutex_exit(&bufcache_lock);
+
+out:
+       if (sbbuf != NULL) {
+               free(copy_fs->fs_csp, M_UFSMNT);
+               free(sbbuf, M_UFSMNT);
+       }
+       if (fs->fs_active != NULL) {
+               free(fs->fs_active, M_DEVBUF);
+               fs->fs_active = NULL;
+       }
+
+       mutex_enter(&si->si_lock);
+       if (snaplist != NULL) {
+               if (si->si_snapblklist == snaplist)
+                       si->si_snapblklist = NULL;
+               free(snaplist, M_UFSMNT);
+       }
+       if (error) {
+               fs->fs_snapinum[snaploc] = 0;
+       } else {
+               /*
+                * As this is the newest list, it is the most inclusive, so
+                * it should replace the previous list.
+                */
+               si->si_snapblklist = ip->i_snapblklist;
+       }
+       si->si_gen++;
+       mutex_exit(&si->si_lock);
+
+       if (suspended) {
+               VOP_UNLOCK(vp);
+               vfs_resume(vp->v_mount);
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef DEBUG
+               getmicrotime(&endtime);
+               timersub(&endtime, &starttime, &endtime);
+               printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
+                   mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
+                   endtime.tv_usec / 1000, redo, fs->fs_ncg);
+#endif
+       }
+       if (error) {
+               if (!UFS_WAPBL_BEGIN(mp)) {
+                       (void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
+                       UFS_WAPBL_END(mp);
+               }
+       } else if (ip->i_nlink > 0)
+               vref(vp);
+       return (error);
+}
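+
+/*
+ * A minimal caller sketch (hypothetical, compiled out): "mp" and "snapvp"
+ * are assumed to be a mounted ffs and a vnode of a file on it.  The vnode
+ * must be locked around the call, per the contract above.
+ */
+#if 0
+static int
+example_take_snapshot(struct mount *mp, struct vnode *snapvp)
+{
+       struct timespec ts;
+       int error;
+
+       error = vn_lock(snapvp, LK_EXCLUSIVE | LK_RETRY);
+       if (error)
+               return error;
+       error = ffs_snapshot(mp, snapvp, &ts);
+       VOP_UNLOCK(snapvp);
+       return error;
+}
+#endif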
+
+/*
+ * Prepare vnode to become a snapshot.
+ */
+static int
+snapshot_setup(struct mount *mp, struct vnode *vp)
+{
+       int error, n, len, loc, cg;
+       daddr_t blkno, numblks;
+       struct buf *ibp, *nbp;
+       struct fs *fs = VFSTOUFS(mp)->um_fs;
+       struct lwp *l = curlwp;
+       const int wbreak = blocks_in_journal(fs)/8;
+       struct inode *ip = VTOI(vp);
+
+       /*
+        * Check mount, exclusive reference and owner.
+        */
+       if (vp->v_mount != mp)
+               return EXDEV;
+       if (vp->v_usecount != 1 || vp->v_writecount != 0)
+               return EBUSY;
+       if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL) != 0 &&
+           VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
+               return EACCES;
+
+       if (vp->v_size != 0) {
+               error = ffs_truncate(vp, 0, 0, NOCRED);
+               if (error)
+                       return error;
+       }
+
+       /* Change inode to snapshot type file. */
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+#if defined(QUOTA) || defined(QUOTA2)
+       /* snapshot inodes are not accounted in quotas */
+       chkiq(ip, -1, l->l_cred, 0);
+#endif
+       ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
+       DIP_ASSIGN(ip, flags, ip->i_flags);
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+       UFS_WAPBL_END(mp);
+
+       KASSERT(ip->i_flags & SF_SNAPSHOT);
+       /*
+        * Write an empty list of preallocated blocks to the end of
+        * the snapshot to set its size to at least that of the filesystem.
+        */
+       numblks = howmany(fs->fs_size, fs->fs_frag);
+       blkno = 1;
+       blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
+       error = vn_rdwr(UIO_WRITE, vp,
+           (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
+           UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
+       if (error)
+               return error;
+       /*
+        * Preallocate critical data structures so that we can copy
+        * them in without further allocation after we suspend all
+        * operations on the filesystem. We would like to just release
+        * the allocated buffers without writing them since they will
+        * be filled in below once we are ready to go, but this upsets
+        * the soft update code, so we go ahead and write the new buffers.
+        *
+        * Allocate all indirect blocks and mark all of them as not
+        * needing to be copied.
+        */
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+       for (blkno = NDADDR, n = 0; blkno < numblks; blkno += NINDIR(fs)) {
+               error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
+                   fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+               if (error)
+                       goto out;
+               brelse(ibp, 0);
+               if (wbreak > 0 && (++n % wbreak) == 0) {
+                       UFS_WAPBL_END(mp);
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+               }
+       }
+       /*
+        * Allocate copies for the superblock and its summary information.
+        */
+       error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
+           0, &nbp);
+       if (error)
+               goto out;
+       bawrite(nbp);
+       blkno = fragstoblks(fs, fs->fs_csaddr);
+       len = howmany(fs->fs_cssize, fs->fs_bsize);
+       for (loc = 0; loc < len; loc++) {
+               error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
+                   fs->fs_bsize, l->l_cred, 0, &nbp);
+               if (error)
+                       goto out;
+               bawrite(nbp);
+               if (wbreak > 0 && (++n % wbreak) == 0) {
+                       UFS_WAPBL_END(mp);
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+               }
+       }
+       /*
+        * Allocate all cylinder group blocks.
+        */
+       for (cg = 0; cg < fs->fs_ncg; cg++) {
+               error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
+                   fs->fs_bsize, l->l_cred, 0, &nbp);
+               if (error)
+                       goto out;
+               bawrite(nbp);
+               if (wbreak > 0 && (++n % wbreak) == 0) {
+                       UFS_WAPBL_END(mp);
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+               }
+       }
+
+out:
+       UFS_WAPBL_END(mp);
+       return error;
+}
+
+/*
+ * Create a copy of the superblock and its summary information.
+ * It is up to the caller to free *sbbuf and the copy's fs_csp.
+ */
+static int
+snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
+{
+       int error, i, len, loc, size;
+       void *space;
+       int32_t *lp;
+       struct buf *bp;
+       struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
+       struct lwp *l = curlwp;
+       struct vnode *devvp = VTOI(vp)->i_devvp;
+
+       /*
+        * Grab a copy of the superblock and its summary information.
+        * We delay writing it until the suspension is released below.
+        */
+       *sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+       loc = blkoff(fs, fs->fs_sblockloc);
+       if (loc > 0)
+               memset(*sbbuf, 0, loc);
+       copyfs = (struct fs *)((char *)(*sbbuf) + loc);
+       memcpy(copyfs, fs, fs->fs_sbsize);
+       size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
+       if (fs->fs_sbsize < size)
+               memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0, 
+                   size - fs->fs_sbsize);
+       size = blkroundup(fs, fs->fs_cssize);
+       if (fs->fs_contigsumsize > 0)
+               size += fs->fs_ncg * sizeof(int32_t);
+       space = malloc(size, M_UFSMNT, M_WAITOK);
+       copyfs->fs_csp = space;
+       memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
+       space = (char *)space + fs->fs_cssize;
+       loc = howmany(fs->fs_cssize, fs->fs_fsize);
+       i = fs->fs_frag - loc % fs->fs_frag;
+       len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
+       if (len > 0) {
+               if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
+                   len, l->l_cred, 0, &bp)) != 0) {
+                       brelse(bp, 0);
+                       free(copyfs->fs_csp, M_UFSMNT);
+                       free(*sbbuf, M_UFSMNT);
+                       *sbbuf = NULL;
+                       return error;
+               }
+               memcpy(space, bp->b_data, (u_int)len);
+               space = (char *)space + len;
+               brelse(bp, BC_INVAL | BC_NOCACHE);
+       }
+       if (fs->fs_contigsumsize > 0) {
+               copyfs->fs_maxcluster = lp = space;
+               for (i = 0; i < fs->fs_ncg; i++)
+                       *lp++ = fs->fs_contigsumsize;
+       }
+       if (mp->mnt_wapbl)
+               copyfs->fs_flags &= ~FS_DOWAPBL;
+       return 0;
+}
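+
+/*
+ * Layout of the *sbbuf buffer built above (a reading aid; all offsets
+ * follow from the code):
+ *
+ *     0 .. loc-1              zero padding, loc = blkoff(fs, fs_sblockloc)
+ *     loc .. loc+fs_sbsize-1  superblock copy (copyfs points here)
+ *     up to loc+SBLOCKSIZE-1  zeroed tail when fs_sbsize < SBLOCKSIZE
+ *
+ * The summary information is a separate allocation hung off
+ * copyfs->fs_csp, with the per-cg cluster hints appended when
+ * fs_contigsumsize > 0.
+ */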
+
+/*
+ * We must check for active files that have been unlinked (i.e., with a zero
+ * link count). We have to expunge all trace of these files from the snapshot
+ * so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
+ * Note that we skip unlinked snapshot files as they will be handled separately.
+ * Calculate the snapshot list size and create a preliminary list.
+ */
+static int
+snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
+    daddr_t *snaplistsize, daddr_t **snaplist)
+{
+       int cg, error = 0, len, loc;
+       daddr_t blkno, *blkp;
+       struct fs *fs = VFSTOUFS(mp)->um_fs;
+       struct inode *xp;
+       struct lwp *l = curlwp;
+       struct vattr vat;
+       struct vnode *logvp = NULL, *mvp = NULL, *xvp;
+
+       *snaplist = NULL;
+       /*
+        * Get the log inode if any.
+        */
+       if ((fs->fs_flags & FS_DOWAPBL) &&
+           fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
+               error = VFS_VGET(mp,
+                   fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
+               if (error)
+                       goto out;
+       }
+       /*
+        * Allocate a marker vnode.
+        */
+       mvp = vnalloc(mp);
+       /*
+        * We also calculate the needed size for the snapshot list.
+        */
+       *snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
+           FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
+       mutex_enter(&mntvnode_lock);
+       /*
+        * NOTE: not using TAILQ_FOREACH here, since vgone() and vclean()
+        * can be called indirectly in this loop.
+        */
+       for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
+               vmark(mvp, xvp);
+               /*
+                * Make sure this vnode wasn't reclaimed in getnewvnode().
+                * Start over if it was (it won't be on the list anymore).
+                */
+               if (xvp->v_mount != mp || vismarker(xvp))
+                       continue;
+               mutex_enter(xvp->v_interlock);
+               if ((xvp->v_iflag & VI_XLOCK) ||
+                   xvp->v_usecount == 0 || xvp->v_type == VNON ||
+                   VTOI(xvp) == NULL ||
+                   (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
+                       mutex_exit(xvp->v_interlock);
+                       continue;
+               }
+               mutex_exit(&mntvnode_lock);
+               /*
+                * XXXAD should increase vnode ref count to prevent it
+                * disappearing or being recycled.
+                */
+               mutex_exit(xvp->v_interlock);
+#ifdef DEBUG
+               if (snapdebug)
+                       vprint("ffs_snapshot: busy vnode", xvp);
+#endif
+               xp = VTOI(xvp);
+               if (xvp != logvp) {
+                       if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
+                           vat.va_nlink > 0) {
+                               mutex_enter(&mntvnode_lock);
+                               continue;
+                       }
+                       if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
+                               mutex_enter(&mntvnode_lock);
+                               continue;
+                       }
+               }
+               /*
+                * If there is a fragment, clear it here.
+                */
+               blkno = 0;
+               loc = howmany(xp->i_size, fs->fs_bsize) - 1;
+               if (loc < NDADDR) {
+                       len = fragroundup(fs, blkoff(fs, xp->i_size));
+                       if (len > 0 && len < fs->fs_bsize) {
+                               error = UFS_WAPBL_BEGIN(mp);
+                               if (error) {
+                                       (void)vunmark(mvp);
+                                       goto out;
+                               }
+                               ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
+                                   len, xp->i_number);
+                               blkno = db_get(xp, loc);
+                               db_assign(xp, loc, 0);
+                               UFS_WAPBL_END(mp);
+                       }
+               }
+               *snaplistsize += 1;
+               error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
+               if (blkno)
+                       db_assign(xp, loc, blkno);
+               if (!error) {
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (!error) {
+                               error = ffs_freefile_snap(copy_fs, vp,
+                                   xp->i_number, xp->i_mode);
+                               UFS_WAPBL_END(mp);
+                       }
+               }
+               if (error) {
+                       (void)vunmark(mvp);
+                       goto out;
+               }
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       /*
+        * Create a preliminary list of preallocated snapshot blocks.
+        */
+       *snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+       blkp = &(*snaplist)[1];
+       *blkp++ = lblkno(fs, fs->fs_sblockloc);
+       blkno = fragstoblks(fs, fs->fs_csaddr);
+       for (cg = 0; cg < fs->fs_ncg; cg++) {
+               if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
+                       break;
+               *blkp++ = fragstoblks(fs, cgtod(fs, cg));
+       }
+       len = howmany(fs->fs_cssize, fs->fs_bsize);
+       for (loc = 0; loc < len; loc++)
+               *blkp++ = blkno + loc;
+       for (; cg < fs->fs_ncg; cg++)
+               *blkp++ = fragstoblks(fs, cgtod(fs, cg));
+       (*snaplist)[0] = blkp - &(*snaplist)[0];
+
+out:
+       if (mvp != NULL)
+               vnfree(mvp);
+       if (logvp != NULL)
+               vput(logvp);
+       if (error && *snaplist != NULL) {
+               free(*snaplist, M_UFSMNT);
+               *snaplist = NULL;
+       }
+
+       return error;
+}
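+
+/*
+ * Shape of the preliminary list built above (a reading aid; the indices
+ * follow from the code):
+ *
+ *     snaplist[0]     number of valid entries, filled in last
+ *     snaplist[1]     logical block of the superblock
+ *     snaplist[2..]   cylinder group blocks below fs_csaddr, then the
+ *                     summary area blocks, then the remaining cylinder
+ *                     group blocks, in ascending order
+ */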
+
+/*
+ * Copy allocation information from all the snapshots in this snapshot and
+ * then expunge them from its view. Also, collect the list of allocated
+ * blocks in i_snapblklist.
+ */
+static int
+snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
+    struct fs *copy_fs, daddr_t snaplistsize)
+{
+       int error = 0, i;
+       daddr_t numblks, *snaplist = NULL;
+       struct fs *fs = VFSTOUFS(mp)->um_fs;
+       struct inode *ip = VTOI(vp), *xp;
+       struct lwp *l = curlwp;
+       struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
+
+       TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
+               if (xp != ip) {
+                       error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
+                       if (error)
+                               break;
+               }
+               if (xp->i_nlink != 0)
+                       continue;
+               error = UFS_WAPBL_BEGIN(mp);
+               if (error)
+                       break;
+               error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
+               UFS_WAPBL_END(mp);
+               if (error)
+                       break;
+       }
+       if (error)
+               goto out;
+       /*
+        * Allocate space for the full list of preallocated snapshot blocks.
+        */
+       snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+       ip->i_snapblklist = &snaplist[1];
+       /*
+        * Expunge the blocks used by the snapshots from the set of
+        * blocks marked as used in the snapshot bitmaps. Also, collect
+        * the list of allocated blocks in i_snapblklist.
+        */
+       error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
+       if (error)
+               goto out;
+       if (snaplistsize < ip->i_snapblklist - snaplist)
+               panic("ffs_snapshot: list too small");
+       snaplistsize = ip->i_snapblklist - snaplist;
+       snaplist[0] = snaplistsize;
+       ip->i_snapblklist = &snaplist[0];
+       /*
+        * Write out the list of allocated blocks to the end of the snapshot.
+        */
+       numblks = howmany(fs->fs_size, fs->fs_frag);
+       for (i = 0; i < snaplistsize; i++)
+               snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
+       error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
+           snaplistsize * sizeof(daddr_t), lblktosize(fs, (off_t)numblks),
+           UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
+       for (i = 0; i < snaplistsize; i++)
+               snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
+out:
+       if (error && snaplist != NULL) {
+               free(snaplist, M_UFSMNT);
+               ip->i_snapblklist = NULL;
+       }
+       return error;
+}
+
+/*
+ * Write the superblock and its summary information to the snapshot.
+ * Make sure the first NDADDR blocks get copied to the snapshot.
+ */
+static int
+snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
+{
+       int error, len, loc;
+       void *space;
+       daddr_t blkno;
+       struct buf *bp;
+       struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
+       struct inode *ip = VTOI(vp);
+       struct lwp *l = curlwp;
+
+       copyfs = (struct fs *)((char *)sbbuf + blkoff(fs, fs->fs_sblockloc));
+
+       /*
+        * Write the superblock and its summary information
+        * to the snapshot.
+        */
+       blkno = fragstoblks(fs, fs->fs_csaddr);
+       len = howmany(fs->fs_cssize, fs->fs_bsize);
+       space = copyfs->fs_csp;
+#ifdef FFS_EI
+       if (UFS_FSNEEDSWAP(fs)) {
+               ffs_sb_swap(copyfs, copyfs);
+               ffs_csum_swap(space, space, fs->fs_cssize);
+       }
+#endif
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+       for (loc = 0; loc < len; loc++) {
+               error = bread(vp, blkno + loc, fs->fs_bsize, l->l_cred,
+                   B_MODIFY, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       break;
+               }
+               memcpy(bp->b_data, space, fs->fs_bsize);
+               space = (char *)space + fs->fs_bsize;
+               bawrite(bp);
+       }
+       if (error)
+               goto out;
+       error = bread(vp, lblkno(fs, fs->fs_sblockloc),
+           fs->fs_bsize, l->l_cred, B_MODIFY, &bp);
+       if (error) {
+               brelse(bp, 0);
+               goto out;
+       } else {
+               memcpy(bp->b_data, sbbuf, fs->fs_bsize);
+               bawrite(bp);
+       }
+       /*
+        * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
+        * and ffs_snapblkfree() will always work on indirect blocks.
+        */
+       for (loc = 0; loc < NDADDR; loc++) {
+               if (db_get(ip, loc) != 0)
+                       continue;
+               error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
+                   fs->fs_bsize, l->l_cred, 0, &bp);
+               if (error)
+                       break;
+               error = rwfsblk(vp, B_READ, bp->b_data, loc);
+               if (error) {
+                       brelse(bp, 0);
+                       break;
+               }
+               bawrite(bp);
+       }
+
+out:
+       UFS_WAPBL_END(mp);
+       return error;
+}
+
+/*
+ * Copy all cylinder group maps.
+ */
+static int
+cgaccount(struct vnode *vp, int passno, int *redo)
+{
+       int cg, error = 0;
+       struct buf *nbp;
+       struct fs *fs = VTOI(vp)->i_fs;
+
+       if (redo != NULL)
+               *redo = 0;
+       if (passno == 1)
+               fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
+                   M_DEVBUF, M_WAITOK | M_ZERO);
+       for (cg = 0; cg < fs->fs_ncg; cg++) {
+               if (passno == 2 && ACTIVECG_ISSET(fs, cg))
+                       continue;
+
+               if (redo != NULL)
+                       *redo += 1;
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error)
+                       return error;
+               error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
+                   fs->fs_bsize, curlwp->l_cred, 0, &nbp);
+               if (error) {
+                       UFS_WAPBL_END(vp->v_mount);
+                       break;
+               }
+               error = cgaccount1(cg, vp, nbp->b_data, passno);
+               bawrite(nbp);
+               UFS_WAPBL_END(vp->v_mount);
+               if (error)
+                       break;
+       }
+       return error;
+}
+
+/*
+ * Copy a cylinder group map. All the unallocated blocks are marked
+ * BLK_NOCOPY so that the snapshot knows that it need not copy them
+ * if they are later written. If passno is 1, then this is a first
+ * pass, so only setting needs to be done. If passno is 2, then this
+ * is a revision to a previous pass which must be undone as the
+ * replacement pass is done.
+ */
+static int
+cgaccount1(int cg, struct vnode *vp, void *data, int passno)
+{
+       struct buf *bp, *ibp;
+       struct inode *ip;
+       struct cg *cgp;
+       struct fs *fs;
+       struct lwp *l = curlwp;
+       daddr_t base, numblks;
+       int error, len, loc, ns, indiroff;
+
+       ip = VTOI(vp);
+       fs = ip->i_fs;
+       ns = UFS_FSNEEDSWAP(fs);
+       error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+               (int)fs->fs_cgsize, l->l_cred, 0, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       cgp = (struct cg *)bp->b_data;
+       if (!cg_chkmagic(cgp, ns)) {
+               brelse(bp, 0);
+               return (EIO);
+       }
+       ACTIVECG_SET(fs, cg);
+
+       memcpy(data, bp->b_data, fs->fs_cgsize);
+       brelse(bp, 0);
+       if (fs->fs_cgsize < fs->fs_bsize)
+               memset((char *)data + fs->fs_cgsize, 0,
+                   fs->fs_bsize - fs->fs_cgsize);
+       numblks = howmany(fs->fs_size, fs->fs_frag);
+       len = howmany(fs->fs_fpg, fs->fs_frag);
+       base = cg * fs->fs_fpg / fs->fs_frag;
+       if (base + len >= numblks)
+               len = numblks - base - 1;
+       loc = 0;
+       if (base < NDADDR) {
+               for ( ; loc < NDADDR; loc++) {
+                       if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
+                               db_assign(ip, loc, BLK_NOCOPY);
+                       else if (db_get(ip, loc) == BLK_NOCOPY) {
+                               if (passno == 2)
+                                       db_assign(ip, loc, 0);
+                               else if (passno == 1)
+                                       panic("ffs_snapshot: lost direct block");
+                       }
+               }
+       }
+       if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
+           fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
+               return (error);
+       indiroff = (base + loc - NDADDR) % NINDIR(fs);
+       for ( ; loc < len; loc++, indiroff++) {
+               if (indiroff >= NINDIR(fs)) {
+                       bawrite(ibp);
+                       if ((error = ffs_balloc(vp,
+                           lblktosize(fs, (off_t)(base + loc)),
+                           fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
+                               return (error);
+                       indiroff = 0;
+               }
+               if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
+                       idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
+               else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
+                       if (passno == 2)
+                               idb_assign(ip, ibp->b_data, indiroff, 0);
+                       else if (passno == 1)
+                               panic("ffs_snapshot: lost indirect block");
+               }
+       }
+       bdwrite(ibp);
+       return (0);
+}
+
+/*
+ * Before expunging a snapshot inode, note all the
+ * blocks that it claims with BLK_SNAP so that fsck will
+ * be able to account for those blocks properly and so
+ * that this snapshot knows that it need not copy them
+ * if the other snapshot holding them is freed.
+ */
+static int
+expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
+    acctfunc_t acctfunc, int expungetype)
+{
+       int i, error, ns;
+       daddr_t lbn, rlbn;
+       daddr_t len, blkno, numblks, blksperindir;
+       struct ufs1_dinode *dip1;
+       struct ufs2_dinode *dip2;
+       struct lwp *l = curlwp;
+       void *bap;
+       struct buf *bp;
+       struct mount *mp;
+
+       ns = UFS_FSNEEDSWAP(fs);
+       mp = snapvp->v_mount;
+
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+       /*
+        * Prepare to expunge the inode. If its inode block has not
+        * yet been copied, then allocate and fill the copy.
+        */
+       lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
+       error = snapblkaddr(snapvp, lbn, &blkno);
+       if (error)
+               return error;
+       if (blkno != 0) {
+               error = bread(snapvp, lbn, fs->fs_bsize, l->l_cred,
+                   B_MODIFY, &bp);
+       } else {
+               error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
+                   fs->fs_bsize, l->l_cred, 0, &bp);
+               if (!error)
+                       error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
+       }
+       if (error) {
+               UFS_WAPBL_END(mp);
+               return error;
+       }
+       /*
+        * Set a snapshot inode to be a zero-length file; set regular files
+        * or unlinked snapshots to be completely unallocated.
+        */
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+               dip1 = (struct ufs1_dinode *)bp->b_data +
+                   ino_to_fsbo(fs, cancelip->i_number);
+               if (cancelip->i_flags & SF_SNAPSHOT) {
+                       dip1->di_flags =
+                           ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
+                           SF_SNAPINVAL, ns);
+               }
+               if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
+                       dip1->di_mode = 0;
+               dip1->di_size = 0;
+               dip1->di_blocks = 0;
+               memset(&dip1->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int32_t));
+       } else {
+               dip2 = (struct ufs2_dinode *)bp->b_data +
+                   ino_to_fsbo(fs, cancelip->i_number);
+               if (cancelip->i_flags & SF_SNAPSHOT) {
+                       dip2->di_flags =
+                           ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
+                           SF_SNAPINVAL, ns);
+               }
+               if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
+                       dip2->di_mode = 0;
+               dip2->di_size = 0;
+               dip2->di_blocks = 0;
+               memset(&dip2->di_db[0], 0, (NDADDR + NIADDR) * sizeof(int64_t));
+       }
+       bdwrite(bp);
+       UFS_WAPBL_END(mp);
+       /*
+        * Now go through and expunge all the blocks in the file
+        * using the function requested.
+        */
+       numblks = howmany(cancelip->i_size, fs->fs_bsize);
+       if (fs->fs_magic == FS_UFS1_MAGIC)
+               bap = &cancelip->i_ffs1_db[0];
+       else
+               bap = &cancelip->i_ffs2_db[0];
+       error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype);
+       if (error)
+               return (error);
+       if (fs->fs_magic == FS_UFS1_MAGIC)
+               bap = &cancelip->i_ffs1_ib[0];
+       else
+               bap = &cancelip->i_ffs2_ib[0];
+       error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype);
+       if (error)
+               return (error);
+       blksperindir = 1;
+       lbn = -NDADDR;
+       len = numblks - NDADDR;
+       rlbn = NDADDR;
+       for (i = 0; len > 0 && i < NIADDR; i++) {
+               error = indiracct(snapvp, ITOV(cancelip), i,
+                   ib_get(cancelip, i), lbn, rlbn, len,
+                   blksperindir, fs, acctfunc, expungetype);
+               if (error)
+                       return (error);
+               blksperindir *= NINDIR(fs);
+               lbn -= blksperindir + 1;
+               len -= blksperindir;
+               rlbn += blksperindir;
+       }
+       return (0);
+}
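+
+/*
+ * Note on the lbn arithmetic above: ffs addresses indirect blocks with
+ * negative logical block numbers starting at -NDADDR (see ufs_getlbns()),
+ * so the loop seeds lbn with -NDADDR and steps it down by blksperindir + 1
+ * per indirection level.
+ */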
+
+/*
+ * Descend an indirect block chain for vnode cancelvp, accounting for all
+ * its indirect blocks in snapvp.
+ */
+static int
+indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
+    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
+    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
+{
+       int error, num, i;
+       daddr_t subblksperindir;
+       struct indir indirs[NIADDR + 2];
+       daddr_t last;
+       void *bap;
+       struct buf *bp;
+
+       if (blkno == 0) {
+               if (expungetype == BLK_NOCOPY)
+                       return (0);
+               panic("indiracct: missing indir");
+       }
+       if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
+               return (error);
+       if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
+               panic("indiracct: botched params");
+       /*
+        * We have to expand bread here since it will deadlock looking
+        * up the block number for any blocks that are not in the cache.
+        */
+       error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
+           false, &bp);
+       if (error)
+               return error;
+       if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
+           rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
+               brelse(bp, 0);
+               return (error);
+       }
+       /*
+        * Account for the block pointers in this indirect block.
+        */
+       last = howmany(remblks, blksperindir);
+       if (last > NINDIR(fs))
+               last = NINDIR(fs);
+       bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
+       memcpy((void *)bap, bp->b_data, fs->fs_bsize);
+       brelse(bp, 0);
+       error = (*acctfunc)(snapvp, bap, 0, last,
+           fs, level == 0 ? rlbn : -1, expungetype);
+       if (error || level == 0)
+               goto out;
+       /*
+        * Account for the block pointers in each of the indirect blocks
+        * in the levels below us.
+        */
+       subblksperindir = blksperindir / NINDIR(fs);
+       for (lbn++, level--, i = 0; i < last; i++) {
+               error = indiracct(snapvp, cancelvp, level,
+                   idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
+                   subblksperindir, fs, acctfunc, expungetype);
+               if (error)
+                       goto out;
+               rlbn += blksperindir;
+               lbn -= blksperindir;
+               remblks -= blksperindir;
+       }
+out:
+       free(bap, M_DEVBUF);
+       return (error);
+}
+
+/*
+ * Do both snap accounting and map accounting.
+ */
+static int
+fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+    struct fs *fs, daddr_t lblkno,
+    int exptype /* BLK_SNAP or BLK_NOCOPY */)
+{
+       int error;
+
+       if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
+               return (error);
+       return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
+}
+
+/*
+ * Identify a set of blocks allocated in a snapshot inode.
+ */
+static int
+snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+    struct fs *fs, daddr_t lblkno,
+    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
+{
+       struct inode *ip = VTOI(vp);
+       struct lwp *l = curlwp;
+       struct mount *mp = vp->v_mount;
+       daddr_t blkno;
+       daddr_t lbn;
+       struct buf *ibp;
+       int error, n;
+       const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
+
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+       for (n = 0; oldblkp < lastblkp; oldblkp++) {
+               blkno = idb_get(ip, bap, oldblkp);
+               if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
+                       continue;
+               lbn = fragstoblks(fs, blkno);
+               if (lbn < NDADDR) {
+                       blkno = db_get(ip, lbn);
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               } else {
+                       error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
+                           fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+                       if (error)
+                               break;
+                       blkno = idb_get(ip, ibp->b_data,
+                           (lbn - NDADDR) % NINDIR(fs));
+               }
+               /*
+                * If we are expunging a snapshot vnode and we
+                * find a block marked BLK_NOCOPY, then it is
+                * one that has been allocated to this snapshot after
+                * we took our current snapshot and can be ignored.
+                */
+               if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
+                       if (lbn >= NDADDR)
+                               brelse(ibp, 0);
+               } else {
+                       if (blkno != 0)
+                               panic("snapacct: bad block");
+                       if (lbn < NDADDR)
+                               db_assign(ip, lbn, expungetype);
+                       else {
+                               idb_assign(ip, ibp->b_data,
+                                   (lbn - NDADDR) % NINDIR(fs), expungetype);
+                               bdwrite(ibp);
+                       }
+               }
+               if (wbreak > 0 && (++n % wbreak) == 0) {
+                       UFS_WAPBL_END(mp);
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+               }
+       }
+       UFS_WAPBL_END(mp);
+       return error;
+}
+
+/*
+ * Account for a set of blocks allocated in a snapshot inode.
+ */
+static int
+mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
+    struct fs *fs, daddr_t lblkno, int expungetype)
+{
+       daddr_t blkno;
+       struct inode *ip;
+       struct mount *mp = vp->v_mount;
+       ino_t inum;
+       int acctit, error, n;
+       const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
+
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error)
+               return error;
+       ip = VTOI(vp);
+       inum = ip->i_number;
+       if (lblkno == -1)
+               acctit = 0;
+       else
+               acctit = 1;
+       for (n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
+               blkno = idb_get(ip, bap, oldblkp);
+               if (blkno == 0 || blkno == BLK_NOCOPY)
+                       continue;
+               if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
+                       *ip->i_snapblklist++ = lblkno;
+               if (blkno == BLK_SNAP)
+                       blkno = blkstofrags(fs, lblkno);
+               ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
+               if (wbreak > 0 && (++n % wbreak) == 0) {
+                       UFS_WAPBL_END(mp);
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+               }
+       }
+       UFS_WAPBL_END(mp);
+       return (0);
+}
+
+/*
+ * Number of blocks that fit into the journal or zero if not logging.
+ */
+static int
+blocks_in_journal(struct fs *fs)
+{
+       off_t bpj;
+
+       if ((fs->fs_flags & FS_DOWAPBL) == 0)
+               return 0;
+       bpj = 1;
+       if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
+               switch (fs->fs_journal_location) {
+               case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+                       bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
+                           fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+                       break;
+               case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+                       bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
+                           fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+                       break;
+               }
+       }
+       bpj /= fs->fs_bsize;
+       return (bpj > 0 ? bpj : 1);
+}
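+
+/*
+ * Worked example with illustrative numbers: an in-filesystem journal
+ * with fs_journallocs[UFS_WAPBL_INFS_BLKSZ] == 512 and
+ * fs_journallocs[UFS_WAPBL_INFS_COUNT] == 32768 spans 16 MB; with
+ * fs_bsize == 16384 that yields bpj == 1024, so the wbreak transaction
+ * break used above becomes 1024 / 8 == 128 allocations per transaction.
+ */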
+#endif /* defined(FFS_NO_SNAPSHOT) */
+
+/*
+ * Decrement extra reference on snapshot when last name is removed.
+ * It will not be freed until the last open reference goes away.
+ */
+void
+ffs_snapgone(struct inode *ip)
+{
+       struct mount *mp = ip->i_devvp->v_specmountpoint;
+       struct inode *xp;
+       struct fs *fs;
+       struct snap_info *si;
+       int snaploc;
+
+       si = VFSTOUFS(mp)->um_snapinfo;
+
+       /*
+        * Find snapshot in incore list.
+        */
+       mutex_enter(&si->si_lock);
+       TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
+               if (xp == ip)
+                       break;
+       mutex_exit(&si->si_lock);
+       if (xp != NULL)
+               vrele(ITOV(ip));
+#ifdef DEBUG
+       else if (snapdebug)
+               printf("ffs_snapgone: lost snapshot vnode %llu\n",
+                   (unsigned long long)ip->i_number);
+#endif
+       /*
+        * Delete snapshot inode from superblock. Keep list dense.
+        */
+       mutex_enter(&si->si_lock);
+       fs = ip->i_fs;
+       for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
+               if (fs->fs_snapinum[snaploc] == ip->i_number)
+                       break;
+       if (snaploc < FSMAXSNAP) {
+               for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
+                       if (fs->fs_snapinum[snaploc] == 0)
+                               break;
+                       fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
+               }
+               fs->fs_snapinum[snaploc - 1] = 0;
+       }
+       si->si_gen++;
+       mutex_exit(&si->si_lock);
+}
+
+/*
+ * Prepare a snapshot file to be removed.
+ */
+void
+ffs_snapremove(struct vnode *vp)
+{
+       struct inode *ip = VTOI(vp), *xp;
+       struct vnode *devvp = ip->i_devvp;
+       struct fs *fs = ip->i_fs;
+       struct mount *mp = devvp->v_specmountpoint;
+       struct buf *ibp;
+       struct snap_info *si;
+       struct lwp *l = curlwp;
+       daddr_t numblks, blkno, dblk;
+       int error, loc, last;
+
+       si = VFSTOUFS(mp)->um_snapinfo;
+       /*
+        * If active, delete from incore list (this snapshot may
+        * already have been in the process of being deleted, so
+        * would not have been active).
+        *
+        * Clear copy-on-write flag if last snapshot.
+        */
+       mutex_enter(&si->si_snaplock);
+       mutex_enter(&si->si_lock);
+       if (is_active_snapshot(si, ip)) {
+               TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
+               if (TAILQ_FIRST(&si->si_snapshots) != 0) {
+                       /* Roll back the list of preallocated blocks. */
+                       xp = TAILQ_LAST(&si->si_snapshots, inodelst);
+                       si->si_snapblklist = xp->i_snapblklist;
+                       si->si_gen++;
+                       mutex_exit(&si->si_lock);
+                       mutex_exit(&si->si_snaplock);
+               } else {
+                       si->si_snapblklist = 0;
+                       si->si_gen++;
+                       mutex_exit(&si->si_lock);
+                       mutex_exit(&si->si_snaplock);
+                       fscow_disestablish(mp, ffs_copyonwrite, devvp);
+               }
+               if (ip->i_snapblklist != NULL) {
+                       free(ip->i_snapblklist, M_UFSMNT);
+                       ip->i_snapblklist = NULL;
+               }
+       } else {
+               mutex_exit(&si->si_lock);
+               mutex_exit(&si->si_snaplock);
+       }
+       /*
+        * Clear all BLK_NOCOPY fields. Pass any block claims to other
+        * snapshots that want them (see ffs_snapblkfree below).
+        */
+       for (blkno = 1; blkno < NDADDR; blkno++) {
+               dblk = db_get(ip, blkno);
+               if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
+                       db_assign(ip, blkno, 0);
+               else if ((dblk == blkstofrags(fs, blkno) &&
+                    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
+                    ip->i_number))) {
+                       DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
+                       db_assign(ip, blkno, 0);
+               }
+       }
+       numblks = howmany(ip->i_size, fs->fs_bsize);
+       for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
+               error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
+                   fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
+               if (error)
+                       continue;
+               if (fs->fs_size - blkno > NINDIR(fs))
+                       last = NINDIR(fs);
+               else
+                       last = fs->fs_size - blkno;
+               for (loc = 0; loc < last; loc++) {
+                       dblk = idb_get(ip, ibp->b_data, loc);
+                       if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
+                               idb_assign(ip, ibp->b_data, loc, 0);
+                       else if (dblk == blkstofrags(fs, blkno) &&
+                           ffs_snapblkfree(fs, ip->i_devvp, dblk,
+                           fs->fs_bsize, ip->i_number)) {
+                               DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
+                               idb_assign(ip, ibp->b_data, loc, 0);
+                       }
+               }
+               bawrite(ibp);
+               UFS_WAPBL_END(mp);
+               error = UFS_WAPBL_BEGIN(mp);
+               KASSERT(error == 0);
+       }
+       /*
+        * Clear snapshot flag and drop reference.
+        */
+       ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
+       DIP_ASSIGN(ip, flags, ip->i_flags);
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+#if defined(QUOTA) || defined(QUOTA2)
+       chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
+       chkiq(ip, 1, l->l_cred, FORCE);
+#endif
+}
+
+/*
+ * Notification that a block is being freed. Return zero if the free
+ * should be allowed to proceed. Return non-zero if the snapshot file
+ * wants to claim the block. The block will be claimed if it is an
+ * uncopied part of one of the snapshots. It will be freed if it is
+ * either a BLK_NOCOPY or has already been copied in all of the snapshots.
+ * If a fragment is being freed, then all snapshots that care about
+ * it must make a copy since a snapshot file can only claim full sized
+ * blocks. Note that if more than one snapshot file maps the block,
+ * we can pick one at random to claim it. Since none of the snapshots
+ * can change, we are assured that they will all see the same unmodified
+ * image. When deleting a snapshot file (see ffs_snapremove above), we
+ * must push any of these claimed blocks to one of the other snapshots
+ * that maps it. These claimed blocks are easily identified as they will
+ * have a block number equal to their logical block number within the
+ * snapshot. A copied block can never have this property because it
+ * must always have been allocated from a BLK_NOCOPY location.
+ */
+int
+ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
+    long size, ino_t inum)
+{
+       struct mount *mp = devvp->v_specmountpoint;
+       struct buf *ibp;
+       struct inode *ip;
+       struct vnode *vp = NULL;
+       struct snap_info *si;
+       void *saved_data = NULL;
+       daddr_t lbn;
+       daddr_t blkno;
+       uint32_t gen;
+       int indiroff = 0, error = 0, claimedblk = 0;
+
+       si = VFSTOUFS(mp)->um_snapinfo;
+       lbn = fragstoblks(fs, bno);
+       mutex_enter(&si->si_snaplock);
+       mutex_enter(&si->si_lock);
+       si->si_owner = curlwp;
+
+retry:
+       gen = si->si_gen;
+       TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
+               vp = ITOV(ip);
+               /*
+                * Look up the block being freed.
+                */
+               if (lbn < NDADDR) {
+                       blkno = db_get(ip, lbn);
+               } else {
+                       mutex_exit(&si->si_lock);
+                       error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
+                           fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
+                       if (error) {
+                               mutex_enter(&si->si_lock);
+                               break;
+                       }
+                       indiroff = (lbn - NDADDR) % NINDIR(fs);
+                       blkno = idb_get(ip, ibp->b_data, indiroff);
+                       mutex_enter(&si->si_lock);
+                       if (gen != si->si_gen) {
+                               brelse(ibp, 0);
+                               goto retry;
+                       }
+               }
+               /*
+                * Check to see if block needs to be copied.
+                */
+               if (blkno == 0) {
+                       /*
+                        * A block that we map is being freed. If it has not
+                        * been claimed yet, we will claim or copy it (below).
+                        */
+                       claimedblk = 1;
+               } else if (blkno == BLK_SNAP) {
+                       /*
+                        * No previous snapshot claimed the block,
+                        * so it will be freed and become a BLK_NOCOPY
+                        * (don't care) for us.
+                        */
+                       if (claimedblk)
+                               panic("snapblkfree: inconsistent block type");
+                       if (lbn < NDADDR) {
+                               db_assign(ip, lbn, BLK_NOCOPY);
+                               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       } else {
+                               idb_assign(ip, ibp->b_data, indiroff,
+                                   BLK_NOCOPY);
+                               mutex_exit(&si->si_lock);
+                               if (ip->i_nlink > 0)
+                                       bwrite(ibp);
+                               else
+                                       bdwrite(ibp);
+                               mutex_enter(&si->si_lock);
+                               if (gen != si->si_gen)
+                                       goto retry;
+                       }
+                       continue;
+               } else /* BLK_NOCOPY or default */ {
+                       /*
+                        * If the snapshot has already copied the block
+                        * (default), or does not care about the block,
+                        * it is not needed.
+                        */
+                       if (lbn >= NDADDR)
+                               brelse(ibp, 0);
+                       continue;
+               }
+               /*
+                * If this is a full size block, we will just grab it
+                * and assign it to the snapshot inode. Otherwise we
+                * will proceed to copy it. See explanation for this
+                * routine as to why only a single snapshot needs to
+                * claim this block.
+                */
+               if (size == fs->fs_bsize) {
+#ifdef DEBUG
+                       if (snapdebug)
+                               printf("%s %llu lbn %" PRId64
+                                   " from inum %llu\n",
+                                   "Grabonremove: snapino",
+                                   (unsigned long long)ip->i_number,
+                                   lbn, (unsigned long long)inum);
+#endif
+                       mutex_exit(&si->si_lock);
+                       if (lbn < NDADDR) {
+                               db_assign(ip, lbn, bno);
+                       } else {
+                               idb_assign(ip, ibp->b_data, indiroff, bno);
+                               if (ip->i_nlink > 0)
+                                       bwrite(ibp);
+                               else
+                                       bdwrite(ibp);
+                       }
+                       DIP_ADD(ip, blocks, btodb(size));
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (ip->i_nlink > 0 && mp->mnt_wapbl)
+                               error = syncsnap(vp);
+                       else
+                               error = 0;
+                       mutex_enter(&si->si_lock);
+                       si->si_owner = NULL;
+                       mutex_exit(&si->si_lock);
+                       mutex_exit(&si->si_snaplock);
+                       return (error == 0);
+               }
+               if (lbn >= NDADDR)
+                       brelse(ibp, 0);
+#ifdef DEBUG
+               if (snapdebug)
+                       printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
+                           "Copyonremove: snapino ",
+                           (unsigned long long)ip->i_number,
+                           lbn, "for inum", (unsigned long long)inum, size);
+#endif
+               /*
+                * If we have already read the old block contents, then
+                * simply copy them to the new block. Note that we need
+                * to synchronously write snapshots that have not been
+                * unlinked, and hence will be visible after a crash,
+                * to ensure their integrity.
+                */
+               mutex_exit(&si->si_lock);
+               if (saved_data == NULL) {
+                       saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+                       error = rwfsblk(vp, B_READ, saved_data, lbn);
+                       if (error) {
+                               free(saved_data, M_UFSMNT);
+                               saved_data = NULL;
+                               mutex_enter(&si->si_lock);
+                               break;
+                       }
+               }
+               error = wrsnapblk(vp, saved_data, lbn);
+               if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
+                       error = syncsnap(vp);
+               mutex_enter(&si->si_lock);
+               if (error)
+                       break;
+               if (gen != si->si_gen)
+                       goto retry;
+       }
+       si->si_owner = NULL;
+       mutex_exit(&si->si_lock);
+       mutex_exit(&si->si_snaplock);
+       if (saved_data)
+               free(saved_data, M_UFSMNT);
+       /*
+        * If we have been unable to allocate a block in which to do
+        * the copy, then return non-zero so that the fragment will
+        * not be freed. Although space will be lost, the snapshot
+        * will stay consistent.
+        */
+       return (error);
+}
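
The whole claim/copy decision above pivots on the snapshot's own block pointer for the freed block: zero, BLK_SNAP, or anything else. A minimal userland sketch of just that three-way classification; blkptr_t and the BLK_* values here are local stand-ins, with the real definitions living in the FFS headers:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t blkptr_t;            /* stand-in for the kernel daddr_t */

    #define BLK_NOCOPY ((blkptr_t)-1)    /* snapshot does not need the block */
    #define BLK_SNAP   ((blkptr_t)-2)    /* per the comment above, will be
                                            freed and demoted to BLK_NOCOPY */

    /*
     * Classify a snapshot's view of a block that is about to be freed:
     *   0        -> not yet claimed; this snapshot will claim or copy it
     *   BLK_SNAP -> becomes BLK_NOCOPY (don't care) for this snapshot
     *   other    -> already copied, or BLK_NOCOPY; nothing to do
     */
    static const char *
    classify(blkptr_t blkno)
    {
        if (blkno == 0)
            return "claim or copy";
        if (blkno == BLK_SNAP)
            return "demote to BLK_NOCOPY";
        return "skip";
    }

    int
    main(void)
    {
        blkptr_t samples[] = { 0, BLK_SNAP, BLK_NOCOPY, 12345 };

        for (int i = 0; i < 4; i++)
            printf("%lld -> %s\n", (long long)samples[i],
                classify(samples[i]));
        return 0;
    }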
+
+/*
+ * Associate snapshot files when mounting.
+ */
+void
+ffs_snapshot_mount(struct mount *mp)
+{
+       struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
+       struct fs *fs = VFSTOUFS(mp)->um_fs;
+       struct lwp *l = curlwp;
+       struct vnode *vp;
+       struct inode *ip, *xp;
+       struct snap_info *si;
+       daddr_t snaplistsize, *snapblklist;
+       int i, error, ns, snaploc, loc;
+
+       /*
+        * No persistent snapshots on apple ufs file systems.
+        */
+       if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
+               return;
+
+       si = VFSTOUFS(mp)->um_snapinfo;
+       ns = UFS_FSNEEDSWAP(fs);
+       /*
+        * XXX The following needs to be set before ffs_truncate or
+        * VOP_READ can be called.
+        */
+       mp->mnt_stat.f_iosize = fs->fs_bsize;
+       /*
+        * Process each snapshot listed in the superblock.
+        */
+       vp = NULL;
+       mutex_enter(&si->si_lock);
+       for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
+               if (fs->fs_snapinum[snaploc] == 0)
+                       break;
+               if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
+                   &vp)) != 0) {
+                       printf("ffs_snapshot_mount: vget failed %d\n", error);
+                       continue;
+               }
+               ip = VTOI(vp);
+               if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
+                   SF_SNAPSHOT) {
+                       printf("ffs_snapshot_mount: non-snapshot inode %d\n",
+                           fs->fs_snapinum[snaploc]);
+                       vput(vp);
+                       vp = NULL;
+                       for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
+                               if (fs->fs_snapinum[loc] == 0)
+                                       break;
+                               fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
+                       }
+                       fs->fs_snapinum[loc - 1] = 0;
+                       snaploc--;
+                       continue;
+               }
+
+               /*
+                * Read the block hints list. Use an empty list on
+                * read errors.
+                */
+               error = vn_rdwr(UIO_READ, vp,
+                   (void *)&snaplistsize, sizeof(snaplistsize),
+                   lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
+                   UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
+                   l->l_cred, NULL, NULL);
+               if (error) {
+                       printf("ffs_snapshot_mount: read_1 failed %d\n", error);
+                       snaplistsize = 1;
+               } else
+                       snaplistsize = ufs_rw64(snaplistsize, ns);
+               snapblklist = malloc(
+                   snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
+               if (error)
+                       snapblklist[0] = 1;
+               else {
+                       error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
+                           snaplistsize * sizeof(daddr_t),
+                           lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
+                           UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
+                           l->l_cred, NULL, NULL);
+                       for (i = 0; i < snaplistsize; i++)
+                               snapblklist[i] = ufs_rw64(snapblklist[i], ns);
+                       if (error) {
+                               printf("ffs_snapshot_mount: read_2 failed %d\n",
+                                   error);
+                               snapblklist[0] = 1;
+                       }
+               }
+               ip->i_snapblklist = &snapblklist[0];
+
+               /*
+                * Link it onto the active snapshot list.
+                */
+               if (is_active_snapshot(si, ip))
+                       panic("ffs_snapshot_mount: %"PRIu64" already on list",
+                           ip->i_number);
+               else
+                       TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
+               vp->v_vflag |= VV_SYSTEM;
+               VOP_UNLOCK(vp);
+       }
+       /*
+        * No usable snapshots found.
+        */
+       if (vp == NULL) {
+               mutex_exit(&si->si_lock);
+               return;
+       }
+       /*
+        * Attach the block hints list. We always want to
+        * use the list from the newest snapshot.
+        */
+       xp = TAILQ_LAST(&si->si_snapshots, inodelst);
+       si->si_snapblklist = xp->i_snapblklist;
+       fscow_establish(mp, ffs_copyonwrite, devvp);
+       si->si_gen++;
+       mutex_exit(&si->si_lock);
+}
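
The hints list read back here sits past the end of the snapshot file as a length-prefixed array of block numbers, byte-swapped on foreign-endian volumes; since both vn_rdwr() calls above start at the same offset, slot 0 of the in-core array ends up holding the element count (and the error path stores 1 there, i.e. an empty list). A hedged userland sketch of reading that layout, with swap64() standing in for ufs_rw64() and a plain FILE replacing the vnode I/O:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for the kernel's ufs_rw64() plus a needswap flag. */
    uint64_t
    swap64(uint64_t v, int needswap)
    {
        return needswap ? __builtin_bswap64(v) : v;
    }

    /*
     * Read a length-prefixed hints list: a 64-bit element count followed
     * by count - 1 block numbers.  Slot 0 of the returned array holds the
     * count itself, mirroring what the kernel code above ends up with.
     */
    int64_t *
    read_hints(FILE *fp, int needswap, uint64_t *countp)
    {
        uint64_t count;
        int64_t *list;

        if (fread(&count, sizeof(count), 1, fp) != 1)
            return NULL;
        count = swap64(count, needswap);
        if (count == 0 || (list = malloc(count * sizeof(*list))) == NULL)
            return NULL;
        list[0] = (int64_t)count;
        for (uint64_t i = 1; i < count; i++) {
            if (fread(&list[i], sizeof(list[i]), 1, fp) != 1) {
                free(list);
                return NULL;
            }
            list[i] = (int64_t)swap64((uint64_t)list[i], needswap);
        }
        *countp = count;
        return list;
    }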
+
+/*
+ * Disassociate snapshot files when unmounting.
+ */
+void
+ffs_snapshot_unmount(struct mount *mp)
+{
+       struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
+       struct inode *xp;
+       struct vnode *vp = NULL;
+       struct snap_info *si;
+
+       si = VFSTOUFS(mp)->um_snapinfo;
+       mutex_enter(&si->si_lock);
+       while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
+               vp = ITOV(xp);
+               TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
+               if (xp->i_snapblklist == si->si_snapblklist)
+                       si->si_snapblklist = NULL;
+               free(xp->i_snapblklist, M_UFSMNT);
+               if (xp->i_nlink > 0) {
+                       si->si_gen++;
+                       mutex_exit(&si->si_lock);
+                       vrele(vp);
+                       mutex_enter(&si->si_lock);
+               }
+       }
+       si->si_gen++;
+       mutex_exit(&si->si_lock);
+       if (vp)
+               fscow_disestablish(mp, ffs_copyonwrite, devvp);
+}
+
+/*
+ * Check for need to copy block that is about to be written,
+ * copying the block if necessary.
+ */
+static int
+ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
+{
+       struct fs *fs;
+       struct inode *ip;
+       struct vnode *devvp = v, *vp = NULL;
+       struct mount *mp = devvp->v_specmountpoint;
+       struct snap_info *si;
+       void *saved_data = NULL;
+       daddr_t lbn, blkno, *snapblklist;
+       uint32_t gen;
+       int lower, upper, mid, snapshot_locked = 0, error = 0;
+
+       /*
+        * Check for valid snapshots.
+        */
+       si = VFSTOUFS(mp)->um_snapinfo;
+       mutex_enter(&si->si_lock);
+       ip = TAILQ_FIRST(&si->si_snapshots);
+       if (ip == NULL) {
+               mutex_exit(&si->si_lock);
+               return 0;
+       }
+       /*
+        * First check to see if the block is past the end of the file
+        * system, in the journal, or in the preallocated list.
+        * By doing these checks we avoid several potential deadlocks.
+        */
+       fs = ip->i_fs;
+       lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
+       if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
+               mutex_exit(&si->si_lock);
+               return 0;
+       }
+       if ((fs->fs_flags & FS_DOWAPBL) &&
+           fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
+               off_t blk_off, log_start, log_end;
+
+               log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
+                   fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+               log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
+                   fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+               blk_off = dbtob(bp->b_blkno);
+               if (blk_off >= log_start && blk_off < log_end) {
+                       mutex_exit(&si->si_lock);
+                       return 0;
+               }
+       }
+       snapblklist = si->si_snapblklist;
+       upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
+       lower = 1;
+       while (lower <= upper) {
+               mid = (lower + upper) / 2;
+               if (snapblklist[mid] == lbn)
+                       break;
+               if (snapblklist[mid] < lbn)
+                       lower = mid + 1;
+               else
+                       upper = mid - 1;
+       }
+       if (lower <= upper) {
+               mutex_exit(&si->si_lock);
+               return 0;
+       }
+       /*
+        * Not in the precomputed list, so check the snapshots.
+        */
+       if (si->si_owner != curlwp) {
+               if (!mutex_tryenter(&si->si_snaplock)) {
+                       mutex_exit(&si->si_lock);
+                       mutex_enter(&si->si_snaplock);
+                       mutex_enter(&si->si_lock);
+               }
+               si->si_owner = curlwp;
+               snapshot_locked = 1;
+       }
+       if (data_valid && bp->b_bcount == fs->fs_bsize)
+               saved_data = bp->b_data;
+retry:
+       gen = si->si_gen;
+       TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
+               vp = ITOV(ip);
+               /*
+                * We ensure that everything of our own that needs to be
+                * copied will be done at the time that ffs_snapshot is
+                * called. Thus we can skip the check here which can
+                * deadlock in doing the lookup in ffs_balloc.
+                */
+               if (bp->b_vp == vp)
+                       continue;
+               /*
+                * Check to see if block needs to be copied.
+                */
+               if (lbn < NDADDR) {
+                       blkno = db_get(ip, lbn);
+               } else {
+                       mutex_exit(&si->si_lock);
+                       blkno = 0; /* XXX: GCC */
+                       if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
+                               mutex_enter(&si->si_lock);
+                               break;
+                       }
+                       mutex_enter(&si->si_lock);
+                       if (gen != si->si_gen)
+                               goto retry;
+               }
+#ifdef DIAGNOSTIC
+               if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
+                       panic("ffs_copyonwrite: bad copy block");
+#endif
+               if (blkno != 0)
+                       continue;
+
+               if (curlwp == uvm.pagedaemon_lwp) {
+                       error = ENOMEM;
+                       break;
+               }
+               /* Only one level of recursion allowed. */
+               KASSERT(snapshot_locked);
+               /*
+                * Allocate the block into which to do the copy. Since
+                * multiple processes may all try to copy the same block,
+                * we have to recheck our need to do a copy if we sleep
+                * waiting for the lock.
+                *
+                * Because all snapshots on a filesystem share a single
+                * lock, we ensure that we will never be in competition
+                * with another process to allocate a block.
+                */
+#ifdef DEBUG
+               if (snapdebug) {
+                       printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
+                           (unsigned long long)ip->i_number, lbn);
+                       if (bp->b_vp == devvp)
+                               printf("fs metadata");
+                       else
+                               printf("inum %llu", (unsigned long long)
+                                   VTOI(bp->b_vp)->i_number);
+                       printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
+               }
+#endif
+               /*
+                * If we have already read the old block contents, then
+                * simply copy them to the new block. Note that we need
+                * to synchronously write snapshots that have not been
+                * unlinked, and hence will be visible after a crash,
+                * to ensure their integrity.
+                */
+               mutex_exit(&si->si_lock);
+               if (saved_data == NULL) {
+                       saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
+                       error = rwfsblk(vp, B_READ, saved_data, lbn);
+                       if (error) {
+                               free(saved_data, M_UFSMNT);
+                               saved_data = NULL;
+                               mutex_enter(&si->si_lock);
+                               break;
+                       }
+               }
+               error = wrsnapblk(vp, saved_data, lbn);
+               if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
+                       error = syncsnap(vp);
+               mutex_enter(&si->si_lock);
+               if (error)
+                       break;
+               if (gen != si->si_gen)
+                       goto retry;
+       }
+       /*
+        * Note that we need to synchronously write snapshots that
+        * have not been unlinked, and hence will be visible after
+        * a crash, to ensure their integrity.
+        */
+       if (snapshot_locked) {
+               si->si_owner = NULL;
+               mutex_exit(&si->si_lock);
+               mutex_exit(&si->si_snaplock);
+       } else
+               mutex_exit(&si->si_lock);
+       if (saved_data && saved_data != bp->b_data)
+               free(saved_data, M_UFSMNT);
+       return error;
+}
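
The cheap path through ffs_copyonwrite() is the binary search over si_snapblklist near the top of the function: the list is sorted ascending and effectively 1-based, with element 0 holding its own length. The same search as a standalone sketch (all names are local to the example):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * The preallocated-block list is sorted ascending and 1-based:
     * list[0] is the number of elements (itself included), so the search
     * covers list[1] .. list[list[0] - 1], exactly as in the loop above.
     */
    static bool
    in_snapblklist(const int64_t *list, int64_t lbn)
    {
        int lower = 1;
        int upper = (list != NULL ? (int)list[0] - 1 : 0);

        while (lower <= upper) {
            int mid = (lower + upper) / 2;
            if (list[mid] == lbn)
                return true;        /* preallocated: no copy needed */
            if (list[mid] < lbn)
                lower = mid + 1;
            else
                upper = mid - 1;
        }
        return false;
    }

    int
    main(void)
    {
        int64_t list[] = { 5, 10, 20, 30, 40 };  /* count, then 4 blocks */

        printf("%d %d\n", in_snapblklist(list, 20),
            in_snapblklist(list, 25));           /* prints "1 0" */
        return 0;
    }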
+
+/*
+ * Read from a snapshot.
+ */
+int
+ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
+{
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
+       struct buf *bp;
+       daddr_t lbn, nextlbn;
+       off_t fsbytes, bytesinfile;
+       long size, xfersize, blkoffset;
+       int error;
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+       mutex_enter(&si->si_snaplock);
+
+       if (ioflag & IO_ALTSEMANTICS)
+               fsbytes = ip->i_size;
+       else
+               fsbytes = lfragtosize(fs, fs->fs_size);
+       for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+               bytesinfile = fsbytes - uio->uio_offset;
+               if (bytesinfile <= 0)
+                       break;
+               lbn = lblkno(fs, uio->uio_offset);
+               nextlbn = lbn + 1;
+               size = fs->fs_bsize;
+               blkoffset = blkoff(fs, uio->uio_offset);
+               xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
+                   bytesinfile);
+
+               if (lblktosize(fs, nextlbn + 1) >= fsbytes) {
+                       if (lblktosize(fs, lbn) + size > fsbytes)
+                               size = fragroundup(fs,
+                                   fsbytes - lblktosize(fs, lbn));
+                       error = bread(vp, lbn, size, NOCRED, 0, &bp);
+               } else {
+                       int nextsize = fs->fs_bsize;
+                       error = breadn(vp, lbn,
+                           size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+               }
+               if (error)
+                       break;
+
+               /*
+                * We should only get non-zero b_resid when an I/O error
+                * has occurred, which should cause us to break above.
+                * However, if the short read did not cause an error,
+                * then we want to ensure that we do not uiomove bad
+                * or uninitialized data.
+                */
+               size -= bp->b_resid;
+               if (size < blkoffset + xfersize) {
+                       xfersize = size - blkoffset;
+                       if (xfersize <= 0)
+                               break;
+               }
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+               if (error)
+                       break;
+               brelse(bp, BC_AGE);
+       }
+       if (bp != NULL)
+               brelse(bp, BC_AGE);
+
+       mutex_exit(&si->si_snaplock);
+       fstrans_done(vp->v_mount);
+       return error;
+}
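
The transfer size in each pass of the read loop above is clamped three ways: to what remains of the current block past the offset, to what the caller still wants, and to what remains of the file. A tiny arithmetic sketch with made-up values:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        long bsize = 32768;        /* fs->fs_bsize */
        long blkoffset = 5000;     /* offset within the current block */
        long resid = 100000;       /* uio->uio_resid, bytes still wanted */
        long bytesinfile = 20000;  /* fsbytes - uio->uio_offset */

        /* Same triple clamp as the loop above. */
        long xfersize = MIN(MIN(bsize - blkoffset, resid), bytesinfile);

        printf("xfersize = %ld\n", xfersize);    /* 20000 here */
        return 0;
    }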
+
+/*
+ * Look up a snapshot's data block address.
+ * Simpler than UFS_BALLOC(), as we know all metadata is already allocated;
+ * safe even for the pagedaemon, where we cannot bread().
+ */
+static int
+snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
+{
+       struct indir indirs[NIADDR + 2];
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct buf *bp;
+       int error, num;
+
+       KASSERT(lbn >= 0);
+
+       if (lbn < NDADDR) {
+               *res = db_get(ip, lbn);
+               return 0;
+       }
+       if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
+               return error;
+       if (curlwp == uvm.pagedaemon_lwp) {
+               mutex_enter(&bufcache_lock);
+               bp = incore(vp, indirs[num-1].in_lbn);
+               if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
+                       *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
+                       error = 0;
+               } else
+                       error = ENOMEM;
+               mutex_exit(&bufcache_lock);
+               return error;
+       }
+       error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
+       if (error == 0)
+               *res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
+       brelse(bp, 0);
+
+       return error;
+}
+
+/*
+ * Read or write the specified block of the filesystem vp resides on
+ * from or to the disk, bypassing the buffer cache.
+ */
+static int
+rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
+{
+       int error;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct buf *nbp;
+
+       nbp = getiobuf(NULL, true);
+       nbp->b_flags = flags;
+       nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
+       nbp->b_error = 0;
+       nbp->b_data = data;
+       nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
+       nbp->b_proc = NULL;
+       nbp->b_dev = ip->i_devvp->v_rdev;
+       SET(nbp->b_cflags, BC_BUSY);    /* mark buffer busy */
+
+       bdev_strategy(nbp);
+
+       error = biowait(nbp);
+
+       putiobuf(nbp);
+
+       return error;
+}
+
+/*
+ * Write all dirty buffers to disk and invalidate them.
+ */
+static int
+syncsnap(struct vnode *vp)
+{
+       int error;
+       buf_t *bp;
+       struct fs *fs = VTOI(vp)->i_fs;
+
+       mutex_enter(&bufcache_lock);
+       while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
+               error = bbusy(bp, false, 0, NULL);
+               if (error == EPASSTHROUGH)
+                       continue;
+               else if (error != 0) {
+                       mutex_exit(&bufcache_lock);
+                       return error;
+               }
+               KASSERT(bp->b_bcount == fs->fs_bsize);
+               mutex_exit(&bufcache_lock);
+               error = rwfsblk(vp, B_WRITE, bp->b_data,
+                   fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
+               brelse(bp, BC_INVAL | BC_VFLUSH);
+               if (error)
+                       return error;
+               mutex_enter(&bufcache_lock);
+       }
+       mutex_exit(&bufcache_lock);
+
+       return 0;
+}
+
+/*
+ * Write the specified block to a snapshot.
+ */
+static int
+wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
+{
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       struct buf *bp;
+       int error;
+
+       error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
+           FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
+       if (error)
+               return error;
+       memcpy(bp->b_data, data, fs->fs_bsize);
+       if (ip->i_nlink > 0)
+               error = bwrite(bp);
+       else
+               bawrite(bp);
+
+       return error;
+}
+
+/*
+ * Check if this inode is present on the active snapshot list.
+ * Must be called with snapinfo locked.
+ */
+static inline bool
+is_active_snapshot(struct snap_info *si, struct inode *ip)
+{
+       struct inode *xp;
+
+       KASSERT(mutex_owned(&si->si_lock));
+
+       TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
+               if (xp == ip)
+                       return true;
+       return false;
+}
+
+/*
+ * Get/put a direct block from the inode or from a buffer containing disk
+ * addresses. They handle fs type (UFS1/UFS2) and byte swapping; they should go
+ * into a global include.
+ */
+static inline daddr_t
+db_get(struct inode *ip, int loc)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
+       else
+               return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+db_assign(struct inode *ip, int loc, daddr_t val)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+       else
+               ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
+
+static inline daddr_t
+ib_get(struct inode *ip, int loc)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
+       else
+               return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+ib_assign(struct inode *ip, int loc, daddr_t val)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+       else
+               ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
+
+static inline daddr_t
+idb_get(struct inode *ip, void *bf, int loc)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
+       else
+               return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
+}
+
+static inline void
+idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
+{
+       if (ip->i_ump->um_fstype == UFS1)
+               ((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
+       else
+               ((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
+}
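
These accessors paper over the two on-disk pointer widths: UFS1 keeps 32-bit disk addresses, UFS2 64-bit ones, with optional byte swapping on top. A userland sketch of the same dispatch, where is_ufs2 and needswap are plain flags standing in for the inode fields the real accessors consult:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Sketch of the idb_get() idea: one raw buffer of disk addresses,
     * read as 32-bit (UFS1) or 64-bit (UFS2) entries, byte-swapped on
     * demand.
     */
    static int64_t
    idb_get_sketch(const void *bf, int loc, int is_ufs2, int needswap)
    {
        if (!is_ufs2) {
            uint32_t v = ((const uint32_t *)bf)[loc];
            return (int32_t)(needswap ? __builtin_bswap32(v) : v);
        } else {
            uint64_t v = ((const uint64_t *)bf)[loc];
            return (int64_t)(needswap ? __builtin_bswap64(v) : v);
        }
    }

    int
    main(void)
    {
        uint32_t ufs1buf[] = { 7, 8, 9 };
        uint64_t ufs2buf[] = { 7, 8, 9 };

        printf("%lld %lld\n",
            (long long)idb_get_sketch(ufs1buf, 1, 0, 0),    /* 8 */
            (long long)idb_get_sketch(ufs2buf, 2, 1, 0));   /* 9 */
        return 0;
    }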
diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c
new file mode 100644 (file)
index 0000000..6b68403
--- /dev/null
@@ -0,0 +1,371 @@
+/*     $NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $   */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_subr.c  8.5 (Berkeley) 3/21/95
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.47 2011/08/14 12:37:09 christos Exp $");
+
+#include <sys/param.h>
+
+/* in ffs_tables.c */
+extern const int inside[], around[];
+extern const u_char * const fragtbl[];
+
+#ifndef _KERNEL
+#define FFS_EI /* always include byte-swapped filesystem support */
+#endif
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+#ifndef _KERNEL
+#include <ufs/ufs/dinode.h>
+void    panic(const char *, ...)
+    __attribute__((__noreturn__,__format__(__printf__,1,2)));
+
+#else  /* _KERNEL */
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/inttypes.h>
+#include <sys/pool.h>
+#include <sys/fstrans.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+/*
+ * Load up the contents of an inode and copy the appropriate pieces
+ * to the incore copy.
+ */
+void
+ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
+{
+       struct ufs1_dinode *dp1;
+       struct ufs2_dinode *dp2;
+
+       if (ip->i_ump->um_fstype == UFS1) {
+               dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
+#ifdef FFS_EI
+               if (UFS_FSNEEDSWAP(fs))
+                       ffs_dinode1_swap(dp1, ip->i_din.ffs1_din);
+               else
+#endif
+               *ip->i_din.ffs1_din = *dp1;
+
+               ip->i_mode = ip->i_ffs1_mode;
+               ip->i_nlink = ip->i_ffs1_nlink;
+               ip->i_size = ip->i_ffs1_size;
+               ip->i_flags = ip->i_ffs1_flags;
+               ip->i_gen = ip->i_ffs1_gen;
+               ip->i_uid = ip->i_ffs1_uid;
+               ip->i_gid = ip->i_ffs1_gid;
+       } else {
+               dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
+#ifdef FFS_EI
+               if (UFS_FSNEEDSWAP(fs))
+                       ffs_dinode2_swap(dp2, ip->i_din.ffs2_din);
+               else
+#endif
+               *ip->i_din.ffs2_din = *dp2;
+
+               ip->i_mode = ip->i_ffs2_mode;
+               ip->i_nlink = ip->i_ffs2_nlink;
+               ip->i_size = ip->i_ffs2_size;
+               ip->i_flags = ip->i_ffs2_flags;
+               ip->i_gen = ip->i_ffs2_gen;
+               ip->i_uid = ip->i_ffs2_uid;
+               ip->i_gid = ip->i_ffs2_gid;
+       }
+}
+
+int
+ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size,
+    bool clearbuf, buf_t **bpp)
+{
+       int error = 0;
+
+       KASSERT(blkno >= 0 || blkno == FFS_NOBLK);
+
+       if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL)
+               return ENOMEM;
+       if (blkno != FFS_NOBLK)
+               (*bpp)->b_blkno = blkno;
+       if (clearbuf)
+               clrbuf(*bpp);
+       if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0)
+               brelse(*bpp, BC_INVAL);
+       return error;
+}
+
+#endif /* _KERNEL */
+
+/*
+ * Update the frsum fields to reflect addition or deletion
+ * of some frags.
+ */
+void
+ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt,
+    int needswap)
+{
+       int inblk;
+       int field, subfield;
+       int siz, pos;
+
+       inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
+       fragmap <<= 1;
+       for (siz = 1; siz < fs->fs_frag; siz++) {
+               if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0)
+                       continue;
+               field = around[siz];
+               subfield = inside[siz];
+               for (pos = siz; pos <= fs->fs_frag; pos++) {
+                       if ((fragmap & field) == subfield) {
+                               fraglist[siz] = ufs_rw32(
+                                   ufs_rw32(fraglist[siz], needswap) + cnt,
+                                   needswap);
+                               pos += siz;
+                               field <<= siz;
+                               subfield <<= siz;
+                       }
+                       field <<= 1;
+                       subfield <<= 1;
+               }
+       }
+}
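
ffs_fragacct() reaches its answer through the around[]/inside[] bit patterns, but what it effectively maintains is simple: for each maximal run of free fragments shorter than a full block, fraglist[run length] changes by cnt. A naive restatement that walks the runs directly; it is illustrative only and does not reproduce the table-driven scan:

    #include <stdio.h>

    /*
     * For a fragment map with fs_frag bits (1 = free), add "cnt" to
     * fraglist[len] for every maximal free run of len fragments,
     * excluding runs spanning the whole block (those count as free
     * blocks, not fragments).
     */
    static void
    fragacct_naive(int fs_frag, int fragmap, int fraglist[], int cnt)
    {
        int run = 0;

        for (int pos = 0; pos <= fs_frag; pos++) {
            if (pos < fs_frag && (fragmap & (1 << pos))) {
                run++;
                continue;
            }
            if (run > 0 && run < fs_frag)
                fraglist[run] += cnt;
            run = 0;
        }
    }

    int
    main(void)
    {
        int fraglist[9] = { 0 };

        fragacct_naive(8, 0x5c, fraglist, 1);    /* frags 2-4 and 6 free */
        for (int i = 1; i < 8; i++)
            if (fraglist[i] != 0)
                printf("runs of %d: %d\n", i, fraglist[i]);
        return 0;
    }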
+
+/*
+ * block operations
+ *
+ * check if a block is available
+ *  returns true if all the corresponding bits in the free map are 1
+ *  returns false if any corresponding bit in the free map is 0
+ */
+int
+ffs_isblock(struct fs *fs, u_char *cp, int32_t h)
+{
+       u_char mask;
+
+       switch ((int)fs->fs_fragshift) {
+       case 3:
+               return (cp[h] == 0xff);
+       case 2:
+               mask = 0x0f << ((h & 0x1) << 2);
+               return ((cp[h >> 1] & mask) == mask);
+       case 1:
+               mask = 0x03 << ((h & 0x3) << 1);
+               return ((cp[h >> 2] & mask) == mask);
+       case 0:
+               mask = 0x01 << (h & 0x7);
+               return ((cp[h >> 3] & mask) == mask);
+       default:
+               panic("ffs_isblock: unknown fs_fragshift %d",
+                   (int)fs->fs_fragshift);
+       }
+}
+
+/*
+ * check if a block is completely allocated
+ *  returns true if all the corresponding bits in the free map are 0
+ *  returns false if any corresponding bit in the free map is 1
+ */
+int
+ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+       switch ((int)fs->fs_fragshift) {
+       case 3:
+               return (cp[h] == 0);
+       case 2:
+               return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
+       case 1:
+               return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
+       case 0:
+               return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
+       default:
+               panic("ffs_isfreeblock: unknown fs_fragshift %d",
+                   (int)fs->fs_fragshift);
+       }
+}
+
+/*
+ * take a block out of the map
+ */
+void
+ffs_clrblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+       switch ((int)fs->fs_fragshift) {
+       case 3:
+               cp[h] = 0;
+               return;
+       case 2:
+               cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
+               return;
+       case 1:
+               cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
+               return;
+       case 0:
+               cp[h >> 3] &= ~(0x01 << (h & 0x7));
+               return;
+       default:
+               panic("ffs_clrblock: unknown fs_fragshift %d",
+                   (int)fs->fs_fragshift);
+       }
+}
+
+/*
+ * put a block into the map
+ */
+void
+ffs_setblock(struct fs *fs, u_char *cp, int32_t h)
+{
+
+       switch ((int)fs->fs_fragshift) {
+       case 3:
+               cp[h] = 0xff;
+               return;
+       case 2:
+               cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
+               return;
+       case 1:
+               cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
+               return;
+       case 0:
+               cp[h >> 3] |= (0x01 << (h & 0x7));
+               return;
+       default:
+               panic("ffs_setblock: unknown fs_fragshift %d",
+                   (int)fs->fs_fragshift);
+       }
+}
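
The four cases in ffs_isblock(), ffs_clrblock() and ffs_setblock() all apply one rule: a block of 2^fragshift fragments occupies 2^fragshift consecutive bits of the free map. A generic sketch that computes the same masks the switch statements spell out case by case:

    #include <stdio.h>

    typedef unsigned char u_char;

    /*
     * Generic form of the per-fragshift masks: locate the map byte for
     * block h, then build a mask of 2^fragshift consecutive bits.
     */
    static int
    isblock_generic(int fragshift, const u_char *cp, int h)
    {
        int frag = 1 << fragshift;       /* fragments per block */
        int perbyte = 8 >> fragshift;    /* blocks per map byte */
        u_char mask = ((1 << frag) - 1) << ((h % perbyte) * frag);

        return (cp[h / perbyte] & mask) == mask;
    }

    int
    main(void)
    {
        u_char map[] = { 0xf0 };    /* fragshift 2: block 1 free, block 0 not */

        printf("%d %d\n", isblock_generic(2, map, 0),
            isblock_generic(2, map, 1));         /* prints "0 1" */
        return 0;
    }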
+
+/*
+ * Update the cluster map because of an allocation or free.
+ *
+ * Cnt == 1 means free; cnt == -1 means allocating.
+ */
+void
+ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
+{
+       int32_t *sump;
+       int32_t *lp;
+       u_char *freemapp, *mapp;
+       int i, start, end, forw, back, map, bit;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       /* KASSERT(mutex_owned(&ump->um_lock)); */
+
+       if (fs->fs_contigsumsize <= 0)
+               return;
+       freemapp = cg_clustersfree(cgp, needswap);
+       sump = cg_clustersum(cgp, needswap);
+       /*
+        * Allocate or clear the actual block.
+        */
+       if (cnt > 0)
+               setbit(freemapp, blkno);
+       else
+               clrbit(freemapp, blkno);
+       /*
+        * Find the size of the cluster going forward.
+        */
+       start = blkno + 1;
+       end = start + fs->fs_contigsumsize;
+       if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
+               end = ufs_rw32(cgp->cg_nclusterblks, needswap);
+       mapp = &freemapp[start / NBBY];
+       map = *mapp++;
+       bit = 1 << (start % NBBY);
+       for (i = start; i < end; i++) {
+               if ((map & bit) == 0)
+                       break;
+               if ((i & (NBBY - 1)) != (NBBY - 1)) {
+                       bit <<= 1;
+               } else {
+                       map = *mapp++;
+                       bit = 1;
+               }
+       }
+       forw = i - start;
+       /*
+        * Find the size of the cluster going backward.
+        */
+       start = blkno - 1;
+       end = start - fs->fs_contigsumsize;
+       if (end < 0)
+               end = -1;
+       mapp = &freemapp[start / NBBY];
+       map = *mapp--;
+       bit = 1 << (start % NBBY);
+       for (i = start; i > end; i--) {
+               if ((map & bit) == 0)
+                       break;
+               if ((i & (NBBY - 1)) != 0) {
+                       bit >>= 1;
+               } else {
+                       map = *mapp--;
+                       bit = 1 << (NBBY - 1);
+               }
+       }
+       back = start - i;
+       /*
+        * Account for old cluster and the possibly new forward and
+        * back clusters.
+        */
+       i = back + forw + 1;
+       if (i > fs->fs_contigsumsize)
+               i = fs->fs_contigsumsize;
+       ufs_add32(sump[i], cnt, needswap);
+       if (back > 0)
+               ufs_add32(sump[back], -cnt, needswap);
+       if (forw > 0)
+               ufs_add32(sump[forw], -cnt, needswap);
+
+       /*
+        * Update cluster summary information.
+        */
+       lp = &sump[fs->fs_contigsumsize];
+       for (i = fs->fs_contigsumsize; i > 0; i--)
+               if (ufs_rw32(*lp--, needswap) > 0)
+                       break;
+#if defined(_KERNEL)
+       fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
+#endif
+}
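
The summary update at the tail of ffs_clusteracct() is the subtle part: freeing a block (cnt == 1) merges the runs before and after it into one run of back + forw + 1 blocks, so the two old run sizes each lose an entry and the merged size gains one; allocating is the exact inverse. Isolated as a sketch, with maxcontig standing in for fs->fs_contigsumsize:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Freeing a block merges its neighbouring runs: the old run sizes
     * each lose an entry, the merged size (clamped to maxcontig) gains
     * one.  cnt == -1 performs the inverse for an allocation.
     */
    static void
    cluster_sum_update(int32_t *sump, int maxcontig, int back, int forw,
        int cnt)
    {
        int i = back + forw + 1;

        if (i > maxcontig)
            i = maxcontig;
        sump[i] += cnt;
        if (back > 0)
            sump[back] -= cnt;
        if (forw > 0)
            sump[forw] -= cnt;
    }

    int
    main(void)
    {
        int32_t sump[8] = { 0, 2, 1, 0, 0, 0, 0, 0 };

        /* Free a block between a 1-run and a 2-run: they merge into a 4-run. */
        cluster_sum_update(sump, 7, 1, 2, 1);
        for (int i = 1; i <= 7; i++)
            printf("sump[%d] = %d\n", i, sump[i]);
        return 0;
    }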
diff --git a/sys/ufs/ffs/ffs_tables.c b/sys/ufs/ffs/ffs_tables.c
new file mode 100644 (file)
index 0000000..29f4542
--- /dev/null
@@ -0,0 +1,141 @@
+/*     $NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $  */
+
+/*
+ * Copyright (c) 1982, 1986, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_tables.c        8.1 (Berkeley) 6/11/93
+ */
+
+#if HAVE_NBTOOL_CONFIG_H
+#include "nbtool_config.h"
+#endif
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_tables.c,v 1.9 2005/12/11 12:25:25 christos Exp $");
+
+#include <sys/param.h>
+
+/*
+ * Bit patterns for identifying fragments in the block map
+ * used as ((map & around) == inside)
+ */
+const int around[9] = {
+       0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+};
+const int inside[9] = {
+       0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+};
+
+/*
+ * Given a block map bit pattern, the frag tables tell whether a
+ * particular size fragment is available.
+ *
+ * used as:
+ * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map]) {
+ *     at least one fragment of the indicated size is available
+ * }
+ *
+ * These tables are used by the scanc instruction on the VAX to
+ * quickly find an appropriate fragment.
+ */
+const u_char fragtbl124[256] = {
+       0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e,
+       0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e,
+       0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae,
+       0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
+       0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e,
+       0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
+       0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e,
+       0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe,
+       0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e,
+       0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce,
+       0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce,
+       0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a,
+};
+
+const u_char fragtbl8[256] = {
+       0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04,
+       0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+       0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+       0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+       0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+       0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
+       0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
+       0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11,
+       0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05,
+       0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09,
+       0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
+       0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21,
+       0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06,
+       0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a,
+       0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07,
+       0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12,
+       0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04,
+       0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c,
+       0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c,
+       0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80,
+};
+
+/*
+ * The actual fragtbl array.
+ */
+const u_char * const fragtbl[MAXFRAG + 1] = {
+       0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8,
+};
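
Putting the tables to use, per the comment above: bit (size - 1) of fragtbl[fs_frag][map] is set exactly when the map contains a maximal free run of size fragments. A sketch assuming the fragtbl8 table above is compiled in; the map value is made up:

    #include <stdio.h>

    typedef unsigned char u_char;

    extern const u_char fragtbl8[256];    /* the 8-fragment table above */

    /* Bit (size - 1) set == a maximal free run of exactly that size. */
    static int
    frag_available(u_char map, int size)
    {
        return (1 << (size - 1)) & fragtbl8[map];
    }

    int
    main(void)
    {
        /* 0x3c: fragments 2..5 free, i.e. one run of 4 */
        for (int size = 1; size <= 8; size++)
            printf("size %d: %s\n", size,
                frag_available(0x3c, size) ? "yes" : "no");
        return 0;
    }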
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
new file mode 100644 (file)
index 0000000..28bbe32
--- /dev/null
@@ -0,0 +1,2144 @@
+/*     $NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $ */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc, and by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1989, 1991, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_vfsops.c        8.31 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.271 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/disk.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, ffs, NULL);
+
+static int     ffs_vfs_fsync(vnode_t *, int);
+
+static struct sysctllog *ffs_sysctl_log;
+
+/* how many times ffs_init() was called */
+int ffs_initcount = 0;
+
+extern const struct vnodeopv_desc ffs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc ffs_specop_opv_desc;
+extern const struct vnodeopv_desc ffs_fifoop_opv_desc;
+
+const struct vnodeopv_desc * const ffs_vnodeopv_descs[] = {
+       &ffs_vnodeop_opv_desc,
+       &ffs_specop_opv_desc,
+       &ffs_fifoop_opv_desc,
+       NULL,
+};
+
+struct vfsops ffs_vfsops = {
+       MOUNT_FFS,
+       sizeof (struct ufs_args),
+       ffs_mount,
+       ufs_start,
+       ffs_unmount,
+       ufs_root,
+       ufs_quotactl,
+       ffs_statvfs,
+       ffs_sync,
+       ffs_vget,
+       ffs_fhtovp,
+       ffs_vptofh,
+       ffs_init,
+       ffs_reinit,
+       ffs_done,
+       ffs_mountroot,
+       ffs_snapshot,
+       ffs_extattrctl,
+       ffs_suspendctl,
+       genfs_renamelock_enter,
+       genfs_renamelock_exit,
+       ffs_vfs_fsync,
+       ffs_vnodeopv_descs,
+       0,
+       { NULL, NULL },
+};
+
+static const struct genfs_ops ffs_genfsops = {
+       .gop_size = ffs_gop_size,
+       .gop_alloc = ufs_gop_alloc,
+       .gop_write = genfs_gop_write,
+       .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops ffs_ufsops = {
+       .uo_itimes = ffs_itimes,
+       .uo_update = ffs_update,
+       .uo_truncate = ffs_truncate,
+       .uo_valloc = ffs_valloc,
+       .uo_vfree = ffs_vfree,
+       .uo_balloc = ffs_balloc,
+       .uo_unmark_vnode = (void (*)(vnode_t *))nullop,
+};
+
+static int
+ffs_modcmd(modcmd_t cmd, void *arg)
+{
+       int error;
+
+#if 0
+       extern int doasyncfree;
+#endif
+#ifdef UFS_EXTATTR
+       extern int ufs_extattr_autocreate;
+#endif
+       extern int ffs_log_changeopt;
+
+       switch (cmd) {
+       case MODULE_CMD_INIT:
+               error = vfs_attach(&ffs_vfsops);
+               if (error != 0)
+                       break;
+
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT,
+                              CTLTYPE_NODE, "vfs", NULL,
+                              NULL, 0, NULL, 0,
+                              CTL_VFS, CTL_EOL);
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT,
+                              CTLTYPE_NODE, "ffs",
+                              SYSCTL_DESCR("Berkeley Fast File System"),
+                              NULL, 0, NULL, 0,
+                              CTL_VFS, 1, CTL_EOL);
+               /*
+                * @@@ should we even bother with these first three?
+                */
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "doclusterread", NULL,
+                              sysctl_notavail, 0, NULL, 0,
+                              CTL_VFS, 1, FFS_CLUSTERREAD, CTL_EOL);
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "doclusterwrite", NULL,
+                              sysctl_notavail, 0, NULL, 0,
+                              CTL_VFS, 1, FFS_CLUSTERWRITE, CTL_EOL);
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "doreallocblks", NULL,
+                              sysctl_notavail, 0, NULL, 0,
+                              CTL_VFS, 1, FFS_REALLOCBLKS, CTL_EOL);
+#if 0
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "doasyncfree",
+                              SYSCTL_DESCR("Release dirty blocks asynchronously"),
+                              NULL, 0, &doasyncfree, 0,
+                              CTL_VFS, 1, FFS_ASYNCFREE, CTL_EOL);
+#endif
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "log_changeopt",
+                              SYSCTL_DESCR("Log changes in optimization strategy"),
+                              NULL, 0, &ffs_log_changeopt, 0,
+                              CTL_VFS, 1, FFS_LOG_CHANGEOPT, CTL_EOL);
+#ifdef UFS_EXTATTR
+               sysctl_createv(&ffs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, "extattr_autocreate",
+                              SYSCTL_DESCR("Size of attribute for "
+                                           "backing file autocreation"),
+                              NULL, 0, &ufs_extattr_autocreate, 0,
+                              CTL_VFS, 1, FFS_EXTATTR_AUTOCREATE, CTL_EOL);
+
+#endif /* UFS_EXTATTR */
+
+               break;
+       case MODULE_CMD_FINI:
+               error = vfs_detach(&ffs_vfsops);
+               if (error != 0)
+                       break;
+               sysctl_teardown(&ffs_sysctl_log);
+               break;
+       default:
+               error = ENOTTY;
+               break;
+       }
+
+       return (error);
+}
+
+pool_cache_t ffs_inode_cache;
+pool_cache_t ffs_dinode1_cache;
+pool_cache_t ffs_dinode2_cache;
+
+static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, daddr_t);
+static void ffs_oldfscompat_write(struct fs *, struct ufsmount *);
+
+/*
+ * Called by main() when ffs is going to be mounted as root.
+ */
+
+int
+ffs_mountroot(void)
+{
+       struct fs *fs;
+       struct mount *mp;
+       struct lwp *l = curlwp;                 /* XXX */
+       struct ufsmount *ump;
+       int error;
+
+       if (device_class(root_device) != DV_DISK)
+               return (ENODEV);
+
+       if ((error = vfs_rootmountalloc(MOUNT_FFS, "root_device", &mp))) {
+               vrele(rootvp);
+               return (error);
+       }
+
+       /*
+        * We always need to be able to mount the root file system.
+        */
+       mp->mnt_flag |= MNT_FORCE;
+       if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
+               vfs_unbusy(mp, false, NULL);
+               vfs_destroy(mp);
+               return (error);
+       }
+       mp->mnt_flag &= ~MNT_FORCE;
+       mutex_enter(&mountlist_lock);
+       CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+       mutex_exit(&mountlist_lock);
+       ump = VFSTOUFS(mp);
+       fs = ump->um_fs;
+       memset(fs->fs_fsmnt, 0, sizeof(fs->fs_fsmnt));
+       (void)copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0);
+       (void)ffs_statvfs(mp, &mp->mnt_stat);
+       vfs_unbusy(mp, false, NULL);
+       setrootfstime((time_t)fs->fs_time);
+       return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+       struct lwp *l = curlwp;
+       struct vnode *devvp = NULL;
+       struct ufs_args *args = data;
+       struct ufsmount *ump = NULL;
+       struct fs *fs;
+       int error = 0, flags, update;
+       mode_t accessmode;
+
+       if (*data_len < sizeof *args)
+               return EINVAL;
+
+       if (mp->mnt_flag & MNT_GETARGS) {
+               ump = VFSTOUFS(mp);
+               if (ump == NULL)
+                       return EIO;
+               args->fspec = NULL;
+               *data_len = sizeof *args;
+               return 0;
+       }
+
+       update = mp->mnt_flag & MNT_UPDATE;
+
+       /* Check arguments */
+       if (args->fspec != NULL) {
+               /*
+                * Look up the name and verify that it's sane.
+                */
+               error = namei_simple_user(args->fspec,
+                                       NSM_FOLLOW_NOEMULROOT, &devvp);
+               if (error != 0)
+                       return (error);
+
+               if (!update) {
+                       /*
+                        * Be sure this is a valid block device
+                        */
+                       if (devvp->v_type != VBLK)
+                               error = ENOTBLK;
+                       else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+                               error = ENXIO;
+               } else {
+                       /*
+                        * Be sure we're still naming the same device
+                        * used for our initial mount
+                        */
+                       ump = VFSTOUFS(mp);
+                       if (devvp != ump->um_devvp) {
+                               if (devvp->v_rdev != ump->um_devvp->v_rdev)
+                                       error = EINVAL;
+                               else {
+                                       vrele(devvp);
+                                       devvp = ump->um_devvp;
+                                       vref(devvp);
+                               }
+                       }
+               }
+       } else {
+               if (!update) {
+                       /* New mounts must have a filename for the device */
+                       return (EINVAL);
+               } else {
+                       /* Use the extant mount */
+                       ump = VFSTOUFS(mp);
+                       devvp = ump->um_devvp;
+                       vref(devvp);
+               }
+       }
+
+       /*
+        * If mounting as non-root, verify that the user has the
+        * necessary permissions on the device.
+        *
+        * Permission to update a mount is checked higher up, so here we
+        * presume that updating the mount is okay (for example, as far
+        * as securelevel goes), which leaves us with the normal check.
+        */
+       if (error == 0) {
+               accessmode = VREAD;
+               if (update ?
+                   (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+                   (mp->mnt_flag & MNT_RDONLY) == 0)
+                       accessmode |= VWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = genfs_can_mount(devvp, accessmode, l->l_cred);
+               VOP_UNLOCK(devvp);
+       }
+
+       if (error) {
+               vrele(devvp);
+               return (error);
+       }
+
+#ifdef WAPBL
+       /* WAPBL can only be enabled on a r/w mount. */
+       if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) {
+               mp->mnt_flag &= ~MNT_LOG;
+       }
+#else /* !WAPBL */
+       mp->mnt_flag &= ~MNT_LOG;
+#endif /* !WAPBL */
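+       /*
+        * Note that on a kernel built without WAPBL the log flag is
+        * dropped silently, i.e. a "-o log" request degrades to a
+        * normal mount.
+        */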
+
+       if (!update) {
+               int xflags;
+
+               if (mp->mnt_flag & MNT_RDONLY)
+                       xflags = FREAD;
+               else
+                       xflags = FREAD | FWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = VOP_OPEN(devvp, xflags, FSCRED);
+               VOP_UNLOCK(devvp);
+               if (error)
+                       goto fail;
+               error = ffs_mountfs(devvp, mp, l);
+               if (error) {
+                       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+                       (void)VOP_CLOSE(devvp, xflags, NOCRED);
+                       VOP_UNLOCK(devvp);
+                       goto fail;
+               }
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_fs;
+       } else {
+               /*
+                * Update the mount.
+                */
+
+               /*
+                * The initial mount got a reference on this
+                * device, so drop the one obtained via
+                * namei(), above.
+                */
+               vrele(devvp);
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_fs;
+               if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+                       /*
+                        * Changing from r/w to r/o
+                        */
+                       flags = WRITECLOSE;
+                       if (mp->mnt_flag & MNT_FORCE)
+                               flags |= FORCECLOSE;
+                       error = ffs_flushfiles(mp, flags, l);
+                       if (error == 0)
+                               error = UFS_WAPBL_BEGIN(mp);
+                       if (error == 0 &&
+                           ffs_cgupdate(ump, MNT_WAIT) == 0 &&
+                           fs->fs_clean & FS_WASCLEAN) {
+                               if (mp->mnt_flag & MNT_SOFTDEP)
+                                       fs->fs_flags &= ~FS_DOSOFTDEP;
+                               fs->fs_clean = FS_ISCLEAN;
+                               (void) ffs_sbupdate(ump, MNT_WAIT);
+                       }
+                       if (error == 0)
+                               UFS_WAPBL_END(mp);
+                       if (error)
+                               return (error);
+               }
+
+#ifdef WAPBL
+               if ((mp->mnt_flag & MNT_LOG) == 0) {
+                       error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE);
+                       if (error)
+                               return error;
+               }
+#endif /* WAPBL */
+
+               if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+                       /*
+                        * Finish change from r/w to r/o
+                        */
+                       fs->fs_ronly = 1;
+                       fs->fs_fmod = 0;
+               }
+
+               if (mp->mnt_flag & MNT_RELOAD) {
+                       error = ffs_reload(mp, l->l_cred, l);
+                       if (error)
+                               return (error);
+               }
+
+               if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+                       /*
+                        * Changing from read-only to read/write
+                        */
+#ifndef QUOTA2
+                       if (fs->fs_flags & FS_DOQUOTA2) {
+                               ump->um_flags |= UFS_QUOTA2;
+                               uprintf("%s: options QUOTA2 not enabled%s\n",
+                                   mp->mnt_stat.f_mntonname,
+                                   (mp->mnt_flag & MNT_FORCE) ? "" :
+                                   ", not mounting");
+                               return EINVAL;
+                       }
+#endif
+                       fs->fs_ronly = 0;
+                       fs->fs_clean <<= 1;
+                       fs->fs_fmod = 1;
+#ifdef WAPBL
+                       if (fs->fs_flags & FS_DOWAPBL) {
+                               printf("%s: replaying log to disk\n",
+                                   fs->fs_fsmnt);
+                               KDASSERT(mp->mnt_wapbl_replay);
+                               error = wapbl_replay_write(mp->mnt_wapbl_replay,
+                                                          devvp);
+                               if (error) {
+                                       return error;
+                               }
+                               wapbl_replay_stop(mp->mnt_wapbl_replay);
+                               fs->fs_clean = FS_WASCLEAN;
+                       }
+#endif /* WAPBL */
+                       if (fs->fs_snapinum[0] != 0)
+                               ffs_snapshot_mount(mp);
+               }
+
+#ifdef WAPBL
+               error = ffs_wapbl_start(mp);
+               if (error)
+                       return error;
+#endif /* WAPBL */
+
+#ifdef QUOTA2
+               if (!fs->fs_ronly) {
+                       error = ffs_quota2_mount(mp);
+                       if (error) {
+                               return error;
+                       }
+               }
+#endif
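+               /*
+                * An update without a new device name (e.g. a plain
+                * "mount -u") has nothing left to do: skip the
+                * set_statvfs_info() bookkeeping below.
+                */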
+               if (args->fspec == NULL)
+                       return 0;
+       }
+
+       error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+           UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+       if (error == 0)
+               (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname,
+                   sizeof(fs->fs_fsmnt));
+       fs->fs_flags &= ~FS_DOSOFTDEP;
+       if (fs->fs_fmod != 0) { /* XXX */
+               int err;
+
+               fs->fs_fmod = 0;
+               if (fs->fs_clean & FS_WASCLEAN)
+                       fs->fs_time = time_second;
+               else {
+                       printf("%s: file system not clean (fs_clean=%#x); "
+                           "please fsck(8)\n", mp->mnt_stat.f_mntfromname,
+                           fs->fs_clean);
+                       printf("%s: lost blocks %" PRId64 " files %d\n",
+                           mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks,
+                           fs->fs_pendinginodes);
+               }
+               err = UFS_WAPBL_BEGIN(mp);
+               if (err == 0) {
+                       (void) ffs_cgupdate(ump, MNT_WAIT);
+                       UFS_WAPBL_END(mp);
+               }
+       }
+       if ((mp->mnt_flag & MNT_SOFTDEP) != 0) {
+               printf("%s: `-o softdep' is no longer supported, "
+                   "consider `-o log'\n", mp->mnt_stat.f_mntfromname);
+               mp->mnt_flag &= ~MNT_SOFTDEP;
+       }
+
+       return (error);
+
+fail:
+       vrele(devvp);
+       return (error);
+}
+
+/*
+ * Reload all incore data for a filesystem (used after running fsck on
+ * the root filesystem and finding things to fix). The filesystem must
+ * be mounted read-only.
+ *
+ * Things to do to update the mount:
+ *     1) invalidate all cached meta-data.
+ *     2) re-read superblock from disk.
+ *     3) re-read summary information from disk.
+ *     4) invalidate all inactive vnodes.
+ *     5) invalidate all cached file data.
+ *     6) re-read inode data for all active vnodes.
+ */
+int
+ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
+{
+       struct vnode *vp, *mvp, *devvp;
+       struct inode *ip;
+       void *space;
+       struct buf *bp;
+       struct fs *fs, *newfs;
+       struct dkwedge_info dkw;
+       int i, bsize, blks, error;
+       int32_t *lp;
+       struct ufsmount *ump;
+       daddr_t sblockloc;
+
+       if ((mp->mnt_flag & MNT_RDONLY) == 0)
+               return (EINVAL);
+
+       ump = VFSTOUFS(mp);
+       /*
+        * Step 1: invalidate all cached meta-data.
+        */
+       devvp = ump->um_devvp;
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = vinvalbuf(devvp, 0, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (error)
+               panic("ffs_reload: dirty1");
+       /*
+        * Step 2: re-read superblock from disk.
+        */
+       fs = ump->um_fs;
+
+       /* XXX we don't handle the possibility that the superblock moved. */
+       error = bread(devvp, fs->fs_sblockloc / DEV_BSIZE, fs->fs_sbsize,
+                     NOCRED, 0, &bp);
+       if (error) {
+               brelse(bp, 0);
+               return (error);
+       }
+       newfs = malloc(fs->fs_sbsize, M_UFSMNT, M_WAITOK);
+       memcpy(newfs, bp->b_data, fs->fs_sbsize);
+#ifdef FFS_EI
+       if (ump->um_flags & UFS_NEEDSWAP) {
+               ffs_sb_swap((struct fs*)bp->b_data, newfs);
+               fs->fs_flags |= FS_SWAPPED;
+       } else
+#endif
+               fs->fs_flags &= ~FS_SWAPPED;
+       if ((newfs->fs_magic != FS_UFS1_MAGIC &&
+            newfs->fs_magic != FS_UFS2_MAGIC) ||
+            newfs->fs_bsize > MAXBSIZE ||
+            newfs->fs_bsize < sizeof(struct fs)) {
+               brelse(bp, 0);
+               free(newfs, M_UFSMNT);
+               return (EIO);           /* XXX needs translation */
+       }
+       /* Store off old fs_sblockloc for ffs_oldfscompat_read. */
+       sblockloc = fs->fs_sblockloc;
+       /*
+        * Copy pointer fields back into superblock before copying in   XXX
+        * new superblock. These should really be in the ufsmount.      XXX
+        * Note that important parameters (eg fs_ncg) are unchanged.
+        */
+       newfs->fs_csp = fs->fs_csp;
+       newfs->fs_maxcluster = fs->fs_maxcluster;
+       newfs->fs_contigdirs = fs->fs_contigdirs;
+       newfs->fs_ronly = fs->fs_ronly;
+       newfs->fs_active = fs->fs_active;
+       memcpy(fs, newfs, (u_int)fs->fs_sbsize);
+       brelse(bp, 0);
+       free(newfs, M_UFSMNT);
+
+       /* Recheck for Apple UFS filesystem */
+       ump->um_flags &= ~UFS_ISAPPLEUFS;
+       /* First check to see if this is tagged as an Apple UFS filesystem
+        * in the disklabel
+        */
+       if (getdiskinfo(devvp, &dkw) == 0 &&
+           strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0)
+               ump->um_flags |= UFS_ISAPPLEUFS;
+#ifdef APPLE_UFS
+       else {
+               /* Manually look for an Apple UFS label and, if a valid
+                * one is found, treat it as an Apple UFS filesystem anyway.
+                *
+                * EINVAL most probably indicates a block size or alignment
+                * problem; in that case this is unlikely to be an Apple
+                * UFS filesystem.
+                */
+               error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE),
+                       APPLEUFS_LABEL_SIZE, cred, 0, &bp);
+               if (error && error != EINVAL) {
+                       brelse(bp, 0);
+                       return (error);
+               }
+               if (error == 0) {
+                       error = ffs_appleufs_validate(fs->fs_fsmnt,
+                               (struct appleufslabel *)bp->b_data, NULL);
+                       if (error == 0)
+                               ump->um_flags |= UFS_ISAPPLEUFS;
+               }
+               brelse(bp, 0);
+               bp = NULL;
+       }
+#else
+       if (ump->um_flags & UFS_ISAPPLEUFS)
+               return (EIO);
+#endif
+
+       if (UFS_MPISAPPLEUFS(ump)) {
+               /* see comment about NeXT below */
+               ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN;
+               ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ;
+               mp->mnt_iflag |= IMNT_DTYPE;
+       } else {
+               ump->um_maxsymlinklen = fs->fs_maxsymlinklen;
+               ump->um_dirblksiz = DIRBLKSIZ;
+               if (ump->um_maxsymlinklen > 0)
+                       mp->mnt_iflag |= IMNT_DTYPE;
+               else
+                       mp->mnt_iflag &= ~IMNT_DTYPE;
+       }
+       ffs_oldfscompat_read(fs, ump, sblockloc);
+
+       mutex_enter(&ump->um_lock);
+       ump->um_maxfilesize = fs->fs_maxfilesize;
+       if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+               uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+                   mp->mnt_stat.f_mntonname, fs->fs_flags,
+                   (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+               if ((mp->mnt_flag & MNT_FORCE) == 0) {
+                       mutex_exit(&ump->um_lock);
+                       return (EINVAL);
+               }
+       }
+       if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
+               fs->fs_pendingblocks = 0;
+               fs->fs_pendinginodes = 0;
+       }
+       mutex_exit(&ump->um_lock);
+
+       ffs_statvfs(mp, &mp->mnt_stat);
+       /*
+        * Step 3: re-read summary information from disk.
+        */
+       blks = howmany(fs->fs_cssize, fs->fs_fsize);
+       space = fs->fs_csp;
+       for (i = 0; i < blks; i += fs->fs_frag) {
+               bsize = fs->fs_bsize;
+               if (i + fs->fs_frag > blks)
+                       bsize = (blks - i) * fs->fs_fsize;
+               error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize,
+                             NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       return (error);
+               }
+#ifdef FFS_EI
+               if (UFS_FSNEEDSWAP(fs))
+                       ffs_csum_swap((struct csum *)bp->b_data,
+                           (struct csum *)space, bsize);
+               else
+#endif
+                       memcpy(space, bp->b_data, (size_t)bsize);
+               space = (char *)space + bsize;
+               brelse(bp, 0);
+       }
+       if (fs->fs_snapinum[0] != 0)
+               ffs_snapshot_mount(mp);
+       /*
+        * We no longer know anything about clusters per cylinder group.
+        */
+       if (fs->fs_contigsumsize > 0) {
+               lp = fs->fs_maxcluster;
+               for (i = 0; i < fs->fs_ncg; i++)
+                       *lp++ = fs->fs_contigsumsize;
+       }
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+       /*
+        * NOTE: we do not use TAILQ_FOREACH here, since vgone() and
+        * vclean() can be called indirectly from within this loop.
+        */
+       mutex_enter(&mntvnode_lock);
+ loop:
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               if (vp->v_mount != mp || vismarker(vp))
+                       continue;
+               /*
+                * Step 4: invalidate all inactive vnodes.
+                */
+               if (vrecycle(vp, &mntvnode_lock, l)) {
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       goto loop;
+               }
+               /*
+                * Step 5: invalidate all cached file data.
+                */
+               mutex_enter(vp->v_interlock);
+               mutex_exit(&mntvnode_lock);
+               if (vget(vp, LK_EXCLUSIVE)) {
+                       (void)vunmark(mvp);
+                       goto loop;
+               }
+               if (vinvalbuf(vp, 0, cred, l, 0, 0))
+                       panic("ffs_reload: dirty2");
+               /*
+                * Step 6: re-read inode data for all active vnodes.
+                */
+               ip = VTOI(vp);
+               error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+                             (int)fs->fs_bsize, NOCRED, 0, &bp);
+               if (error) {
+                       brelse(bp, 0);
+                       vput(vp);
+                       (void)vunmark(mvp);
+                       break;
+               }
+               ffs_load_inode(bp, ip, fs, ip->i_number);
+               brelse(bp, 0);
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       vnfree(mvp);
+       return (error);
+}
+
+/*
+ * Possible superblock locations ordered from most to least likely.
+ */
+static const int sblock_try[] = SBLOCKSEARCH;
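+/*
+ * At the time of this import, SBLOCKSEARCH typically expands to
+ * { SBLOCK_UFS2 (65536), SBLOCK_UFS1 (8192), SBLOCK_FLOPPY (0),
+ *   SBLOCK_PIGGY (262144), -1 }: the UFS2 location is tried first and
+ * the -1 sentinel terminates the search loop in ffs_mountfs() below.
+ */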
+
+/*
+ * Common code for mount and mountroot
+ */
+int
+ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
+{
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct fs *fs;
+       dev_t dev;
+       struct dkwedge_info dkw;
+       void *space;
+       daddr_t sblockloc, fsblockloc;
+       int blks, fstype;
+       int error, i, bsize, ronly, bset = 0;
+#ifdef FFS_EI
+       int needswap = 0;               /* keep gcc happy */
+#endif
+       int32_t *lp;
+       kauth_cred_t cred;
+       u_int32_t sbsize = 8192;        /* keep gcc happy */
+       int32_t fsbsize;
+
+       dev = devvp->v_rdev;
+       cred = l ? l->l_cred : NOCRED;
+
+       /* Flush out any old buffers remaining from a previous use. */
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (error)
+               return (error);
+
+       ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+       bp = NULL;
+       ump = NULL;
+       fs = NULL;
+       sblockloc = 0;
+       fstype = 0;
+
+       error = fstrans_mount(mp);
+       if (error)
+               return error;
+
+       ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
+       memset(ump, 0, sizeof *ump);
+       mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
+       error = ffs_snapshot_init(ump);
+       if (error)
+               goto out;
+       ump->um_ops = &ffs_ufsops;
+
+#ifdef WAPBL
+ sbagain:
+#endif
+       /*
+        * Try reading the superblock in each of its possible locations.
+        */
+       for (i = 0; ; i++) {
+               if (bp != NULL) {
+                       brelse(bp, BC_NOCACHE);
+                       bp = NULL;
+               }
+               if (sblock_try[i] == -1) {
+                       error = EINVAL;
+                       fs = NULL;
+                       goto out;
+               }
+               error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE, cred,
+                             0, &bp);
+               if (error) {
+                       fs = NULL;
+                       goto out;
+               }
+               fs = (struct fs*)bp->b_data;
+               fsblockloc = sblockloc = sblock_try[i];
+               if (fs->fs_magic == FS_UFS1_MAGIC) {
+                       sbsize = fs->fs_sbsize;
+                       fstype = UFS1;
+                       fsbsize = fs->fs_bsize;
+#ifdef FFS_EI
+                       needswap = 0;
+               } else if (fs->fs_magic == bswap32(FS_UFS1_MAGIC)) {
+                       sbsize = bswap32(fs->fs_sbsize);
+                       fstype = UFS1;
+                       fsbsize = bswap32(fs->fs_bsize);
+                       needswap = 1;
+#endif
+               } else if (fs->fs_magic == FS_UFS2_MAGIC) {
+                       sbsize = fs->fs_sbsize;
+                       fstype = UFS2;
+                       fsbsize = fs->fs_bsize;
+#ifdef FFS_EI
+                       needswap = 0;
+               } else if (fs->fs_magic == bswap32(FS_UFS2_MAGIC)) {
+                       sbsize = bswap32(fs->fs_sbsize);
+                       fstype = UFS2;
+                       fsbsize = bswap32(fs->fs_bsize);
+                       needswap = 1;
+#endif
+               } else
+                       continue;
+
+               /* fs->fs_sblockloc isn't defined for old filesystems */
+               if (fstype == UFS1 && !(fs->fs_old_flags & FS_FLAGS_UPDATED)) {
+                       if (sblockloc == SBLOCK_UFS2)
+                               /*
+                                * This is likely to be the first alternate
+                                * in a filesystem with 64k blocks.
+                                * Don't use it.
+                                */
+                               continue;
+                       fsblockloc = sblockloc;
+               } else {
+                       fsblockloc = fs->fs_sblockloc;
+#ifdef FFS_EI
+                       if (needswap)
+                               fsblockloc = bswap64(fsblockloc);
+#endif
+               }
+
+               /* Check we haven't found an alternate superblock */
+               if (fsblockloc != sblockloc)
+                       continue;
+
+               /* Validate size of superblock */
+               if (sbsize > MAXBSIZE || sbsize < sizeof(struct fs))
+                       continue;
+
+               /* Check that we can handle the file system blocksize */
+               if (fsbsize > MAXBSIZE) {
+                       printf("ffs_mountfs: block size (%d) > MAXBSIZE (%d)\n",
+                           fsbsize, MAXBSIZE);
+                       continue;
+               }
+
+               /* OK, this seems to be a good superblock. */
+               break;
+       }
+
+       fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK);
+       memcpy(fs, bp->b_data, sbsize);
+       ump->um_fs = fs;
+
+#ifdef FFS_EI
+       if (needswap) {
+               ffs_sb_swap((struct fs*)bp->b_data, fs);
+               fs->fs_flags |= FS_SWAPPED;
+       } else
+#endif
+               fs->fs_flags &= ~FS_SWAPPED;
+
+#ifdef WAPBL
+       if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) {
+               error = ffs_wapbl_replay_start(mp, fs, devvp);
+               if (error && (mp->mnt_flag & MNT_FORCE) == 0)
+                       goto out;
+               if (!error) {
+                       if (!ronly) {
+                               /* XXX fsmnt may be stale. */
+                               printf("%s: replaying log to disk\n",
+                                   fs->fs_fsmnt);
+                               error = wapbl_replay_write(mp->mnt_wapbl_replay,
+                                   devvp);
+                               if (error)
+                                       goto out;
+                               wapbl_replay_stop(mp->mnt_wapbl_replay);
+                               fs->fs_clean = FS_WASCLEAN;
+                       } else {
+                               /* XXX fsmnt may be stale */
+                               printf("%s: replaying log to memory\n",
+                                   fs->fs_fsmnt);
+                       }
+
+                       /* Force a re-read of the superblock */
+                       brelse(bp, BC_INVAL);
+                       bp = NULL;
+                       free(fs, M_UFSMNT);
+                       fs = NULL;
+                       goto sbagain;
+               }
+       }
+#else /* !WAPBL */
+       if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) {
+               error = EPERM;
+               goto out;
+       }
+#endif /* !WAPBL */
+
+       ffs_oldfscompat_read(fs, ump, sblockloc);
+       ump->um_maxfilesize = fs->fs_maxfilesize;
+
+       if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+               uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+                   mp->mnt_stat.f_mntonname, fs->fs_flags,
+                   (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+               if ((mp->mnt_flag & MNT_FORCE) == 0) {
+                       error = EINVAL;
+                       goto out;
+               }
+       }
+
+       if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
+               fs->fs_pendingblocks = 0;
+               fs->fs_pendinginodes = 0;
+       }
+
+       ump->um_fstype = fstype;
+       if (fs->fs_sbsize < SBLOCKSIZE)
+               brelse(bp, BC_INVAL);
+       else
+               brelse(bp, 0);
+       bp = NULL;
+
+       /* First check to see if this is tagged as an Apple UFS filesystem
+        * in the disklabel
+        */
+       if (getdiskinfo(devvp, &dkw) == 0 &&
+           strcmp(dkw.dkw_ptype, DKW_PTYPE_APPLEUFS) == 0)
+               ump->um_flags |= UFS_ISAPPLEUFS;
+#ifdef APPLE_UFS
+       else {
+               /* Manually look for an Apple UFS label and, if a valid
+                * one is found, treat it as an Apple UFS filesystem anyway.
+                */
+               error = bread(devvp, (daddr_t)(APPLEUFS_LABEL_OFFSET / DEV_BSIZE),
+                       APPLEUFS_LABEL_SIZE, cred, 0, &bp);
+               if (error)
+                       goto out;
+               error = ffs_appleufs_validate(fs->fs_fsmnt,
+                       (struct appleufslabel *)bp->b_data, NULL);
+               if (error == 0) {
+                       ump->um_flags |= UFS_ISAPPLEUFS;
+               }
+               brelse(bp, 0);
+               bp = NULL;
+       }
+#else
+       if (ump->um_flags & UFS_ISAPPLEUFS) {
+               error = EINVAL;
+               goto out;
+       }
+#endif
+
+#if 0
+/*
+ * XXX This code changes the behaviour of mounting dirty filesystems, to
+ * XXX require "mount -f ..." to mount them.  This doesn't match what
+ * XXX mount(8) describes and is disabled for now.
+ */
+       /*
+        * If the file system is not clean, don't allow it to be mounted
+        * unless MNT_FORCE is specified.  (Note: MNT_FORCE is always set
+        * for the root file system.)
+        */
+       if (fs->fs_flags & FS_DOWAPBL) {
+               /*
+                * WAPBL normally expects fs_clean to be FS_WASCLEAN when
+                * the FS_DOWAPBL bit is set, although there is a window
+                * during unmount where it can be FS_ISCLEAN.
+                */
+               if ((mp->mnt_flag & MNT_FORCE) == 0 &&
+                   (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) {
+                       error = EPERM;
+                       goto out;
+               }
+       } else
+               if ((fs->fs_clean & FS_ISCLEAN) == 0 &&
+                   (mp->mnt_flag & MNT_FORCE) == 0) {
+                       error = EPERM;
+                       goto out;
+               }
+#endif
+
+       /*
+        * verify that we can access the last block in the fs
+        * if we're mounting read/write.
+        */
+
+       if (!ronly) {
+               error = bread(devvp, fsbtodb(fs, fs->fs_size - 1), fs->fs_fsize,
+                   cred, 0, &bp);
+               /* Only trust the buffer contents if the read succeeded. */
+               if (error == 0 && bp->b_bcount != fs->fs_fsize)
+                       error = EINVAL;
+               if (error) {
+                       bset = BC_INVAL;
+                       goto out;
+               }
+               brelse(bp, BC_INVAL);
+               bp = NULL;
+       }
+
+       fs->fs_ronly = ronly;
+       /* Don't bump fs_clean if we're replaying journal */
+       if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN)))
+               if (ronly == 0) {
+                       fs->fs_clean <<= 1;
+                       fs->fs_fmod = 1;
+               }
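+       /*
+        * The following builds one contiguous M_UFSMNT allocation
+        * holding, in order: the cylinder-group summary array (fs_csp),
+        * the per-cg cluster summary (fs_maxcluster, only if clustering
+        * is enabled) and the per-cg directory counts (fs_contigdirs).
+        */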
+       bsize = fs->fs_cssize;
+       blks = howmany(bsize, fs->fs_fsize);
+       if (fs->fs_contigsumsize > 0)
+               bsize += fs->fs_ncg * sizeof(int32_t);
+       bsize += fs->fs_ncg * sizeof(*fs->fs_contigdirs);
+       space = malloc((u_long)bsize, M_UFSMNT, M_WAITOK);
+       fs->fs_csp = space;
+       for (i = 0; i < blks; i += fs->fs_frag) {
+               bsize = fs->fs_bsize;
+               if (i + fs->fs_frag > blks)
+                       bsize = (blks - i) * fs->fs_fsize;
+               error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), bsize,
+                             cred, 0, &bp);
+               if (error) {
+                       free(fs->fs_csp, M_UFSMNT);
+                       goto out;
+               }
+#ifdef FFS_EI
+               if (needswap)
+                       ffs_csum_swap((struct csum *)bp->b_data,
+                               (struct csum *)space, bsize);
+               else
+#endif
+                       memcpy(space, bp->b_data, (u_int)bsize);
+
+               space = (char *)space + bsize;
+               brelse(bp, 0);
+               bp = NULL;
+       }
+       if (fs->fs_contigsumsize > 0) {
+               fs->fs_maxcluster = lp = space;
+               for (i = 0; i < fs->fs_ncg; i++)
+                       *lp++ = fs->fs_contigsumsize;
+               space = lp;
+       }
+       bsize = fs->fs_ncg * sizeof(*fs->fs_contigdirs);
+       fs->fs_contigdirs = space;
+       space = (char *)space + bsize;
+       memset(fs->fs_contigdirs, 0, bsize);
+       /* Compatibility for old filesystems - XXX */
+       if (fs->fs_avgfilesize <= 0)
+               fs->fs_avgfilesize = AVFILESIZ;
+       if (fs->fs_avgfpdir <= 0)
+               fs->fs_avgfpdir = AFPDIR;
+       fs->fs_active = NULL;
+       mp->mnt_data = ump;
+       mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+       mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_FFS);
+       mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+       mp->mnt_stat.f_namemax = FFS_MAXNAMLEN;
+       if (UFS_MPISAPPLEUFS(ump)) {
+               /* NeXT used to keep short symlinks in the inode even
+                * when using FS_42INODEFMT.  In that case fs->fs_maxsymlinklen
+                * is probably -1, but we still need to be able to identify
+                * short symlinks.
+                */
+               ump->um_maxsymlinklen = APPLEUFS_MAXSYMLINKLEN;
+               ump->um_dirblksiz = APPLEUFS_DIRBLKSIZ;
+               mp->mnt_iflag |= IMNT_DTYPE;
+       } else {
+               ump->um_maxsymlinklen = fs->fs_maxsymlinklen;
+               ump->um_dirblksiz = DIRBLKSIZ;
+               if (ump->um_maxsymlinklen > 0)
+                       mp->mnt_iflag |= IMNT_DTYPE;
+               else
+                       mp->mnt_iflag &= ~IMNT_DTYPE;
+       }
+       mp->mnt_fs_bshift = fs->fs_bshift;
+       mp->mnt_dev_bshift = DEV_BSHIFT;        /* XXX */
+       mp->mnt_flag |= MNT_LOCAL;
+       mp->mnt_iflag |= IMNT_MPSAFE;
+#ifdef FFS_EI
+       if (needswap)
+               ump->um_flags |= UFS_NEEDSWAP;
+#endif
+       ump->um_mountp = mp;
+       ump->um_dev = dev;
+       ump->um_devvp = devvp;
+       ump->um_nindir = fs->fs_nindir;
+       ump->um_lognindir = ffs(fs->fs_nindir) - 1;
+       ump->um_bptrtodb = fs->fs_fshift - DEV_BSHIFT;
+       ump->um_seqinc = fs->fs_frag;
+       for (i = 0; i < MAXQUOTAS; i++)
+               ump->um_quotas[i] = NULLVP;
+       devvp->v_specmountpoint = mp;
+       if (ronly == 0 && fs->fs_snapinum[0] != 0)
+               ffs_snapshot_mount(mp);
+#ifdef WAPBL
+       if (!ronly) {
+               KDASSERT(fs->fs_ronly == 0);
+               /*
+                * ffs_wapbl_start() needs mp->mnt_stat initialised if it
+                * needs to create a new log file in-filesystem.
+                */
+               ffs_statvfs(mp, &mp->mnt_stat);
+
+               error = ffs_wapbl_start(mp);
+               if (error) {
+                       free(fs->fs_csp, M_UFSMNT);
+                       goto out;
+               }
+       }
+#endif /* WAPBL */
+       if (ronly == 0) {
+#ifdef QUOTA2
+               error = ffs_quota2_mount(mp);
+               if (error) {
+                       free(fs->fs_csp, M_UFSMNT);
+                       goto out;
+               }
+#else
+               if (fs->fs_flags & FS_DOQUOTA2) {
+                       ump->um_flags |= UFS_QUOTA2;
+                       uprintf("%s: options QUOTA2 not enabled%s\n",
+                           mp->mnt_stat.f_mntonname,
+                           (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+                       if ((mp->mnt_flag & MNT_FORCE) == 0) {
+                               error = EINVAL;
+                               free(fs->fs_csp, M_UFSMNT);
+                               goto out;
+                       }
+               }
+#endif
+       }
+#ifdef UFS_EXTATTR
+       /*
+        * Initialize file-backed extended attributes on UFS1 file
+        * systems.
+        */
+       if (ump->um_fstype == UFS1)
+               ufs_extattr_uepm_init(&ump->um_extattr);
+#endif /* UFS_EXTATTR */
+
+       return (0);
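+       /*
+        * Error unwind: release any WAPBL replay state, the fstrans
+        * mount record, the in-core superblock copy, the buffer (with
+        * whatever invalidation flags were requested via bset) and the
+        * half-built ufsmount.
+        */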
+out:
+#ifdef WAPBL
+       if (mp->mnt_wapbl_replay) {
+               wapbl_replay_stop(mp->mnt_wapbl_replay);
+               wapbl_replay_free(mp->mnt_wapbl_replay);
+               mp->mnt_wapbl_replay = 0;
+       }
+#endif
+
+       fstrans_unmount(mp);
+       if (fs)
+               free(fs, M_UFSMNT);
+       devvp->v_specmountpoint = NULL;
+       if (bp)
+               brelse(bp, bset);
+       if (ump) {
+               if (ump->um_oldfscompat)
+                       free(ump->um_oldfscompat, M_UFSMNT);
+               mutex_destroy(&ump->um_lock);
+               free(ump, M_UFSMNT);
+               mp->mnt_data = NULL;
+       }
+       return (error);
+}
+
+/*
+ * Sanity checks for loading old filesystem superblocks.
+ * See ffs_oldfscompat_write below for unwound actions.
+ *
+ * XXX - Parts get retired eventually.
+ * Unfortunately new bits get added.
+ */
+static void
+ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc)
+{
+       off_t maxfilesize;
+       int32_t *extrasave;
+
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               return;
+
+       if (!ump->um_oldfscompat)
+               ump->um_oldfscompat = malloc(512 + 3*sizeof(int32_t),
+                   M_UFSMNT, M_WAITOK);
+
+       memcpy(ump->um_oldfscompat, &fs->fs_old_postbl_start, 512);
+       extrasave = ump->um_oldfscompat;
+       extrasave += 512/sizeof(int32_t);
+       extrasave[0] = fs->fs_old_npsect;
+       extrasave[1] = fs->fs_old_interleave;
+       extrasave[2] = fs->fs_old_trackskew;
+
+       /* These fields will be overwritten by their
+        * original values in ffs_oldfscompat_write(), so it is harmless
+        * to modify them here.
+        */
+       fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
+       fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
+       fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
+       fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
+
+       fs->fs_maxbsize = fs->fs_bsize;
+       fs->fs_time = fs->fs_old_time;
+       fs->fs_size = fs->fs_old_size;
+       fs->fs_dsize = fs->fs_old_dsize;
+       fs->fs_csaddr = fs->fs_old_csaddr;
+       fs->fs_sblockloc = sblockloc;
+
+       fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL);
+
+       if (fs->fs_old_postblformat == FS_42POSTBLFMT) {
+               fs->fs_old_nrpos = 8;
+               fs->fs_old_npsect = fs->fs_old_nsect;
+               fs->fs_old_interleave = 1;
+               fs->fs_old_trackskew = 0;
+       }
+
+       if (fs->fs_old_inodefmt < FS_44INODEFMT) {
+               fs->fs_maxfilesize = (u_quad_t) 1LL << 39;
+               fs->fs_qbmask = ~fs->fs_bmask;
+               fs->fs_qfmask = ~fs->fs_fmask;
+       }
+
+       maxfilesize = (u_int64_t)0x80000000 * fs->fs_bsize - 1;
+       if (fs->fs_maxfilesize > maxfilesize)
+               fs->fs_maxfilesize = maxfilesize;
+
+       /* Compatibility for old filesystems */
+       if (fs->fs_avgfilesize <= 0)
+               fs->fs_avgfilesize = AVFILESIZ;
+       if (fs->fs_avgfpdir <= 0)
+               fs->fs_avgfpdir = AFPDIR;
+
+#if 0
+       if (bigcgs) {
+               fs->fs_save_cgsize = fs->fs_cgsize;
+               fs->fs_cgsize = fs->fs_bsize;
+       }
+#endif
+}
+
+/*
+ * Unwinding superblock updates for old filesystems.
+ * See ffs_oldfscompat_read above for details.
+ *
+ * XXX - Parts get retired eventually.
+ * Unfortunately new bits get added.
+ */
+static void
+ffs_oldfscompat_write(struct fs *fs, struct ufsmount *ump)
+{
+       int32_t *extrasave;
+
+       if ((fs->fs_magic != FS_UFS1_MAGIC) ||
+           (fs->fs_old_flags & FS_FLAGS_UPDATED))
+               return;
+
+       fs->fs_old_time = fs->fs_time;
+       fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
+       fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
+       fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
+       fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
+       fs->fs_old_flags = fs->fs_flags;
+
+#if 0
+       if (bigcgs) {
+               fs->fs_cgsize = fs->fs_save_cgsize;
+       }
+#endif
+
+       memcpy(&fs->fs_old_postbl_start, ump->um_oldfscompat, 512);
+       extrasave = ump->um_oldfscompat;
+       extrasave += 512/sizeof(int32_t);
+       fs->fs_old_npsect = extrasave[0];
+       fs->fs_old_interleave = extrasave[1];
+       fs->fs_old_trackskew = extrasave[2];
+}
+
+/*
+ * unmount vfs operation
+ */
+int
+ffs_unmount(struct mount *mp, int mntflags)
+{
+       struct lwp *l = curlwp;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       int error, flags;
+#ifdef WAPBL
+       extern int doforce;
+#endif
+
+       flags = 0;
+       if (mntflags & MNT_FORCE)
+               flags |= FORCECLOSE;
+       if ((error = ffs_flushfiles(mp, flags, l)) != 0)
+               return (error);
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error == 0)
+               if (fs->fs_ronly == 0 &&
+                   ffs_cgupdate(ump, MNT_WAIT) == 0 &&
+                   fs->fs_clean & FS_WASCLEAN) {
+                       fs->fs_clean = FS_ISCLEAN;
+                       fs->fs_fmod = 0;
+                       (void) ffs_sbupdate(ump, MNT_WAIT);
+               }
+       if (error == 0)
+               UFS_WAPBL_END(mp);
+#ifdef WAPBL
+       KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl));
+       if (mp->mnt_wapbl_replay) {
+               KDASSERT(fs->fs_ronly);
+               wapbl_replay_stop(mp->mnt_wapbl_replay);
+               wapbl_replay_free(mp->mnt_wapbl_replay);
+               mp->mnt_wapbl_replay = 0;
+       }
+       error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE));
+       if (error) {
+               return error;
+       }
+#endif /* WAPBL */
+#ifdef UFS_EXTATTR
+       if (ump->um_fstype == UFS1) {
+               ufs_extattr_stop(mp, l);
+               ufs_extattr_uepm_destroy(&ump->um_extattr);
+       }
+#endif /* UFS_EXTATTR */
+
+       if (ump->um_devvp->v_type != VBAD)
+               ump->um_devvp->v_specmountpoint = NULL;
+       vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+       (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE,
+               NOCRED);
+       vput(ump->um_devvp);
+       free(fs->fs_csp, M_UFSMNT);
+       free(fs, M_UFSMNT);
+       if (ump->um_oldfscompat != NULL)
+               free(ump->um_oldfscompat, M_UFSMNT);
+       mutex_destroy(&ump->um_lock);
+       ffs_snapshot_fini(ump);
+       free(ump, M_UFSMNT);
+       mp->mnt_data = NULL;
+       mp->mnt_flag &= ~MNT_LOCAL;
+       fstrans_unmount(mp);
+       return (0);
+}
+
+/*
+ * Flush out all the files in a filesystem.
+ */
+int
+ffs_flushfiles(struct mount *mp, int flags, struct lwp *l)
+{
+       extern int doforce;
+       struct ufsmount *ump;
+       int error;
+
+       if (!doforce)
+               flags &= ~FORCECLOSE;
+       ump = VFSTOUFS(mp);
+#ifdef QUOTA
+       if ((error = quota1_umount(mp, flags)) != 0)
+               return (error);
+#endif
+#ifdef QUOTA2
+       if ((error = quota2_umount(mp, flags)) != 0)
+               return (error);
+#endif
+       if ((error = vflush(mp, 0, SKIPSYSTEM | flags)) != 0)
+               return (error);
+       ffs_snapshot_unmount(mp);
+       /*
+        * Flush all the files.
+        */
+       error = vflush(mp, NULLVP, flags);
+       if (error)
+               return (error);
+       /*
+        * Flush filesystem metadata.
+        */
+       vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0);
+       VOP_UNLOCK(ump->um_devvp);
+       if (flags & FORCECLOSE) /* XXXDBJ */
+               error = 0;
+
+#ifdef WAPBL
+       if (error)
+               return error;
+       if (mp->mnt_wapbl) {
+               error = wapbl_flush(mp->mnt_wapbl, 1);
+               if (flags & FORCECLOSE)
+                       error = 0;
+       }
+#endif
+
+       return (error);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+ffs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+       struct ufsmount *ump;
+       struct fs *fs;
+
+       ump = VFSTOUFS(mp);
+       fs = ump->um_fs;
+       mutex_enter(&ump->um_lock);
+       sbp->f_bsize = fs->fs_bsize;
+       sbp->f_frsize = fs->fs_fsize;
+       sbp->f_iosize = fs->fs_bsize;
+       sbp->f_blocks = fs->fs_dsize;
+       sbp->f_bfree = blkstofrags(fs, fs->fs_cstotal.cs_nbfree) +
+           fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
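+       /*
+        * f_bresvd is the fs_minfree percentage of fs_dsize that
+        * ordinary users may not allocate; e.g. with the common default
+        * of minfree=5 and fs_dsize of 1000000 fragments, this reserves
+        * 50000 fragments.
+        */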
+       sbp->f_bresvd = ((u_int64_t) fs->fs_dsize * (u_int64_t)
+           fs->fs_minfree) / (u_int64_t) 100;
+       if (sbp->f_bfree > sbp->f_bresvd)
+               sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+       else
+               sbp->f_bavail = 0;
+       sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO;
+       sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
+       sbp->f_favail = sbp->f_ffree;
+       sbp->f_fresvd = 0;
+       mutex_exit(&ump->um_lock);
+       copy_statvfs_info(sbp, mp);
+
+       return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+ffs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+       struct vnode *vp, *mvp, *nvp;
+       struct inode *ip;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs;
+       int error, allerror = 0;
+       bool is_suspending;
+
+       fs = ump->um_fs;
+       if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {            /* XXX */
+               printf("fs = %s\n", fs->fs_fsmnt);
+               panic("update: rofs mod");
+       }
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+
+       fstrans_start(mp, FSTRANS_SHARED);
+       is_suspending = (fstrans_getstate(mp) == FSTRANS_SUSPENDING);
+       /*
+        * Write back each (modified) inode.
+        */
+       mutex_enter(&mntvnode_lock);
+loop:
+       /*
+        * NOTE: we do not use TAILQ_FOREACH here, since vgone() and
+        * vclean() can be called indirectly from within this loop.
+        */
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
+               nvp = TAILQ_NEXT(vp, v_mntvnodes);
+               /*
+                * If the vnode that we are about to sync is no longer
+                * associated with this mount point, start over.
+                */
+               if (vp->v_mount != mp)
+                       goto loop;
+               /*
+                * Don't interfere with concurrent scans of this FS.
+                */
+               if (vismarker(vp))
+                       continue;
+               mutex_enter(vp->v_interlock);
+               ip = VTOI(vp);
+
+               /*
+                * Skip the vnode/inode if inaccessible.
+                */
+               if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
+                   vp->v_type == VNON) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+
+               /*
+                * We deliberately update inode times here.  This will
+                * prevent a massive queue of updates accumulating, only
+                * to be handled by a call to unmount.
+                *
+                * XXX It would be better to have the syncer trickle these
+                * out.  Adjustment needed to allow registering vnodes for
+                * sync when the vnode is clean, but the inode dirty.  Or
+                * have ufs itself trickle out inode updates.
+                *
+                * If doing a lazy sync, we don't care about metadata or
+                * data updates, because they are handled by each vnode's
+                * synclist entry.  In this case we are only interested in
+                * writing back modified inodes.
+                */
+               if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE |
+                   IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) == 0 &&
+                   (waitfor == MNT_LAZY || (LIST_EMPTY(&vp->v_dirtyblkhd) &&
+                   UVM_OBJ_IS_CLEAN(&vp->v_uobj)))) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               if (vp->v_type == VBLK && is_suspending) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               vmark(mvp, vp);
+               mutex_exit(&mntvnode_lock);
+               error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+               if (error) {
+                       mutex_enter(&mntvnode_lock);
+                       nvp = vunmark(mvp);
+                       if (error == ENOENT) {
+                               goto loop;
+                       }
+                       continue;
+               }
+               if (waitfor == MNT_LAZY) {
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (!error) {
+                               error = ffs_update(vp, NULL, NULL,
+                                   UPDATE_CLOSE);
+                               UFS_WAPBL_END(vp->v_mount);
+                       }
+               } else {
+                       error = VOP_FSYNC(vp, cred, FSYNC_NOLOG |
+                           (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0);
+               }
+               if (error)
+                       allerror = error;
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+               nvp = vunmark(mvp);
+       }
+       mutex_exit(&mntvnode_lock);
+       /*
+        * Force stale file system control information to be flushed.
+        */
+       if (waitfor != MNT_LAZY && (ump->um_devvp->v_numoutput > 0 ||
+           !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) {
+               vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+               if ((error = VOP_FSYNC(ump->um_devvp, cred,
+                   (waitfor == MNT_WAIT ? FSYNC_WAIT : 0) | FSYNC_NOLOG,
+                   0, 0)) != 0)
+                       allerror = error;
+               VOP_UNLOCK(ump->um_devvp);
+               if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) {
+                       mutex_enter(&mntvnode_lock);
+                       goto loop;
+               }
+       }
+#if defined(QUOTA) || defined(QUOTA2)
+       qsync(mp);
+#endif
+       /*
+        * Write back modified superblock.
+        */
+       if (fs->fs_fmod != 0) {
+               fs->fs_fmod = 0;
+               fs->fs_time = time_second;
+               error = UFS_WAPBL_BEGIN(mp);
+               if (error)
+                       allerror = error;
+               else {
+                       if ((error = ffs_cgupdate(ump, waitfor)))
+                               allerror = error;
+                       UFS_WAPBL_END(mp);
+               }
+       }
+
+#ifdef WAPBL
+       if (mp->mnt_wapbl) {
+               error = wapbl_flush(mp->mnt_wapbl, 0);
+               if (error)
+                       allerror = error;
+       }
+#endif
+
+       fstrans_done(mp);
+       vnfree(mvp);
+       return (allerror);
+}
+
+/*
+ * Look up a FFS dinode number to find its incore vnode, otherwise read it
+ * in from disk.  If it is in core, wait for the lock bit to clear, then
+ * return the inode locked.  Detection and handling of mount points must be
+ * done by the calling routine.
+ */
+int
+ffs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+       struct fs *fs;
+       struct inode *ip;
+       struct ufsmount *ump;
+       struct buf *bp;
+       struct vnode *vp;
+       dev_t dev;
+       int error;
+
+       ump = VFSTOUFS(mp);
+       dev = ump->um_dev;
+
+ retry:
+       if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+               return (0);
+
+       /* Allocate a new vnode/inode. */
+       error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, NULL, &vp);
+       if (error) {
+               *vpp = NULL;
+               return (error);
+       }
+       ip = pool_cache_get(ffs_inode_cache, PR_WAITOK);
+
+       /*
+        * If someone beat us to it, put back the freshly allocated
+        * vnode/inode pair and retry.
+        */
+       mutex_enter(&ufs_hashlock);
+       if (ufs_ihashget(dev, ino, 0) != NULL) {
+               mutex_exit(&ufs_hashlock);
+               ungetnewvnode(vp);
+               pool_cache_put(ffs_inode_cache, ip);
+               goto retry;
+       }
+
+       vp->v_vflag |= VV_LOCKSWORK;
+
+       /*
+        * XXX MFS ends up here, too, to allocate an inode.  Should we
+        * XXX create another pool for MFS inodes?
+        */
+
+       memset(ip, 0, sizeof(struct inode));
+       vp->v_data = ip;
+       ip->i_vnode = vp;
+       ip->i_ump = ump;
+       ip->i_fs = fs = ump->um_fs;
+       ip->i_dev = dev;
+       ip->i_number = ino;
+#if defined(QUOTA) || defined(QUOTA2)
+       ufsquota_init(ip);
+#endif
+
+       /*
+        * Initialize genfs node, we might proceed to destroy it in
+        * error branches.
+        */
+       genfs_node_init(vp, &ffs_genfsops);
+
+       /*
+        * Put it onto its hash chain and lock it so that other requests for
+        * this inode will block if they arrive while we are sleeping waiting
+        * for old data structures to be purged or for the contents of the
+        * disk portion of this inode to be read.
+        */
+
+       ufs_ihashins(ip);
+       mutex_exit(&ufs_hashlock);
+
+       /* Read in the disk contents for the inode, copy into the inode. */
+       error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
+                     (int)fs->fs_bsize, NOCRED, 0, &bp);
+       if (error) {
+
+               /*
+                * The inode does not contain anything useful, so it would
+                * be misleading to leave it on its hash chain. With mode
+                * still zero, it will be unlinked and returned to the free
+                * list by vput().
+                */
+
+               vput(vp);
+               brelse(bp, 0);
+               *vpp = NULL;
+               return (error);
+       }
+       if (ip->i_ump->um_fstype == UFS1)
+               ip->i_din.ffs1_din = pool_cache_get(ffs_dinode1_cache,
+                   PR_WAITOK);
+       else
+               ip->i_din.ffs2_din = pool_cache_get(ffs_dinode2_cache,
+                   PR_WAITOK);
+       ffs_load_inode(bp, ip, fs, ino);
+       brelse(bp, 0);
+
+       /*
+        * Initialize the vnode from the inode, check for aliases.
+        * Note that the underlying vnode may have changed.
+        */
+
+       ufs_vinit(mp, ffs_specop_p, ffs_fifoop_p, &vp);
+
+       /*
+        * Finish inode initialization now that aliasing has been resolved.
+        */
+
+       ip->i_devvp = ump->um_devvp;
+       vref(ip->i_devvp);
+
+       /*
+        * Ensure that uid and gid are correct. This is a temporary
+        * fix until fsck has been changed to do the update.
+        */
+
+       if (fs->fs_old_inodefmt < FS_44INODEFMT) {              /* XXX */
+               ip->i_uid = ip->i_ffs1_ouid;                    /* XXX */
+               ip->i_gid = ip->i_ffs1_ogid;                    /* XXX */
+       }                                                       /* XXX */
+       uvm_vnp_setsize(vp, ip->i_size);
+       *vpp = vp;
+       return (0);
+}
+
+/*
+ * File handle to vnode
+ *
+ * Have to be really careful about stale file handles:
+ * - check that the inode number is valid
+ * - call ffs_vget() to get the locked inode
+ * - check for an unallocated inode (i_mode == 0)
+ * - check that the given client host has export rights and return
+ *   those rights via exflagsp and credanonp
+ */
+int
+ffs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+       struct ufid ufh;
+       struct fs *fs;
+
+       if (fhp->fid_len != sizeof(struct ufid))
+               return EINVAL;
+
+       memcpy(&ufh, fhp, sizeof(ufh));
+       fs = VFSTOUFS(mp)->um_fs;
+       if (ufh.ufid_ino < ROOTINO ||
+           ufh.ufid_ino >= fs->fs_ncg * fs->fs_ipg)
+               return (ESTALE);
+       return (ufs_fhtovp(mp, &ufh, vpp));
+}
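+/*
+ * The generation number stored in the handle is presumably compared by
+ * ufs_fhtovp() against the inode's i_gen, so that a handle to a
+ * since-recycled inode comes back ESTALE rather than naming the new file.
+ */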
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+ffs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+       struct inode *ip;
+       struct ufid ufh;
+
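+       /*
+        * If the caller's buffer is too small, report the size we
+        * need and fail with E2BIG so the caller can retry.
+        */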
+       if (*fh_size < sizeof(struct ufid)) {
+               *fh_size = sizeof(struct ufid);
+               return E2BIG;
+       }
+       ip = VTOI(vp);
+       *fh_size = sizeof(struct ufid);
+       memset(&ufh, 0, sizeof(ufh));
+       ufh.ufid_len = sizeof(struct ufid);
+       ufh.ufid_ino = ip->i_number;
+       ufh.ufid_gen = ip->i_gen;
+       memcpy(fhp, &ufh, sizeof(ufh));
+       return (0);
+}
+
+void
+ffs_init(void)
+{
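+       /*
+        * Reference counted: only the first call creates the pool
+        * caches; ffs_done() destroys them when the count drops back
+        * to zero.
+        */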
+       if (ffs_initcount++ > 0)
+               return;
+
+       ffs_inode_cache = pool_cache_init(sizeof(struct inode), 0, 0, 0,
+           "ffsino", NULL, IPL_NONE, NULL, NULL, NULL);
+       ffs_dinode1_cache = pool_cache_init(sizeof(struct ufs1_dinode), 0, 0, 0,
+           "ffsdino1", NULL, IPL_NONE, NULL, NULL, NULL);
+       ffs_dinode2_cache = pool_cache_init(sizeof(struct ufs2_dinode), 0, 0, 0,
+           "ffsdino2", NULL, IPL_NONE, NULL, NULL, NULL);
+       ufs_init();
+}
+
+void
+ffs_reinit(void)
+{
+
+       ufs_reinit();
+}
+
+void
+ffs_done(void)
+{
+       if (--ffs_initcount > 0)
+               return;
+
+       ufs_done();
+       pool_cache_destroy(ffs_dinode2_cache);
+       pool_cache_destroy(ffs_dinode1_cache);
+       pool_cache_destroy(ffs_inode_cache);
+}
+
+/*
+ * Write a superblock and associated information back to disk.
+ */
+int
+ffs_sbupdate(struct ufsmount *mp, int waitfor)
+{
+       struct fs *fs = mp->um_fs;
+       struct buf *bp;
+       int error = 0;
+       u_int32_t saveflag;
+
+       error = ffs_getblk(mp->um_devvp,
+           fs->fs_sblockloc / DEV_BSIZE, FFS_NOBLK,
+           fs->fs_sbsize, false, &bp);
+       if (error)
+               return error;
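+       /*
+        * In-core flags (FS_INTERNAL) must never reach the disk; mask
+        * them out for the copy and restore them afterwards.
+        */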
+       saveflag = fs->fs_flags & FS_INTERNAL;
+       fs->fs_flags &= ~FS_INTERNAL;
+
+       memcpy(bp->b_data, fs, fs->fs_sbsize);
+
+       ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
+#ifdef FFS_EI
+       if (mp->um_flags & UFS_NEEDSWAP)
+               ffs_sb_swap((struct fs *)bp->b_data, (struct fs *)bp->b_data);
+#endif
+       fs->fs_flags |= saveflag;
+
+       if (waitfor == MNT_WAIT)
+               error = bwrite(bp);
+       else
+               bawrite(bp);
+       return (error);
+}
+
+int
+ffs_cgupdate(struct ufsmount *mp, int waitfor)
+{
+       struct fs *fs = mp->um_fs;
+       struct buf *bp;
+       int blks;
+       void *space;
+       int i, size, error = 0, allerror = 0;
+
+       allerror = ffs_sbupdate(mp, waitfor);
+       blks = howmany(fs->fs_cssize, fs->fs_fsize);
+       space = fs->fs_csp;
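+       /*
+        * Write the in-core cylinder group summary information back to
+        * its disk area at fs_csaddr, one filesystem block at a time.
+        */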
+       for (i = 0; i < blks; i += fs->fs_frag) {
+               size = fs->fs_bsize;
+               if (i + fs->fs_frag > blks)
+                       size = (blks - i) * fs->fs_fsize;
+               error = ffs_getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
+                   FFS_NOBLK, size, false, &bp);
+               if (error)
+                       break;
+#ifdef FFS_EI
+               if (mp->um_flags & UFS_NEEDSWAP)
+                       ffs_csum_swap((struct csum*)space,
+                           (struct csum*)bp->b_data, size);
+               else
+#endif
+                       memcpy(bp->b_data, space, (u_int)size);
+               space = (char *)space + size;
+               if (waitfor == MNT_WAIT)
+                       error = bwrite(bp);
+               else
+                       bawrite(bp);
+       }
+       if (!allerror && error)
+               allerror = error;
+       return (allerror);
+}
+
+int
+ffs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
+    int attrnamespace, const char *attrname)
+{
+#ifdef UFS_EXTATTR
+       /*
+        * File-backed extended attributes are only supported on UFS1.
+        * UFS2 has native extended attributes.
+        */
+       if (VFSTOUFS(mp)->um_fstype == UFS1)
+               return (ufs_extattrctl(mp, cmd, vp, attrnamespace, attrname));
+#endif
+       return (vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname));
+}
+
+int
+ffs_suspendctl(struct mount *mp, int cmd)
+{
+       int error;
+       struct lwp *l = curlwp;
+
+       switch (cmd) {
+       case SUSPEND_SUSPEND:
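+               /*
+                * Two-phase suspend: stop new transactions
+                * (FSTRANS_SUSPENDING), sync the filesystem, mark it
+                * FSTRANS_SUSPENDED, and flush the WAPBL log if one
+                * is active.
+                */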
+               if ((error = fstrans_setstate(mp, FSTRANS_SUSPENDING)) != 0)
+                       return error;
+               error = ffs_sync(mp, MNT_WAIT, l->l_proc->p_cred);
+               if (error == 0)
+                       error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
+#ifdef WAPBL
+               if (error == 0 && mp->mnt_wapbl)
+                       error = wapbl_flush(mp->mnt_wapbl, 1);
+#endif
+               if (error != 0) {
+                       (void) fstrans_setstate(mp, FSTRANS_NORMAL);
+                       return error;
+               }
+               return 0;
+
+       case SUSPEND_RESUME:
+               return fstrans_setstate(mp, FSTRANS_NORMAL);
+
+       default:
+               return EINVAL;
+       }
+}
+
+/*
+ * Synch a vnode for a mounted file system.
+ */
+static int
+ffs_vfs_fsync(vnode_t *vp, int flags)
+{
+       int error, i, pflags;
+#ifdef WAPBL
+       struct mount *mp;
+#endif
+
+       KASSERT(vp->v_type == VBLK);
+       KASSERT(vp->v_specmountpoint != NULL);
+
+       /*
+        * Flush all dirty data associated with the vnode.
+        */
+       pflags = PGO_ALLPAGES | PGO_CLEANIT;
+       if ((flags & FSYNC_WAIT) != 0)
+               pflags |= PGO_SYNCIO;
+       mutex_enter(vp->v_interlock);
+       error = VOP_PUTPAGES(vp, 0, 0, pflags);
+       if (error)
+               return error;
+
+#ifdef WAPBL
+       mp = vp->v_specmountpoint;
+       if (mp && mp->mnt_wapbl) {
+               /*
+                * Don't bother writing out metadata if the syncer is
+                * making the request.  We will let the sync vnode
+                * write it out in a single burst through a call to
+                * VFS_SYNC().
+                */
+               if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0)
+                       return 0;
+
+               /*
+                * Don't flush the log if the vnode being flushed
+                * contains no dirty buffers that could be in the log.
+                */
+               if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
+                       error = wapbl_flush(mp->mnt_wapbl, 0);
+                       if (error)
+                               return error;
+               }
+
+               if ((flags & FSYNC_WAIT) != 0) {
+                       mutex_enter(vp->v_interlock);
+                       while (vp->v_numoutput)
+                               cv_wait(&vp->v_cv, vp->v_interlock);
+                       mutex_exit(vp->v_interlock);
+               }
+
+               return 0;
+       }
+#endif /* WAPBL */
+
+       error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0);
+       if (error == 0 && (flags & FSYNC_CACHE) != 0) {
+               i = 1;
+               (void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
+                   kauth_cred_get());
+       }
+
+       return error;
+}
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
new file mode 100644 (file)
index 0000000..9acc0bd
--- /dev/null
@@ -0,0 +1,785 @@
+/*     $NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $     */
+
+/*-
+ * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc, and by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.120 2011/06/27 16:34:47 manu Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <uvm/uvm.h>
+
+/* Global vfs data structures for ffs. */
+int (**ffs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, ufs_lookup },               /* lookup */
+       { &vop_create_desc, ufs_create },               /* create */
+       { &vop_whiteout_desc, ufs_whiteout },           /* whiteout */
+       { &vop_mknod_desc, ufs_mknod },                 /* mknod */
+       { &vop_open_desc, ufs_open },                   /* open */
+       { &vop_close_desc, ufs_close },                 /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, ufs_getattr },             /* getattr */
+       { &vop_setattr_desc, ufs_setattr },             /* setattr */
+       { &vop_read_desc, ffs_read },                   /* read */
+       { &vop_write_desc, ffs_write },                 /* write */
+       { &vop_ioctl_desc, ufs_ioctl },                 /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, ufs_poll },                   /* poll */
+       { &vop_kqfilter_desc, genfs_kqfilter },         /* kqfilter */
+       { &vop_revoke_desc, ufs_revoke },               /* revoke */
+       { &vop_mmap_desc, ufs_mmap },                   /* mmap */
+       { &vop_fsync_desc, ffs_fsync },                 /* fsync */
+       { &vop_seek_desc, ufs_seek },                   /* seek */
+       { &vop_remove_desc, ufs_remove },               /* remove */
+       { &vop_link_desc, ufs_link },                   /* link */
+       { &vop_rename_desc, ufs_rename },               /* rename */
+       { &vop_mkdir_desc, ufs_mkdir },                 /* mkdir */
+       { &vop_rmdir_desc, ufs_rmdir },                 /* rmdir */
+       { &vop_symlink_desc, ufs_symlink },             /* symlink */
+       { &vop_readdir_desc, ufs_readdir },             /* readdir */
+       { &vop_readlink_desc, ufs_readlink },           /* readlink */
+       { &vop_abortop_desc, ufs_abortop },             /* abortop */
+       { &vop_inactive_desc, ufs_inactive },           /* inactive */
+       { &vop_reclaim_desc, ffs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, ufs_bmap },                   /* bmap */
+       { &vop_strategy_desc, ufs_strategy },           /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, ufs_pathconf },           /* pathconf */
+       { &vop_advlock_desc, ufs_advlock },             /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_getpages_desc, genfs_getpages },         /* getpages */
+       { &vop_putpages_desc, genfs_putpages },         /* putpages */
+       { &vop_openextattr_desc, ffs_openextattr },     /* openextattr */
+       { &vop_closeextattr_desc, ffs_closeextattr },   /* closeextattr */
+       { &vop_getextattr_desc, ffs_getextattr },       /* getextattr */
+       { &vop_setextattr_desc, ffs_setextattr },       /* setextattr */
+       { &vop_listextattr_desc, ffs_listextattr },     /* listextattr */
+       { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_vnodeop_opv_desc =
+       { &ffs_vnodeop_p, ffs_vnodeop_entries };
+
+int (**ffs_specop_p)(void *);
+const struct vnodeopv_entry_desc ffs_specop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, spec_lookup },              /* lookup */
+       { &vop_create_desc, spec_create },              /* create */
+       { &vop_mknod_desc, spec_mknod },                /* mknod */
+       { &vop_open_desc, spec_open },                  /* open */
+       { &vop_close_desc, ufsspec_close },             /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, ufs_getattr },             /* getattr */
+       { &vop_setattr_desc, ufs_setattr },             /* setattr */
+       { &vop_read_desc, ufsspec_read },               /* read */
+       { &vop_write_desc, ufsspec_write },             /* write */
+       { &vop_ioctl_desc, spec_ioctl },                /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, spec_poll },                  /* poll */
+       { &vop_kqfilter_desc, spec_kqfilter },          /* kqfilter */
+       { &vop_revoke_desc, spec_revoke },              /* revoke */
+       { &vop_mmap_desc, spec_mmap },                  /* mmap */
+       { &vop_fsync_desc, ffs_spec_fsync },            /* fsync */
+       { &vop_seek_desc, spec_seek },                  /* seek */
+       { &vop_remove_desc, spec_remove },              /* remove */
+       { &vop_link_desc, spec_link },                  /* link */
+       { &vop_rename_desc, spec_rename },              /* rename */
+       { &vop_mkdir_desc, spec_mkdir },                /* mkdir */
+       { &vop_rmdir_desc, spec_rmdir },                /* rmdir */
+       { &vop_symlink_desc, spec_symlink },            /* symlink */
+       { &vop_readdir_desc, spec_readdir },            /* readdir */
+       { &vop_readlink_desc, spec_readlink },          /* readlink */
+       { &vop_abortop_desc, spec_abortop },            /* abortop */
+       { &vop_inactive_desc, ufs_inactive },           /* inactive */
+       { &vop_reclaim_desc, ffs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, spec_bmap },                  /* bmap */
+       { &vop_strategy_desc, spec_strategy },          /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, spec_pathconf },          /* pathconf */
+       { &vop_advlock_desc, spec_advlock },            /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_getpages_desc, spec_getpages },          /* getpages */
+       { &vop_putpages_desc, spec_putpages },          /* putpages */
+       { &vop_openextattr_desc, ffs_openextattr },     /* openextattr */
+       { &vop_closeextattr_desc, ffs_closeextattr },   /* closeextattr */
+       { &vop_getextattr_desc, ffs_getextattr },       /* getextattr */
+       { &vop_setextattr_desc, ffs_setextattr },       /* setextattr */
+       { &vop_listextattr_desc, ffs_listextattr },     /* listextattr */
+       { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_specop_opv_desc =
+       { &ffs_specop_p, ffs_specop_entries };
+
+int (**ffs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, vn_fifo_bypass },           /* lookup */
+       { &vop_create_desc, vn_fifo_bypass },           /* create */
+       { &vop_mknod_desc, vn_fifo_bypass },            /* mknod */
+       { &vop_open_desc, vn_fifo_bypass },             /* open */
+       { &vop_close_desc, ufsfifo_close },             /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, ufs_getattr },             /* getattr */
+       { &vop_setattr_desc, ufs_setattr },             /* setattr */
+       { &vop_read_desc, ufsfifo_read },               /* read */
+       { &vop_write_desc, ufsfifo_write },             /* write */
+       { &vop_ioctl_desc, vn_fifo_bypass },            /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, vn_fifo_bypass },             /* poll */
+       { &vop_kqfilter_desc, vn_fifo_bypass },         /* kqfilter */
+       { &vop_revoke_desc, vn_fifo_bypass },           /* revoke */
+       { &vop_mmap_desc, vn_fifo_bypass },             /* mmap */
+       { &vop_fsync_desc, ffs_fsync },                 /* fsync */
+       { &vop_seek_desc, vn_fifo_bypass },             /* seek */
+       { &vop_remove_desc, vn_fifo_bypass },           /* remove */
+       { &vop_link_desc, vn_fifo_bypass },             /* link */
+       { &vop_rename_desc, vn_fifo_bypass },           /* rename */
+       { &vop_mkdir_desc, vn_fifo_bypass },            /* mkdir */
+       { &vop_rmdir_desc, vn_fifo_bypass },            /* rmdir */
+       { &vop_symlink_desc, vn_fifo_bypass },          /* symlink */
+       { &vop_readdir_desc, vn_fifo_bypass },          /* readdir */
+       { &vop_readlink_desc, vn_fifo_bypass },         /* readlink */
+       { &vop_abortop_desc, vn_fifo_bypass },          /* abortop */
+       { &vop_inactive_desc, ufs_inactive },           /* inactive */
+       { &vop_reclaim_desc, ffs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, vn_fifo_bypass },             /* bmap */
+       { &vop_strategy_desc, vn_fifo_bypass },         /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, vn_fifo_bypass },         /* pathconf */
+       { &vop_advlock_desc, vn_fifo_bypass },          /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_putpages_desc, vn_fifo_bypass },         /* putpages */
+       { &vop_openextattr_desc, ffs_openextattr },     /* openextattr */
+       { &vop_closeextattr_desc, ffs_closeextattr },   /* closeextattr */
+       { &vop_getextattr_desc, ffs_getextattr },       /* getextattr */
+       { &vop_setextattr_desc, ffs_setextattr },       /* setextattr */
+       { &vop_listextattr_desc, ffs_listextattr },     /* listextattr */
+       { &vop_deleteextattr_desc, ffs_deleteextattr }, /* deleteextattr */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc ffs_fifoop_opv_desc =
+       { &ffs_fifoop_p, ffs_fifoop_entries };
+
+#include <ufs/ufs/ufs_readwrite.c>
+
+int
+ffs_spec_fsync(void *v)
+{
+       struct vop_fsync_args /* {
+               struct vnode *a_vp;
+               kauth_cred_t a_cred;
+               int a_flags;
+               off_t a_offlo;
+               off_t a_offhi;
+               struct lwp *a_l;
+       } */ *ap = v;
+       int error, flags, uflags;
+       struct vnode *vp;
+       struct mount *mp;
+
+       flags = ap->a_flags;
+       uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+       vp = ap->a_vp;
+       mp = vp->v_mount;
+
+       fstrans_start(mp, FSTRANS_LAZY);
+
+       error = spec_fsync(v);
+       if (error)
+               goto out;
+
+#ifdef WAPBL
+       if (mp && mp->mnt_wapbl) {
+               /*
+                * Don't bother writing out metadata if the syncer is
+                * making the request.  We will let the sync vnode
+                * write it out in a single burst through a call to
+                * VFS_SYNC().
+                */
+               if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
+                       goto out;
+               if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
+                   | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error != 0)
+                               goto out;
+                       error = ffs_update(vp, NULL, NULL, uflags);
+                       UFS_WAPBL_END(mp);
+               }
+               goto out;
+       }
+#endif /* WAPBL */
+
+       error = ffs_update(vp, NULL, NULL, uflags);
+
+out:
+       fstrans_done(mp);
+       return error;
+}
+
+int
+ffs_fsync(void *v)
+{
+       struct vop_fsync_args /* {
+               struct vnode *a_vp;
+               kauth_cred_t a_cred;
+               int a_flags;
+               off_t a_offlo;
+               off_t a_offhi;
+               struct lwp *a_l;
+       } */ *ap = v;
+       struct buf *bp;
+       int num, error, i;
+       struct indir ia[NIADDR + 1];
+       int bsize;
+       daddr_t blk_high;
+       struct vnode *vp;
+       struct mount *mp;
+
+       vp = ap->a_vp;
+       mp = vp->v_mount;
+
+       fstrans_start(mp, FSTRANS_LAZY);
+       if ((ap->a_offlo == 0 && ap->a_offhi == 0) || (vp->v_type != VREG)) {
+               error = ffs_full_fsync(vp, ap->a_flags);
+               goto out;
+       }
+
+       bsize = mp->mnt_stat.f_iosize;
+       blk_high = ap->a_offhi / bsize;
+       if (ap->a_offhi % bsize != 0)
+               blk_high++;
+
+       /*
+        * First, flush all pages in range.
+        */
+
+       mutex_enter(vp->v_interlock);
+       error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
+           round_page(ap->a_offhi), PGO_CLEANIT |
+           ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0));
+       if (error) {
+               goto out;
+       }
+
+#ifdef WAPBL
+       KASSERT(vp->v_type == VREG);
+       if (mp->mnt_wapbl) {
+               /*
+                * Don't bother writing out metadata if the syncer is
+                * making the request.  We will let the sync vnode
+                * write it out in a single burst through a call to
+                * VFS_SYNC().
+                */
+               if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) {
+                       fstrans_done(mp);
+                       return 0;
+               }
+               error = 0;
+               if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag &
+                   (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
+                                IN_MODIFIED | IN_ACCESSED)) {
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error) {
+                               fstrans_done(mp);
+                               return error;
+                       }
+                       error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE |
+                           ((ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0));
+                       UFS_WAPBL_END(mp);
+               }
+               if (error || (ap->a_flags & FSYNC_NOLOG) != 0) {
+                       fstrans_done(mp);
+                       return error;
+               }
+               error = wapbl_flush(mp->mnt_wapbl, 0);
+               fstrans_done(mp);
+               return error;
+       }
+#endif /* WAPBL */
+
+       /*
+        * Then, flush indirect blocks.
+        */
+
+       if (blk_high >= NDADDR) {
+               error = ufs_getlbns(vp, blk_high, ia, &num);
+               if (error)
+                       goto out;
+
+               mutex_enter(&bufcache_lock);
+               for (i = 0; i < num; i++) {
+                       if ((bp = incore(vp, ia[i].in_lbn)) == NULL)
+                               continue;
+                       if ((bp->b_cflags & BC_BUSY) != 0 ||
+                           (bp->b_oflags & BO_DELWRI) == 0)
+                               continue;
+                       bp->b_cflags |= BC_BUSY | BC_VFLUSH;
+                       mutex_exit(&bufcache_lock);
+                       bawrite(bp);
+                       mutex_enter(&bufcache_lock);
+               }
+               mutex_exit(&bufcache_lock);
+       }
+
+       if (ap->a_flags & FSYNC_WAIT) {
+               mutex_enter(vp->v_interlock);
+               while (vp->v_numoutput > 0)
+                       cv_wait(&vp->v_cv, vp->v_interlock);
+               mutex_exit(vp->v_interlock);
+       }
+
+       error = ffs_update(vp, NULL, NULL, UPDATE_CLOSE |
+           (((ap->a_flags & (FSYNC_WAIT | FSYNC_DATAONLY)) == FSYNC_WAIT)
+           ? UPDATE_WAIT : 0));
+
+       if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+               int l = 0;
+               VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+                       curlwp->l_cred);
+       }
+
+out:
+       fstrans_done(mp);
+       return error;
+}
+
+/*
+ * Synch an open file.  Called for VOP_FSYNC().
+ */
+/* ARGSUSED */
+int
+ffs_full_fsync(struct vnode *vp, int flags)
+{
+       int error, i, uflags;
+       struct mount *mp;
+
+       KASSERT(vp->v_tag == VT_UFS);
+       KASSERT(VTOI(vp) != NULL);
+       KASSERT(vp->v_type != VCHR && vp->v_type != VBLK);
+
+       error = 0;
+       uflags = UPDATE_CLOSE | ((flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+
+       mp = vp->v_mount;
+
+       /*
+        * Flush all dirty data associated with the vnode.
+        */
+       if (vp->v_type == VREG) {
+               int pflags = PGO_ALLPAGES | PGO_CLEANIT;
+
+               if ((flags & FSYNC_WAIT))
+                       pflags |= PGO_SYNCIO;
+               if (fstrans_getstate(mp) == FSTRANS_SUSPENDING)
+                       pflags |= PGO_FREE;
+               mutex_enter(vp->v_interlock);
+               error = VOP_PUTPAGES(vp, 0, 0, pflags);
+               if (error)
+                       return error;
+       }
+
+#ifdef WAPBL
+       if (mp && mp->mnt_wapbl) {
+               /*
+                * Don't bother writing out metadata if the syncer is
+                * making the request.  We will let the sync vnode
+                * write it out in a single burst through a call to
+                * VFS_SYNC().
+                */
+               if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
+                       return 0;
+
+               if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
+                   | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
+                       error = UFS_WAPBL_BEGIN(mp);
+                       if (error)
+                               return error;
+                       error = ffs_update(vp, NULL, NULL, uflags);
+                       UFS_WAPBL_END(mp);
+               }
+               if (error || (flags & FSYNC_NOLOG) != 0)
+                       return error;
+
+               /*
+                * Don't flush the log if the vnode being flushed
+                * contains no dirty buffers that could be in the log.
+                */
+               if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
+                       error = wapbl_flush(mp->mnt_wapbl, 0);
+                       if (error)
+                               return error;
+               }
+
+               if ((flags & FSYNC_WAIT) != 0) {
+                       mutex_enter(vp->v_interlock);
+                       while (vp->v_numoutput != 0)
+                               cv_wait(&vp->v_cv, vp->v_interlock);
+                       mutex_exit(vp->v_interlock);
+               }
+
+               return error;
+       }
+#endif /* WAPBL */
+
+       error = vflushbuf(vp, (flags & FSYNC_WAIT) != 0);
+       if (error == 0)
+               error = ffs_update(vp, NULL, NULL, uflags);
+       if (error == 0 && (flags & FSYNC_CACHE) != 0) {
+               i = 1;
+               (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
+                   kauth_cred_get());
+       }
+
+       return error;
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ffs_reclaim(void *v)
+{
+       struct vop_reclaim_args /* {
+               struct vnode *a_vp;
+               struct lwp *a_l;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = ip->i_ump;
+       void *data;
+       int error;
+
+       fstrans_start(mp, FSTRANS_LAZY);
+       /*
+        * The inode must be freed and updated before being removed
+        * from its hash chain.  Other threads trying to gain a hold
+        * on the inode will be stalled because it is locked (VI_XLOCK).
+        */
+       error = UFS_WAPBL_BEGIN(mp);
+       if (error) {
+               fstrans_done(mp);
+               return error;
+       }
+       if (ip->i_nlink <= 0 && ip->i_omode != 0 &&
+           (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+               ffs_vfree(vp, ip->i_number, ip->i_omode);
+       UFS_WAPBL_END(mp);
+       if ((error = ufs_reclaim(vp)) != 0) {
+               fstrans_done(mp);
+               return (error);
+       }
+       if (ip->i_din.ffs1_din != NULL) {
+               if (ump->um_fstype == UFS1)
+                       pool_cache_put(ffs_dinode1_cache, ip->i_din.ffs1_din);
+               else
+                       pool_cache_put(ffs_dinode2_cache, ip->i_din.ffs2_din);
+       }
+       /*
+        * Clear v_data under the vnode interlock so that a concurrent
+        * ffs_sync() cannot pick up a half-reclaimed vnode.
+        */
+       genfs_node_destroy(vp);
+       mutex_enter(vp->v_interlock);
+       data = vp->v_data;
+       vp->v_data = NULL;
+       mutex_exit(vp->v_interlock);
+
+       /*
+        * XXX MFS ends up here, too, to free an inode.  Should we create
+        * XXX a separate pool for MFS inodes?
+        */
+       pool_cache_put(ffs_inode_cache, data);
+       fstrans_done(mp);
+       return (0);
+}
+
+/*
+ * Return the last logical file offset that should be written for this file
+ * if we're doing a write that ends at "size".
+ */
+
+void
+ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
+{
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+       daddr_t olbn, nlbn;
+
+       olbn = lblkno(fs, ip->i_size);
+       nlbn = lblkno(fs, size);
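+       /*
+        * Only the last block of a file may be a fragment, and only
+        * while it still falls in the direct block range; otherwise
+        * the write must extend to a full block boundary.
+        */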
+       if (nlbn < NDADDR && olbn <= nlbn) {
+               *eobp = fragroundup(fs, size);
+       } else {
+               *eobp = blkroundup(fs, size);
+       }
+}
+
+int
+ffs_openextattr(void *v)
+{
+       struct vop_openextattr_args /* {
+               struct vnode *a_vp;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct inode *ip = VTOI(ap->a_vp);
+       struct fs *fs = ip->i_fs;
+
+       /* Not supported for UFS1 file systems. */
+       if (fs->fs_magic == FS_UFS1_MAGIC)
+               return (EOPNOTSUPP);
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
+
+int
+ffs_closeextattr(void *v)
+{
+       struct vop_closeextattr_args /* {
+               struct vnode *a_vp;
+               int a_commit;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct inode *ip = VTOI(ap->a_vp);
+       struct fs *fs = ip->i_fs;
+
+       /* Not supported for UFS1 file systems. */
+       if (fs->fs_magic == FS_UFS1_MAGIC)
+               return (EOPNOTSUPP);
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
+
+int
+ffs_getextattr(void *v)
+{
+       struct vop_getextattr_args /* {
+               struct vnode *a_vp;
+               int a_attrnamespace;
+               const char *a_name;
+               struct uio *a_uio;
+               size_t *a_size;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+               int error;
+
+               fstrans_start(vp->v_mount, FSTRANS_SHARED);
+               error = ufs_getextattr(ap);
+               fstrans_done(vp->v_mount);
+               return error;
+#else
+               return (EOPNOTSUPP);
+#endif
+       }
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
+
+int
+ffs_setextattr(void *v)
+{
+       struct vop_setextattr_args /* {
+               struct vnode *a_vp;
+               int a_attrnamespace;
+               const char *a_name;
+               struct uio *a_uio;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+               int error;
+
+               fstrans_start(vp->v_mount, FSTRANS_SHARED);
+               error = ufs_setextattr(ap);
+               fstrans_done(vp->v_mount);
+               return error;
+#else
+               return (EOPNOTSUPP);
+#endif
+       }
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
+
+int
+ffs_listextattr(void *v)
+{
+       struct vop_listextattr_args /* {
+               struct vnode *a_vp;
+               int a_attrnamespace;
+               struct uio *a_uio;
+               size_t *a_size;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct inode *ip = VTOI(ap->a_vp);
+       struct fs *fs = ip->i_fs;
+
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+               struct vnode *vp = ap->a_vp;
+               int error;
+
+               fstrans_start(vp->v_mount, FSTRANS_SHARED);
+               error = ufs_listextattr(ap);
+               fstrans_done(vp->v_mount);
+               return error;
+#else
+               return (EOPNOTSUPP);
+#endif
+       }
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
+
+int
+ffs_deleteextattr(void *v)
+{
+       struct vop_deleteextattr_args /* {
+               struct vnode *a_vp;
+               int a_attrnamespace;
+               kauth_cred_t a_cred;
+               struct proc *a_p;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct fs *fs = ip->i_fs;
+
+       if (fs->fs_magic == FS_UFS1_MAGIC) {
+#ifdef UFS_EXTATTR
+               int error;
+
+               fstrans_start(vp->v_mount, FSTRANS_SHARED);
+               error = ufs_deleteextattr(ap);
+               fstrans_done(vp->v_mount);
+               return error;
+#else
+               return (EOPNOTSUPP);
+#endif
+       }
+
+       /* XXX Not implemented for UFS2 file systems. */
+       return (EOPNOTSUPP);
+}
diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c
new file mode 100644 (file)
index 0000000..aa6b2da
--- /dev/null
@@ -0,0 +1,883 @@
+/*     $NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $   */
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.17 2010/12/24 13:38:57 mlelstv Exp $");
+
+#define WAPBL_INTERNAL
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#undef WAPBL_DEBUG
+#ifdef WAPBL_DEBUG
+int ffs_wapbl_debug = 1;
+#define DPRINTF(fmt, args...)                                          \
+do {                                                                   \
+       if (ffs_wapbl_debug)                                            \
+               printf("%s:%d " fmt, __func__, __LINE__, ##args);      \
+} while (/* CONSTCOND */0)
+#else
+#define        DPRINTF(fmt, args...)                                           \
+do {                                                                   \
+       /* nothing */                                                   \
+} while (/* CONSTCOND */0)
+#endif
+
+static int ffs_superblock_layout(struct fs *);
+static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
+    daddr_t *, size_t *, size_t *, uint64_t *);
+static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
+    daddr_t *, size_t *, uint64_t *);
+static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
+    daddr_t *, daddr_t *, size_t *);
+static int wapbl_remove_log(struct mount *);
+static int wapbl_allocate_log_file(struct mount *, struct vnode *,
+    daddr_t *, size_t *, uint64_t *);
+
+/*
+ * Return the super block layout format - UFS1 or UFS2.
+ * WAPBL only works with UFS2 layout (which is still available
+ * with FFSv1).
+ *
+ * XXX Should this be in ufs/ffs/fs.h?  Same style of check is
+ * also used in ffs_alloc.c in a few places.
+ */
+static int
+ffs_superblock_layout(struct fs *fs)
+{
+       if ((fs->fs_magic == FS_UFS1_MAGIC) &&
+           ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))
+               return 1;
+       else
+               return 2;
+}
+
+/*
+ * This function is invoked after a log is replayed to disk to
+ * perform the logical cleanup actions described by the log.
+ */
+void
+ffs_wapbl_replay_finish(struct mount *mp)
+{
+       struct wapbl_replay *wr = mp->mnt_wapbl_replay;
+       int i;
+       int error;
+
+       if (!wr)
+               return;
+
+       KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
+
+       for (i = 0; i < wr->wr_inodescnt; i++) {
+               struct vnode *vp;
+               struct inode *ip;
+               error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp);
+               if (error) {
+                       printf("ffs_wapbl_replay_finish: "
+                           "unable to cleanup inode %" PRIu32 "\n",
+                           wr->wr_inodes[i].wr_inumber);
+                       continue;
+               }
+               ip = VTOI(vp);
+               KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
+#ifdef WAPBL_DEBUG
+               printf("ffs_wapbl_replay_finish: "
+                   "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n",
+                   ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
+#endif
+               KASSERT(ip->i_nlink == 0);
+
+               /*
+                * The journal may have left partially allocated inodes in mode
+                * zero.  This may occur if a crash occurs between the node
+                * allocation in ffs_nodealloccg and when the node is properly
+                * initialized in ufs_makeinode.  If so, just deallocate them.
+                */
+               if (ip->i_mode == 0) {
+                       UFS_WAPBL_BEGIN(mp);
+                       ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode);
+                       UFS_WAPBL_END(mp);
+               }
+               vput(vp);
+       }
+       wapbl_replay_stop(wr);
+       wapbl_replay_free(wr);
+       mp->mnt_wapbl_replay = NULL;
+}
+
+/* Callback for wapbl */
+void
+ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+    int *dealloclens, int dealloccnt)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       int i, error;
+
+#ifdef WAPBL_DEBUG_INODES
+       ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata");
+#endif
+
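+       /*
+        * Block deallocations deferred by the log are now safe to
+        * apply: free each run, then push the updated cylinder group
+        * and summary data to disk.
+        */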
+       for (i = 0; i < dealloccnt; i++) {
+               /*
+                * blkfree errors are unreported; it may silently fail
+                * if it cannot read the cylinder group block.
+                */
+               ffs_blkfree(fs, ump->um_devvp,
+                   dbtofsb(fs, deallocblks[i]), dealloclens[i], -1);
+       }
+
+       fs->fs_fmod = 0;
+       fs->fs_time = time_second;
+       error = ffs_cgupdate(ump, 0);
+       KASSERT(error == 0);
+}
+
+void
+ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+    int *dealloclens, int dealloccnt)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       int i;
+
+       for (i = 0; i < dealloccnt; i++) {
+               /*
+                * Since the above blkfree may have failed, this blkalloc might
+                * fail as well, so don't check its error.  Note that if the
+                * blkfree succeeded above, then this shouldn't fail because
+                * the buffer will be locked in the current transaction.
+                */
+               ffs_blkalloc_ump(ump, dbtofsb(fs, deallocblks[i]),
+                   dealloclens[i]);
+       }
+}
+
+static int
+wapbl_remove_log(struct mount *mp)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       struct vnode *vp;
+       struct inode *ip;
+       ino_t log_ino;
+       int error;
+
+       /* If super block layout is too old to support WAPBL, return */
+       if (ffs_superblock_layout(fs) < 2)
+               return 0;
+
+       /* If all the log locators are 0, just clean up */
+       if (fs->fs_journallocs[0] == 0 &&
+           fs->fs_journallocs[1] == 0 &&
+           fs->fs_journallocs[2] == 0 &&
+           fs->fs_journallocs[3] == 0) {
+               DPRINTF("empty locators, just clear\n");
+               goto done;
+       }
+
+       switch (fs->fs_journal_location) {
+       case UFS_WAPBL_JOURNALLOC_NONE:
+               /* nothing! */
+               DPRINTF("no log\n");
+               break;
+
+       case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+               log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO];
+               DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino);
+
+               /* if no existing log inode, just clear all fields and bail */
+               if (log_ino == 0)
+                       goto done;
+               error = VFS_VGET(mp, log_ino, &vp);
+               if (error != 0) {
+                       printf("ffs_wapbl: vget failed %d\n",
+                           error);
+                       /* clear out log info on error */
+                       goto done;
+               }
+               ip = VTOI(vp);
+               KASSERT(log_ino == ip->i_number);
+               if ((ip->i_flags & SF_LOG) == 0) {
+                       printf("ffs_wapbl: try to clear non-log inode "
+                           "%" PRId64 "\n", log_ino);
+                       vput(vp);
+                       /* clear out log info on error */
+                       goto done;
+               }
+
+               /*
+                * remove the log inode by setting its link count back
+                * to zero and bail.
+                */
+               ip->i_nlink = 0;
+               DIP_ASSIGN(ip, nlink, 0);
+               vput(vp);
+
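+               /* FALLTHROUGH */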
+       case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+               DPRINTF("end-of-partition log\n");
+               /* no extra work required */
+               break;
+
+       default:
+               printf("ffs_wapbl: unknown journal type %d\n",
+                   fs->fs_journal_location);
+               break;
+       }
+
+done:
+       /* Clear out all previous knowledge of journal */
+       fs->fs_journal_version = 0;
+       fs->fs_journal_location = 0;
+       fs->fs_journal_flags = 0;
+       fs->fs_journallocs[0] = 0;
+       fs->fs_journallocs[1] = 0;
+       fs->fs_journallocs[2] = 0;
+       fs->fs_journallocs[3] = 0;
+       (void) ffs_sbupdate(ump, MNT_WAIT);
+
+       return 0;
+}
+
+int
+ffs_wapbl_start(struct mount *mp)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       struct vnode *devvp = ump->um_devvp;
+       daddr_t off;
+       size_t count;
+       size_t blksize;
+       uint64_t extradata;
+       int error;
+
+       if (mp->mnt_wapbl == NULL) {
+               if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) {
+                       /* Clear out any existing journal file */
+                       error = wapbl_remove_log(mp);
+                       if (error != 0)
+                               return error;
+               }
+
+               if (mp->mnt_flag & MNT_LOG) {
+                       KDASSERT(fs->fs_ronly == 0);
+
+                       /* WAPBL needs UFS2 format super block */
+                       if (ffs_superblock_layout(fs) < 2) {
+                               printf("%s fs superblock in old format, "
+                                  "not journaling\n",
+                                  VFSTOUFS(mp)->um_fs->fs_fsmnt);
+                               mp->mnt_flag &= ~MNT_LOG;
+                               return EINVAL;
+                       }
+
+                       error = wapbl_log_position(mp, fs, devvp, &off,
+                           &count, &blksize, &extradata);
+                       if (error)
+                               return error;
+
+                       error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off,
+                           count, blksize, mp->mnt_wapbl_replay,
+                           ffs_wapbl_sync_metadata,
+                           ffs_wapbl_abort_sync_metadata);
+                       if (error)
+                               return error;
+
+                       mp->mnt_wapbl_op = &wapbl_ops;
+
+#ifdef WAPBL_DEBUG
+                       printf("%s: enabling logging\n", fs->fs_fsmnt);
+#endif
+
+                       if ((fs->fs_flags & FS_DOWAPBL) == 0) {
+                               UFS_WAPBL_BEGIN(mp);
+                               fs->fs_flags |= FS_DOWAPBL;
+                               error = ffs_sbupdate(ump, MNT_WAIT);
+                               if (error) {
+                                       UFS_WAPBL_END(mp);
+                                       ffs_wapbl_stop(mp, MNT_FORCE);
+                                       return error;
+                               }
+                               UFS_WAPBL_END(mp);
+                               error = wapbl_flush(mp->mnt_wapbl, 1);
+                               if (error) {
+                                       ffs_wapbl_stop(mp, MNT_FORCE);
+                                       return error;
+                               }
+                       }
+               } else if (fs->fs_flags & FS_DOWAPBL) {
+                       fs->fs_fmod = 1;
+                       fs->fs_flags &= ~FS_DOWAPBL;
+               }
+       }
+
+       /*
+        * It is recommended that you finish replay with logging enabled.
+        * However, even if logging is not enabled, the remaining log
+        * replay should be safely recoverable with an fsck, so perform
+        * it anyway.
+        */
+       if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) {
+               int saveflag = mp->mnt_flag & MNT_RDONLY;
+               /*
+                * Make sure MNT_RDONLY is not set so that the inode
+                * cleanup in ufs_inactive will actually do its work.
+                */
+               mp->mnt_flag &= ~MNT_RDONLY;
+               ffs_wapbl_replay_finish(mp);
+               mp->mnt_flag |= saveflag;
+               KASSERT(fs->fs_ronly == 0);
+       }
+
+       return 0;
+}
+
+int
+ffs_wapbl_stop(struct mount *mp, int force)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       int error;
+
+       if (mp->mnt_wapbl) {
+               KDASSERT(fs->fs_ronly == 0);
+
+               /*
+                * Make sure turning off FS_DOWAPBL is the only change
+                * in the final flush, since otherwise a transaction
+                * may reorder writes.
+                */
+               error = wapbl_flush(mp->mnt_wapbl, 1);
+               if (error && !force)
+                       return error;
+               if (error && force)
+                       goto forceout;
+               error = UFS_WAPBL_BEGIN(mp);
+               if (error && !force)
+                       return error;
+               if (error && force)
+                       goto forceout;
+               KASSERT(fs->fs_flags & FS_DOWAPBL);
+
+               fs->fs_flags &= ~FS_DOWAPBL;
+               error = ffs_sbupdate(ump, MNT_WAIT);
+               KASSERT(error == 0);    /* XXX a bit drastic! */
+               UFS_WAPBL_END(mp);
+       forceout:
+               error = wapbl_stop(mp->mnt_wapbl, force);
+               if (error) {
+                       KASSERT(!force);
+                       fs->fs_flags |= FS_DOWAPBL;
+                       return error;
+               }
+               fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */
+               mp->mnt_wapbl = NULL;
+
+#ifdef WAPBL_DEBUG
+               printf("%s: disabled logging\n", fs->fs_fsmnt);
+#endif
+       }
+
+       return 0;
+}
+
+int
+ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp)
+{
+       int error;
+       daddr_t off;
+       size_t count;
+       size_t blksize;
+       uint64_t extradata;
+
+       /*
+        * WAPBL needs a UFS2 format super block; if we got here with a
+        * UFS1 format super block, something is amiss...
+        */
+       if (ffs_superblock_layout(fs) < 2)
+               return EINVAL;
+
+       error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize,
+           &extradata);
+
+       if (error)
+               return error;
+
+       error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off,
+               count, blksize);
+       if (error)
+               return error;
+
+       mp->mnt_wapbl_op = &wapbl_ops;
+
+       return 0;
+}
+
+/*
+ * If the superblock doesn't already have a recorded journal location
+ * then we allocate the journal in one of two positions:
+ *
+ *  - At the end of the partition after the filesystem if there's
+ *    enough space.  "Enough space" is defined as >= 1MB of journal
+ *    per 1GB of filesystem or 64MB, whichever is smaller.
+ *
+ *  - Inside the filesystem.  We try to allocate a contiguous journal
+ *    based on the total filesystem size - the target is 1MB of journal
+ *    per 1GB of filesystem, up to a maximum journal size of 64MB.  As
+ *    a worst case allowing for fragmentation, we'll allocate a journal
+ *    1/4 of the desired size but never smaller than 1MB.
+ *
+ *    XXX In the future if we allow for non-contiguous journal files we
+ *    can tighten the above restrictions.
+ *
+ * XXX
+ * This seems like a lot of duplication both here and in some of
+ * the userland tools (fsck_ffs, dumpfs, tunefs) with similar
+ * "switch (fs_journal_location)" constructs.  Can we centralise
+ * this sort of code somehow/somewhere?
+ */
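+/*
+ * Illustrative sizing under the policy above: a 16 GB filesystem
+ * targets a 16 MB journal, while anything at or above 64 GB is
+ * capped at the 64 MB maximum.
+ */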
+static int
+wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp,
+    daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       daddr_t logstart, logend, desired_logsize;
+       uint64_t numsecs;
+       unsigned secsize;
+       int error, location;
+
+       if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
+               switch (fs->fs_journal_location) {
+               case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+                       DPRINTF("found existing end-of-partition log\n");
+                       *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR];
+                       *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+                       *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+                       DPRINTF(" start = %" PRId64 ", size = %zu, "
+                           "blksize = %zu\n", *startp, *countp, *blksizep);
+                       return 0;
+
+               case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+                       DPRINTF("found existing in-filesystem log\n");
+                       *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR];
+                       *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+                       *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+                       DPRINTF(" start = %" PRId64 ", size = %zu, "
+                           "blksize = %zu\n", *startp, *countp, *blksizep);
+                       return 0;
+
+               default:
+                       printf("ffs_wapbl: unknown journal type %d\n",
+                           fs->fs_journal_location);
+                       return EINVAL;
+               }
+       }
+
+       desired_logsize =
+           lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
+       DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024);
+       desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+       desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+       DPRINTF("adjusted desired log size = %" PRId64 " kB\n",
+           desired_logsize / 1024);
+
+       /* Is there space after the filesystem on the partition for the log? */
+       logstart = fsbtodb(fs, fs->fs_size);
+       error = getdisksize(devvp, &numsecs, &secsize);
+       if (error)
+               return error;
+       KDASSERT(secsize != 0);
+       logend = btodb(numsecs * secsize);
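+       /*
+        * Note: logstart and logend are in DEV_BSIZE disk blocks here;
+        * fsbtodb() converts filesystem blocks to disk blocks and
+        * btodb() converts bytes to disk blocks, so dbtob(logend -
+        * logstart) below is the space after the filesystem in bytes.
+        */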
+
+       if (dbtob(logend - logstart) >= desired_logsize) {
+               DPRINTF("enough space, use end-of-partition log\n");
+
+               location = UFS_WAPBL_JOURNALLOC_END_PARTITION;
+               *blksizep = secsize;
+
+               *startp = logstart;
+               *countp = (logend - logstart);
+               *extradatap = 0;
+
+               /* convert to physical block numbers */
+               *startp = dbtob(*startp) / secsize;
+               *countp = dbtob(*countp) / secsize;
+
+               fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp;
+               fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp;
+               fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep;
+               fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap;
+       } else {
+               DPRINTF("end-of-partition has only %" PRId64 " free\n",
+                   logend - logstart);
+
+               location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM;
+               *blksizep = secsize;
+
+               error = wapbl_create_infs_log(mp, fs, devvp,
+                                 startp, countp, extradatap);
+               ffs_sync(mp, MNT_WAIT, FSCRED);
+
+               /* convert to physical block numbers */
+               *startp = dbtob(*startp) / secsize;
+               *countp = dbtob(*countp) / secsize;
+
+               fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = *startp;
+               fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = *countp;
+               fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = *blksizep;
+               fs->fs_journallocs[UFS_WAPBL_INFS_INO] = *extradatap;
+       }
+
+       if (error == 0) {
+               /* update superblock with log location */
+               fs->fs_journal_version = UFS_WAPBL_VERSION;
+               fs->fs_journal_location = location;
+               fs->fs_journal_flags = 0;
+
+               error = ffs_sbupdate(ump, MNT_WAIT);
+       }
+
+       return error;
+}
+
+/*
+ * Try to create a journal log inside the filesystem.
+ */
+static int
+wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp,
+    daddr_t *startp, size_t *countp, uint64_t *extradatap)
+{
+       struct vnode *vp, *rvp;
+       struct inode *ip;
+       int error;
+
+       if ((error = VFS_ROOT(mp, &rvp)) != 0)
+               return error;
+
+       error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp);
+       if (mp->mnt_flag & MNT_UPDATE) {
+               vput(rvp);
+       } else {
+               VOP_UNLOCK(rvp);
+               vgone(rvp);
+       }
+       if (error != 0)
+               return error;
+
+       vp->v_type = VREG;
+       ip = VTOI(vp);
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       ip->i_mode = 0 | IFREG;
+       DIP_ASSIGN(ip, mode, ip->i_mode);
+       ip->i_flags = SF_LOG;
+       DIP_ASSIGN(ip, flags, ip->i_flags);
+       ip->i_nlink = 1;
+       DIP_ASSIGN(ip, nlink, 1);
+       ffs_update(vp, NULL, NULL, UPDATE_WAIT);
+
+       if ((error = wapbl_allocate_log_file(mp, vp,
+                        startp, countp, extradatap)) != 0) {
+               /*
+                * If we couldn't allocate the space for the log file,
+                * remove the inode by setting its link count back to
+                * zero and bail.
+                */
+               ip->i_nlink = 0;
+               DIP_ASSIGN(ip, nlink, 0);
+               VOP_UNLOCK(vp);
+               vgone(vp);
+
+               return error;
+       }
+
+       /*
+        * Now that we have the place-holder inode for the journal,
+        * we don't need the vnode ever again.
+        */
+       VOP_UNLOCK(vp);
+       vgone(vp);
+
+       return 0;
+}
+
+int
+wapbl_allocate_log_file(struct mount *mp, struct vnode *vp,
+    daddr_t *startp, size_t *countp, uint64_t *extradatap)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       daddr_t addr, indir_addr;
+       off_t logsize;
+       size_t size;
+       int error;
+
+       logsize = 0;
+       /* check if there's a suggested log size */
+       if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG &&
+           fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM)
+               logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+
+       if (vp->v_size > 0) {
+               printf("%s: file size (%" PRId64 ") non zero\n", __func__,
+                   vp->v_size);
+               return EEXIST;
+       }
+       wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size);
+       if (addr == 0) {
+               printf("%s: log not allocated, largest extent is "
+                   "%" PRId64 "MB\n", __func__,
+                   lblktosize(fs, size) / (1024 * 1024));
+               return ENOSPC;
+       }
+
+       logsize = lblktosize(fs, size); /* final log size */
+
+       VTOI(vp)->i_ffs_first_data_blk = addr;
+       VTOI(vp)->i_ffs_first_indir_blk = indir_addr;
+
+       error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED);
+       if (error) {
+               printf("%s: GOP_ALLOC error %d\n", __func__, error);
+               return error;
+       }
+
+       *startp     = fsbtodb(fs, addr);
+       *countp     = btodb(logsize);
+       *extradatap = VTOI(vp)->i_number;
+
+       return 0;
+}
+
+/*
+ * Find a suitable location for the journal in the filesystem.
+ *
+ * Our strategy here is to look for a contiguous block of free space
+ * at least "logsize" large (plus room for any indirect blocks).  We
+ * start at the middle of the filesystem and check each cylinder
+ * group working outwards.  If "logsize" is not available as a single
+ * contiguous chunk, then return the address and size of the largest
+ * chunk found.
+ *
+ * XXX
+ * At what stage should the search fail?  Is it reasonable to give up
+ * if the largest space we could find is less than a quarter of the
+ * requested space?  If the search fails entirely, return a block
+ * address of "0" to indicate this.
+ */
+static void
+wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize,
+    daddr_t *addr, daddr_t *indir_addr, size_t *size)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct fs *fs = ump->um_fs;
+       struct vnode *devvp = ump->um_devvp;
+       struct cg *cgp;
+       struct buf *bp;
+       uint8_t *blksfree;
+       daddr_t blkno, best_addr, start_addr;
+       daddr_t desired_blks, min_desired_blks;
+       daddr_t freeblks, best_blks;
+       int bpcg, cg, error, fixedsize, indir_blks, n, s;
+#ifdef FFS_EI
+       const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+       if (logsize == 0) {
+               fixedsize = 0;  /* We can adjust the size if tight */
+               logsize = lfragtosize(fs, fs->fs_dsize) /
+                   UFS_WAPBL_JOURNAL_SCALE;
+               DPRINTF("suggested log size = %" PRId64 "\n", logsize);
+               logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+               logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+               DPRINTF("adjusted log size = %" PRId64 "\n", logsize);
+       } else {
+               fixedsize = 1;
+               DPRINTF("fixed log size = %" PRId64 "\n", logsize);
+       }
+
+       desired_blks = logsize / fs->fs_bsize;
+       DPRINTF("desired blocks = %" PRId64 "\n", desired_blks);
+
+       /* add in number of indirect blocks needed */
+       indir_blks = 0;
+       if (desired_blks >= NDADDR) {
+               struct indir indirs[NIADDR + 2];
+               int num;
+
+               error = ufs_getlbns(vp, desired_blks, indirs, &num);
+               if (error) {
+                       printf("%s: ufs_getlbns failed, error %d!\n",
+                           __func__, error);
+                       goto bad;
+               }
+
+               switch (num) {
+               case 2:
+                       indir_blks = 1;         /* 1st level indirect */
+                       break;
+               case 3:
+                       indir_blks = 1 +        /* 1st level indirect */
+                           1 +                 /* 2nd level indirect */
+                           indirs[1].in_off + 1; /* extra 1st level indirect */
+                       break;
+               default:
+                       printf("%s: unexpected numlevels %d from ufs_getlbns\n",
+                           __func__, num);
+                       *size = 0;
+                       goto bad;
+               }
+               desired_blks += indir_blks;
+       }
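+       /*
+        * Illustrative example (assuming 4k blocks and UFS1 32-bit block
+        * pointers, so NINDIR = 1024): a 500-block log lies within the
+        * single-indirect range, so ufs_getlbns() reports num == 2 and
+        * we budget one extra block for the first-level indirect block.
+        */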
+       DPRINTF("desired blocks = %" PRId64 " (including indirect)\n",
+           desired_blks);
+
+       /*
+        * If a specific size wasn't requested, allow for a smaller log
+        * if we're really tight for space...
+        */
+       min_desired_blks = desired_blks;
+       if (!fixedsize)
+               min_desired_blks = desired_blks / 4;
+
+       /* Look at number of blocks per CG.  If it's too small, bail early. */
+       bpcg = fragstoblks(fs, fs->fs_fpg);
+       if (min_desired_blks > bpcg) {
+               printf("ffs_wapbl: cylinder group size of %" PRId64 " MB "
+                   " is not big enough for journal\n",
+                   lblktosize(fs, bpcg) / (1024 * 1024));
+               goto bad;
+       }
+
+       /*
+        * Start with the middle cylinder group, and search outwards in
+        * both directions until we either find the requested log size
+        * or reach the start/end of the file system.  If we reach the
+        * start/end without finding enough space for the full requested
+        * log size, use the largest extent found if it is large enough
+        * to satisfy our minimum size.
+        *
+        * XXX
+        * Can we just use the cluster contigsum stuff (esp on UFS2)
+        * here to simplify this search code?
+        */
+       best_addr = 0;
+       best_blks = 0;
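+       /*
+        * The loop below fans out from the middle cylinder group: each
+        * pass grows the step s by one and flips the sign of n, so with
+        * fs_ncg = 7 (for example) it visits cg 3, 2, 4, 1, 5, 0, 6.
+        */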
+       for (cg = fs->fs_ncg / 2, s = 0, n = 1;
+           best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg;
+           s++, n = -n, cg += n * s) {
+               DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg);
+               error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)),
+                   fs->fs_cgsize, FSCRED, 0, &bp);
+               cgp = (struct cg *)bp->b_data;
+               if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
+                       brelse(bp, 0);
+                       continue;
+               }
+
+               blksfree = cg_blksfree(cgp, needswap);
+
+               for (blkno = 0; blkno < bpcg;) {
+                       /* look for next free block */
+                       /* XXX use scanc() and fragtbl[] here? */
+                       for (; blkno < bpcg - min_desired_blks; blkno++)
+                               if (ffs_isblock(fs, blksfree, blkno))
+                                       break;
+
+                       /* past end of search space in this CG? */
+                       if (blkno >= bpcg - min_desired_blks)
+                               break;
+
+                       /* count how many free blocks in this extent */
+                       start_addr = blkno;
+                       for (freeblks = 0; blkno < bpcg; blkno++, freeblks++)
+                               if (!ffs_isblock(fs, blksfree, blkno))
+                                       break;
+
+                       if (freeblks > best_blks) {
+                               best_blks = freeblks;
+                               best_addr = blkstofrags(fs, start_addr) +
+                                   cgbase(fs, cg);
+
+                               if (freeblks >= desired_blks) {
+                                       DPRINTF("found len %" PRId64
+                                           " at offset %" PRId64 " in gc\n",
+                                           freeblks, start_addr);
+                                       break;
+                               }
+                       }
+               }
+               brelse(bp, 0);
+       }
+       DPRINTF("best found len = %" PRId64 ", wanted %" PRId64
+           " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr);
+
+       if (best_blks < min_desired_blks) {
+               *addr = 0;
+               *indir_addr = 0;
+       } else {
+               /* put indirect blocks at start, and data blocks after */
+               *addr = best_addr + blkstofrags(fs, indir_blks);
+               *indir_addr = best_addr;
+       }
+       *size = min(desired_blks, best_blks) - indir_blks;
+       return;
+
+bad:
+       *addr = 0;
+       *indir_addr = 0;
+       *size = 0;
+       return;
+}
similarity index 100%
rename from include/ufs/ffs/fs.h
rename to sys/ufs/ffs/fs.h
diff --git a/sys/ufs/files.ufs b/sys/ufs/files.ufs
new file mode 100644 (file)
index 0000000..7bd59a0
--- /dev/null
@@ -0,0 +1,89 @@
+#      $NetBSD: files.ufs,v 1.27 2011/11/24 15:51:31 ahoka Exp $
+
+deffs                                  FFS
+deffs                                  EXT2FS
+deffs                                  MFS
+deffs                                  LFS
+deffs                                  CHFS
+
+defflag        opt_ffs.h                       FFS_EI FFS_NO_SNAPSHOT APPLE_UFS
+                                       UFS_DIRHASH
+                                       UFS_EXTATTR UFS_EXTATTR_AUTOSTART
+
+defflag        opt_lfs.h                       LFS_KERNEL_RFW
+
+file   ufs/ext2fs/ext2fs_alloc.c       ext2fs
+file   ufs/ext2fs/ext2fs_balloc.c      ext2fs
+file   ufs/ext2fs/ext2fs_bmap.c        ext2fs
+file   ufs/ext2fs/ext2fs_bswap.c       ext2fs
+file   ufs/ext2fs/ext2fs_inode.c       ext2fs
+file   ufs/ext2fs/ext2fs_lookup.c      ext2fs
+file   ufs/ext2fs/ext2fs_readwrite.c   ext2fs
+file   ufs/ext2fs/ext2fs_subr.c        ext2fs
+file   ufs/ext2fs/ext2fs_vfsops.c      ext2fs
+file   ufs/ext2fs/ext2fs_vnops.c       ext2fs
+
+file   ufs/chfs/ebh.c                  chfs
+file   ufs/chfs/chfs_ihash.c           chfs
+file   ufs/chfs/chfs_scan.c            chfs
+file   ufs/chfs/chfs_write.c           chfs
+file   ufs/chfs/chfs_vnode_cache.c     chfs
+file   ufs/chfs/chfs_erase.c           chfs
+file   ufs/chfs/chfs_build.c           chfs
+file   ufs/chfs/chfs_wbuf.c            chfs
+file   ufs/chfs/chfs_vnops.c           chfs
+file   ufs/chfs/chfs_gc.c              chfs
+file   ufs/chfs/chfs_nodeops.c         chfs
+file   ufs/chfs/chfs_malloc.c          chfs
+file   ufs/chfs/chfs_pool.c            chfs
+file   ufs/chfs/debug.c                        chfs
+file   ufs/chfs/chfs_vnode.c           chfs
+file   ufs/chfs/chfs_subr.c            chfs
+file   ufs/chfs/chfs_vfsops.c          chfs
+file   ufs/chfs/chfs_readinode.c       chfs
+
+file   ufs/ffs/ffs_alloc.c             ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_balloc.c            ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_bswap.c             (ffs | mfs) & ffs_ei
+file   ufs/ffs/ffs_inode.c             ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_snapshot.c          ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_subr.c              ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_tables.c            ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_vfsops.c            ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_vnops.c             ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ffs/ffs_wapbl.c             ffs & wapbl
+file   ufs/ffs/ffs_appleufs.c          ffs & apple_ufs
+file   ufs/ffs/ffs_quota2.c            quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+
+file   ufs/lfs/lfs_alloc.c             lfs
+file   ufs/lfs/lfs_balloc.c            lfs
+file   ufs/lfs/lfs_bio.c               lfs
+file   ufs/lfs/lfs_cksum.c             lfs
+file   ufs/lfs/lfs_debug.c             lfs
+file   ufs/lfs/lfs_inode.c             lfs
+file   ufs/lfs/lfs_itimes.c            lfs
+file   ufs/lfs/lfs_rfw.c               lfs & lfs_kernel_rfw
+file   ufs/lfs/lfs_segment.c           lfs
+file   ufs/lfs/lfs_subr.c              lfs
+file   ufs/lfs/lfs_syscalls.c          lfs
+file   ufs/lfs/lfs_vfsops.c            lfs
+file   ufs/lfs/lfs_vnops.c             lfs
+
+file   ufs/mfs/mfs_vfsops.c            mfs
+file   ufs/mfs/mfs_vnops.c             mfs
+file   ufs/mfs/mfs_miniroot.c
+
+file   ufs/ufs/ufs_bmap.c              ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ufs/ufs_dirhash.c           (ffs | lfs | mfs | ext2fs | chfs) & ufs_dirhash
+file   ufs/ufs/ufs_extattr.c           (ffs | mfs) & ufs_extattr
+file   ufs/ufs/ufs_ihash.c             ffs | lfs | mfs | ext2fs
+file   ufs/ufs/ufs_inode.c             ffs | lfs | mfs | ext2fs
+file   ufs/ufs/ufs_lookup.c            ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ufs/ufs_quota.c             (quota | quota2) & (ffs | lfs | mfs | ext2fs | chfs)
+file   ufs/ufs/ufs_quota1.c            quota & (ffs | lfs | mfs | ext2fs | chfs)
+file   ufs/ufs/ufs_quota2.c            quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+file   ufs/ufs/quota1_subr.c
+file   ufs/ufs/quota2_subr.c           quota2 & (ffs | lfs | mfs | ext2fs | chfs)
+file   ufs/ufs/ufs_vfsops.c            ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ufs/ufs_vnops.c             ffs | lfs | mfs | ext2fs | chfs
+file   ufs/ufs/ufs_wapbl.c             ffs & wapbl
diff --git a/sys/ufs/lfs/CHANGES b/sys/ufs/lfs/CHANGES
new file mode 100644 (file)
index 0000000..dfad485
--- /dev/null
@@ -0,0 +1,169 @@
+#   $NetBSD: CHANGES,v 1.5 2005/12/11 12:25:26 christos Exp $
+
+kernel:
+
+- Instead of blindly continuing when it encounters an Inode that is
+  locked by another process, lfs_markv will process the rest of the
+  inodes passed to it and then return EAGAIN.  The cleaner will
+  recognize this and not mark the segment clean.  When the cleaner runs
+  again, the segment containing the (formerly) locked inode will sort high
+  for cleaning, since it is now almost entirely empty.
+
+- A beginning has been made to test keeping atime information in the
+  Ifile, instead of on the inodes.  This should make read-mostly
+  filesystems significantly faster, since the inodes will then remain
+  close to the data blocks on disk; but of course the ifile will be
+  somewhat larger.  This code is not enabled, as it makes the format of
+  IFILEs change.
+
+- The superblock has been broken into two components: an on-disk
+  superblock using fixed-size types, exactly 512 bytes regardless of
+  architecture (or could be enlarged in multiples of the media block
+  size up to LFS_SBPAD); and an in-memory superblock containing the
+  information only useful to a running LFS, including segment pointers,
+  etc.  The superblock checksumming code has been modified to make
+  future changes to the superblock format easier.
+
+- Because of the way that lfs_writeseg works, buffers are freed before
+  they are really written to disk: their contents are copied into large
+  buffers which are written async.  Because the buffer cache does not
+  serve to throttle these writes, and malloced memory is used to hold them,
+  there is a danger of running out of kmem_map.  To avoid this, a new
+  compile-time parameter, LFS_THROTTLE, is used as an upper bound for the
+  number of partial-segments allowed to be in progress writing at any
+  given time.
+
+- If the system crashes between the point that a checkpoint is scheduled
+  for writing and the time that the write completes, the filesystem
+  could be left in an inconsistent state (no valid checkpoints on
+  disk).  To avoid this, we toggle between the first two superblocks
+  when checkpointing, and (if it is indicated that no roll-forward agent
+  exists) do not allow one checkpoint to occur before the last one has
+  completed.  When the filesystem is mounted, it uses the *older* of the
+  first two superblocks.
+
+- DIROPs:
+
+  The design of the LFS includes segregating vnodes used in directory
+  operations, so that they can be written at the same time during a
+  checkpoint, avoiding filesystem inconsistency after a crash.  Code for
+  this was partially written for BSD4.4, but was not complete or enabled.
+
+  In particular, vnodes marked VDIROP could be flushed by getnewvnode at
+  any time, negating the usefulness of marking a vnode VDIROP, since if
+  the filesystem then crashed it would be inconsistent.  Now, when a
+  vnode is first marked VDIROP it is also referenced.  To avoid running
+  out of vnodes, an attempt to mark more than LFS_MAXDIROP vnodes with
+  VDIROP will sleep, and trigger a partial-segment write when no dirops
+  are active.
+
+- LFS maintains a linked list of free inode numbers in the Ifile;
+  accesses to this list are now protected by a simple lock.
+
+- lfs_vfree is not allowed to run while an inode has blocks scheduled
+  for writing, since that could trigger a miscounting in lfs_truncate.
+
+- lfs_balloc now correctly extends fragments, if a block is written
+  beyond the current end-of-file.
+
+- Blocks which have already been gathered into a partial-segment are not
+  allowed to be extended, since if they were, any blocks following them
+  would either be written in the wrong place, or overwrite other blocks.
+
+- The LFS buffer-header accounting, which triggers a partial-segment
+  write if too many buffer-headers are in use by the LFS subsystem, has
+  been expanded to include *bytes* used in LFS buffers as well.
+
+- Reads of the Ifile, which almost always come from the cleaner, can no
+  longer trigger a partial-segment write, since this could cause a
+  deadlock.
+
+- Support has been added (but not tested, and currently disabled by
+  default) for true read-only filesystems.  Currently, if a filesystem
+  is mounted read-only the cleaner can still operate on it, but this
+  obviously would not be true for read-only media.  (I think the
+  original plan was for the roll-forward agent to operate using this
+  "feature"?)
+
+- If a fake buffer is created by lfs_markv and another process draws the
+  same block in and changes it, the fake buffer is now discarded and
+  replaced by the "real" buffer containing the new data.
+
+- An inode which has blocks gathered no longer has IN_MODIFIED set, but
+  still does in fact have dirty blocks attached.  lfs_update will now
+  wait for such an inode's writes to complete before it runs,
+  suppressing a panic in vinvalbuf.
+
+- Many filesystem operations now update the Ifile's mtime, allowing the
+  cleaner to detect when the filesystem is idle, and clean more
+  vigorously during such times (cf. Blackwell et al., 1995).
+
+- When writing a partial-segment, make sure that the current segment is
+  still marked ACTIVE afterward (otherwise the cleaner might try to
+  clean it, since it might well be mostly empty).
+
+- Don't trust the cleaner so much.  Sort the blocks during gathering,
+  even if they came from the cleaner; verify the location of on-disk
+  inodes, even if the cleaner says it knows where they came from.
+
+- The cleaning code (lfs_markv in particular) has been entirely
+  rewritten, and the partial-segment writing code changed to match.
+  Lfs_markv no longer uses its own implementation of lfs_segwrite, but
+  marks inodes with IN_CLEANING to differentiate them from the
+  non-cleaning inodes.  This change fixes numerous problems with the old
+  cleaner, including a buffer overrun, and lost extensions in active
+  fragments.  lfs_bmapv looks up and returns the addresses of inode
+  blocks, so the cleaner can do something intelligent with them.
+
+  If IN_CLEANING is set on an inode during partial-segment write, only fake
+  buffers will be written, and IN_MODIFIED will not be cleared, saving
+  us from a panic in vinvalbuf.  The addition of IN_CLEANING also allows
+  dirops to be active while cleaning is in progress; since otherwise
+  buffers engaged in active dirops might be written ahead of schedule,
+  and cause an inconsistent checkpoint to be written to disk.
+
+  (XXX - even now, DIROP blocks can sometimes be written to disk, if we
+  are cleaning the same blocks as are active?  Grr, I don't see a good
+  solution for this!)
+
+- Added sysctl entries for LFS.  In particular, `writeindir' controls
+  whether indirect blocks are written during non-checkpoint writes.
+  (Since there is no roll-forward agent as yet, there is no penalty in
+  not writing indirect blocks.)
+
+- Wake up the cleaner at fs-unmount time, so it can die (if we unmount
+  and then remount, we could conceivably get more than one cleaner
+  operating at once).
+
+newfs_lfs:
+
+- The ifile inode is now created with the schg flag set, since nothing
+  ever modifies it.  This could be a pain for the roll-forward agent,
+  but since that should really run *before* the filesystem is mounted,
+  I don't care.
+
+- For large disks, it may be necessary to write one or more indirect
+  blocks when the ifile inode is created.  Newlfs has been changed to
+  write the first indirect block, if necessary.  It should instead just
+  build a set of inodes and blocks, and then use the partial-segment
+  writing routine mentioned above to write an ifile of whatever size is
+  desired.
+
+lfs_cleanerd:
+
+- Now writes information to the syslog.
+
+- Can now deal properly with fragments.
+
+- Sometimes, the cleaner can die.  (Why?)  If this happens and we don't
+  notice, we're screwed, since the fs will overfill.  So, the invoked
+  cleaner now spawns itself repeatedly, a la init(8), to ensure that a
+  cleaner is always present to clean the fs.
+
+- Added a flag to clean more actively, not on low load average but
+  filesystem inactivity; a la Blackwell et al., 1995.
+
+fsck_lfs:
+
+- Exists, although it currently cannot actually fix anything (it is a
+  diagnostic tool only at this point).
diff --git a/sys/ufs/lfs/Makefile b/sys/ufs/lfs/Makefile
new file mode 100644 (file)
index 0000000..bb61c7b
--- /dev/null
@@ -0,0 +1,7 @@
+#      $NetBSD: Makefile,v 1.1 1998/06/12 23:23:12 cgd Exp $
+
+INCSDIR= /usr/include/ufs/lfs
+
+INCS=  lfs.h lfs_extern.h
+
+.include <bsd.kinc.mk>
diff --git a/sys/ufs/lfs/README b/sys/ufs/lfs/README
new file mode 100644 (file)
index 0000000..827edbf
--- /dev/null
@@ -0,0 +1,137 @@
+#      $NetBSD: README,v 1.3 1999/03/15 00:46:47 perseant Exp $
+
+#      @(#)README      8.1 (Berkeley) 6/11/93
+
+The file system is reasonably stable...I think.
+
+For details on the implementation, performance and why garbage
+collection always wins, see Dr. Margo Seltzer's thesis available for
+anonymous ftp from toe.cs.berkeley.edu, in the directory
+pub/personal/margo/thesis.ps.Z, or the January 1993 USENIX paper.
+
+----------
+The disk is laid out in segments.  The first segment starts 8K into the
+disk (the first 8K is used for boot information).  Each segment is composed
+of the following:
+
+       An optional super block
+       One or more groups of:
+               segment summary
+               0 or more data blocks
+               0 or more inode blocks
+
+The segment summary and inode/data blocks start after the super block (if
+present), and grow toward the end of the segment.
+
+       _______________________________________________
+       |         |            |         |            |
+       | summary | data/inode | summary | data/inode |
+       |  block  |   blocks   |  block  |   blocks   | ...
+       |_________|____________|_________|____________|
+
+The data/inode blocks following a summary block are described by the
+summary block.  In order to permit the segment to be written in any order
+and in a forward direction only, a checksum is calculated across the
+blocks described by the summary.  Additionally, the summary is checksummed
+and timestamped.  Both of these are intended for recovery; the former is
+to make it easy to determine that it *is* a summary block and the latter
+is to make it easy to determine when recovery is finished for partially
+written segments.  These checksums are also used by the cleaner.
+
+       Summary block (detail)
+       ________________
+       | sum cksum    |
+       | data cksum   |
+       | next segment |
+       | timestamp    |
+       | FINFO count  |
+       | inode count  |
+       | flags        |
+       |______________|
+       |   FINFO-1    | 0 or more file info structures, identifying the
+       |     .        | blocks in the segment.
+       |     .        |
+       |     .        |
+       |   FINFO-N    |
+       |   inode-N    |
+       |     .        |
+       |     .        |
+       |     .        | 0 or more inode daddr_t's, identifying the inode
+       |   inode-1    | blocks in the segment.
+       |______________|
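+
+As a rough C sketch of the summary header above (field names here are
+illustrative; the authoritative declaration is the SEGSUM structure in
+lfs.h):
+
+       struct summary_block {          /* hypothetical sketch */
+               u_int32_t sum_cksum;    /* checksum of the summary */
+               u_int32_t data_cksum;   /* checksum of described blocks */
+               int32_t   next_seg;     /* address of the next segment */
+               u_int32_t timestamp;    /* write time, used in recovery */
+               u_int16_t nfinfo;       /* number of FINFO structures */
+               u_int16_t ninos;        /* number of inode daddr_t's */
+               u_int16_t flags;        /* e.g. partial-segment state */
+       };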
+
+Inode blocks are blocks of on-disk inodes in the same format as those in
+the FFS.  However, spare[0] contains the inode number of the inode so we
+can find a particular inode on a page.  They are packed page_size /
+sizeof(inode) to a block.  Data blocks are exactly as in the FFS.  Both
+inodes and data blocks move around the file system at will.
+
+The file system is described by a super-block which is replicated and
+occurs as the first block of the first and other segments.  (The maximum
+number of super-blocks is MAXNUMSB).  Each super-block maintains a list
+of the disk addresses of all the super-blocks.  The super-block maintains
+a small amount of checkpoint information, essentially just enough to find
+the inode for the IFILE (fs->lfs_idaddr).
+
+The IFILE is visible in the file system, as inode number IFILE_INUM.  It
+contains information shared between the kernel and various user processes.
+
+       Ifile (detail)
+       ________________
+       | cleaner info | Cleaner information per file system.  (Page
+       |              | granularity.)
+       |______________|
+       | segment      | Space available and last modified times per
+       | usage table  | segment.  (Page granularity.)
+       |______________|
+       |   IFILE-1    | Per inode status information: current version #,
+       |     .        | if currently allocated, last access time and
+       |     .        | current disk address of containing inode block.
+       |     .        | If current disk address is LFS_UNUSED_DADDR, the
+       |   IFILE-N    | inode is not in use, and it's on the free list.
+       |______________|
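+
+Each per-inode entry therefore carries roughly this information (a
+hypothetical sketch; the real IFILE structure is declared in lfs.h):
+
+       struct ifile_entry {            /* hypothetical sketch */
+               u_int32_t version;      /* current version number */
+               int32_t   daddr;        /* inode block disk address, or
+                                          LFS_UNUSED_DADDR if free */
+               u_int32_t nextfree;     /* next inode on the free list */
+               u_int32_t atime;        /* last access time */
+       };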
+
+
+First Segment at Creation Time:
+_____________________________________________________________
+|        |       |         |       |       |       |       |
+| 8K pad | Super | summary | inode | ifile | root  | l + f |
+|        | block |         | block |       | dir   | dir   |
+|________|_______|_________|_______|_______|_______|_______|
+         ^
+           Segment starts here.
+
+Some differences from the Sprite LFS implementation.
+
+1. The Sprite LFS implementation placed the ifile metadata and the super block
+   at fixed locations.  This implementation replicates the super block
+   and puts each at a fixed location.  The checkpoint data is divided into
+   two parts -- just enough information to find the IFILE is stored in
+   two of the super blocks, although it is not toggled between them as in
+   the Sprite implementation.  (This was deliberate, to avoid a single
+   point of failure.)  The remaining checkpoint information is treated as
+   a regular file, which means that the cleaner info, the segment usage
+   table and the ifile meta-data are stored in normal log segments.
+   (Tastes great, less filling...)
+
+2. The segment layout is radically different in Sprite; this implementation
+   uses something a lot like network framing, where data/inode blocks are
+   written asynchronously, and a checksum is used to validate any set of
+   summary and data/inode blocks.  Sprite writes summary blocks synchronously
+   after the data/inode blocks have been written and the existence of the
+   summary block validates the data/inode blocks.  This permits us to write
+   everything contiguously, even partial segments and their summaries, whereas
+   Sprite is forced to seek (from the end of the data inode to the summary
+   which lives at the end of the segment).  Additionally, writing the summary
+   synchronously should cost about 1/2 a rotation per summary.
+
+3. Sprite LFS distinguishes between different types of blocks in the segment.
+   Other than inode blocks and data blocks, we don't.
+
+4. Sprite LFS traverses the IFILE looking for free blocks.  We maintain a
+   free list threaded through the IFILE entries.
+
+5. The cleaner runs in user space, as opposed to kernel space.  It shares
+   information with the kernel by reading/writing the IFILE and through
+   cleaner specific system calls.
+
diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO
new file mode 100644 (file)
index 0000000..e86ecdb
--- /dev/null
@@ -0,0 +1,109 @@
+#   $NetBSD: TODO,v 1.10 2005/12/11 12:25:26 christos Exp $
+
+- Lock audit.  Need to check locking for multiprocessor case in particular.
+
+- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it
+  has passed two checkpoints containing zero live bytes.
+
+- Now that our cache is basically all of physical memory, we need to make
+  sure that segwrite is not starving other important things.  Need a way
+  to prioritize which blocks are most important to write, and write only
+  those, saving the rest for later.  Does this change our notion of what
+  a checkpoint is?
+
+- Investigate alternate inode locking strategy: Inode locks are useful
+  for locking against simultaneous changes to inode size (balloc,
+  truncate, write) but because the assignment of disk blocks is also
+  covered by the segment lock, we don't really need to pay attention to
+  the inode lock when writing a segment, right?  If this is true, the
+  locking problem in lfs_{bmapv,markv} goes away and lfs_reserve can go,
+  too.
+
+- Get rid of DEV_BSIZE, pay attention to the media block size at mount time.
+
+- More fs ops need to call lfs_imtime.  Which ones?  (Blackwell et al., 1995)
+
+- lfs_vunref_head exists so that vnodes loaded solely for cleaning can
+  be put back on the *head* of the vnode free list.  Make sure we
+  actually do this, since we now take IN_CLEANING off during segment write.
+
+- The cleaner could be enhanced to be controlled from other processes,
+  and possibly perform additional tasks:
+
+  - Backups.  At a minimum, turn the cleaner off and on to allow
+    effective live backups.  More aggressively, the cleaner itself could
+    be the backup agent, and dump_lfs would merely be a controller.
+
+  - Cleaning time policies.  Be able to tweak the cleaner's thresholds
+    to allow more thorough cleaning during policy-determined idle
+    periods (regardless of actual idleness) or put off until later
+    during short, intensive write periods.
+
+  - File coalescing and placement.  During periods we expect to be idle,
+    coalesce fragmented files into one place on disk for better read
+    performance.  Ideally, move files that have not been accessed in a
+    while to the extremes of the disk, thereby shortening seek times for
+    files that are accessed more frequently (though how the cleaner
+    should communicate "please put this near the beginning or end of the
+    disk" to the kernel is a very good question; flags to lfs_markv?).
+
+  - Versioning.  When it cleans a segment it could write data for files
+    that were less than n versions old to tape or elsewhere.  Perhaps it
+    could even write them back onto the disk, although that requires
+    more thought (and kernel mods).
+
+- Move lfs_countlocked() into vfs_bio.c, to replace count_locked_queue;
+  perhaps keep the name, replace the function.  Could it count referenced
+  vnodes as well, if it was in vfs_subr.c instead?
+
+- Why not delete the lfs_bmapv call, just mark everything dirty that
+  isn't deleted/truncated?  Get some numbers about what percentage of
+  the stuff that the cleaner thinks might be live is live.  If it's
+  high, get rid of lfs_bmapv.
+
+- There is a nasty problem in that it may take *more* room to write the
+  data to clean a segment than is returned by the new segment because of
+  indirect blocks in segment 2 being dirtied by the data being copied
+  into the log from segment 1.  The suggested solution at this point is
+  to detect it when we have no space left on the filesystem, write the
+  extra data into the last segment (leaving no clean ones), make it a
+  checkpoint and shut down the file system for fixing by a utility
+  reading the raw partition.  Argument is that this should never happen
+  and is practically impossible to fix since the cleaner would have to
+  theoretically build a model of the entire filesystem in memory to
+  detect the condition occurring.  A file coalescing cleaner will help
+  avoid the problem, and one that reads/writes from the raw disk could
+  fix it.
+
+- Need to keep vnode v_numoutput up to date for pending writes?
+
+- If we delete a file that's being executed, the version number isn't
+  updated, and fsck_lfs has to figure this out; the case is the same as
+  having an inode that no directory references, so the file should be
+  reattached into lost+found.
+
+- Currently there's no notion of write error checking.
+  + Failed data/inode writes should be rescheduled (kernel level bad blocking).
+  + Failed superblock writes should cause selection of new superblock
+  for checkpointing.
+
+- Future fantasies:
+  - unrm, versioning
+  - transactions
+  - extended cleaner policies (hot/cold data, data placement)
+
+- Problem with the concept of multiple buffer headers referencing the segment:
+  Positives:
+    Don't lock down 1 segment per file system of physical memory.
+    Don't copy from buffers to segment memory.
+    Don't tie down the bus to transfer 1M.
+    Works on controllers supporting less than large transfers.
+    Disk can start writing immediately instead of waiting 1/2 rotation
+        and the full transfer.
+  Negatives:
+    Have to do segment write then segment summary write, since the latter
+    is what verifies that the segment is okay.  (Is there another way
+    to do this?)
+
+- The algorithm for selecting the disk addresses of the super-blocks
+  has to be available to the user program which checks the file system.
similarity index 100%
rename from include/ufs/lfs/lfs.h
rename to sys/ufs/lfs/lfs.h
diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c
new file mode 100644 (file)
index 0000000..8d2baa0
--- /dev/null
@@ -0,0 +1,674 @@
+/*     $NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $    */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_alloc.c 8.4 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.111 2011/06/12 03:36:01 rmind Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/buf.h>
+#include <sys/lock.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/proc.h>
+#include <sys/tree.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+/* Constants for inode free bitmap */
+#define BMSHIFT 5      /* 2 ** 5 = 32 */
+#define BMMASK  ((1 << BMSHIFT) - 1)
+#define SET_BITMAP_FREE(F, I) do { \
+       DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d set\n", (int)(I),  \
+            (int)((I) >> BMSHIFT), (int)((I) & BMMASK)));              \
+       (F)->lfs_ino_bitmap[(I) >> BMSHIFT] |= (1 << ((I) & BMMASK));   \
+} while (0)
+#define CLR_BITMAP_FREE(F, I) do { \
+       DLOG((DLOG_ALLOC, "lfs: ino %d wrd %d bit %d clr\n", (int)(I),  \
+            (int)((I) >> BMSHIFT), (int)((I) & BMMASK)));              \
+       (F)->lfs_ino_bitmap[(I) >> BMSHIFT] &= ~(1 << ((I) & BMMASK));  \
+} while(0)
+
+#define ISSET_BITMAP_FREE(F, I) \
+       ((F)->lfs_ino_bitmap[(I) >> BMSHIFT] & (1 << ((I) & BMMASK)))
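+
+/*
+ * Example (illustrative): inode 70 lives in bitmap word 70 >> 5 = 2
+ * with bit 70 & 31 = 6, so SET_BITMAP_FREE(fs, 70) sets bit 6 of
+ * fs->lfs_ino_bitmap[2].
+ */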
+
+/*
+ * Add a new block to the Ifile, to accommodate future file creations.
+ * Called with the segment lock held.
+ */
+int
+lfs_extend_ifile(struct lfs *fs, kauth_cred_t cred)
+{
+       struct vnode *vp;
+       struct inode *ip;
+       IFILE *ifp;
+       IFILE_V1 *ifp_v1;
+       struct buf *bp, *cbp;
+       int error;
+       daddr_t i, blkno, xmax;
+       ino_t oldlast, maxino;
+       CLEANERINFO *cip;
+
+       ASSERT_SEGLOCK(fs);
+
+       vp = fs->lfs_ivnode;
+       ip = VTOI(vp);
+       blkno = lblkno(fs, ip->i_size);
+       if ((error = lfs_balloc(vp, ip->i_size, fs->lfs_bsize, cred, 0,
+                               &bp)) != 0) {
+               return (error);
+       }
+       ip->i_size += fs->lfs_bsize;
+       ip->i_ffs1_size = ip->i_size;
+       uvm_vnp_setsize(vp, ip->i_size);
+
+       maxino = ((ip->i_size >> fs->lfs_bshift) - fs->lfs_cleansz -
+                 fs->lfs_segtabsz) * fs->lfs_ifpb;
+       fs->lfs_ino_bitmap = (lfs_bm_t *)
+               realloc(fs->lfs_ino_bitmap, ((maxino + BMMASK) >> BMSHIFT) *
+                       sizeof(lfs_bm_t), M_SEGMENT, M_WAITOK);
+       KASSERT(fs->lfs_ino_bitmap != NULL);
+
+       i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *
+               fs->lfs_ifpb;
+
+       /*
+        * We insert the new inodes at the head of the free list.
+        * Under normal circumstances, the free list is empty here,
+        * so we are also incidentally placing them at the end (which
+        * we must do if we are to keep them in order).
+        */
+       LFS_GET_HEADFREE(fs, cip, cbp, &oldlast);
+       LFS_PUT_HEADFREE(fs, cip, cbp, i);
+#ifdef DIAGNOSTIC
+       if (fs->lfs_freehd == LFS_UNUSED_INUM)
+               panic("inode 0 allocated [2]");
+#endif /* DIAGNOSTIC */
+       xmax = i + fs->lfs_ifpb;
+
+       if (fs->lfs_version == 1) {
+               for (ifp_v1 = (IFILE_V1 *)bp->b_data; i < xmax; ++ifp_v1) {
+                       SET_BITMAP_FREE(fs, i);
+                       ifp_v1->if_version = 1;
+                       ifp_v1->if_daddr = LFS_UNUSED_DADDR;
+                       ifp_v1->if_nextfree = ++i;
+               }
+               ifp_v1--;
+               ifp_v1->if_nextfree = oldlast;
+       } else {
+               for (ifp = (IFILE *)bp->b_data; i < xmax; ++ifp) {
+                       SET_BITMAP_FREE(fs, i);
+                       ifp->if_version = 1;
+                       ifp->if_daddr = LFS_UNUSED_DADDR;
+                       ifp->if_nextfree = ++i;
+               }
+               ifp--;
+               ifp->if_nextfree = oldlast;
+       }
+       LFS_PUT_TAILFREE(fs, cip, cbp, xmax - 1);
+
+       (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+       return 0;
+}
+
+/* Allocate a new inode. */
+/* ARGSUSED */
+/* VOP_BWRITE 2i times */
+int
+lfs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
+    struct vnode **vpp)
+{
+       struct lfs *fs;
+       struct buf *bp, *cbp;
+       struct ifile *ifp;
+       ino_t new_ino;
+       int error;
+       int new_gen;
+       CLEANERINFO *cip;
+
+       fs = VTOI(pvp)->i_lfs;
+       if (fs->lfs_ronly)
+               return EROFS;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       lfs_seglock(fs, SEGM_PROT);
+       vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE);
+
+       /* Get the head of the freelist. */
+       LFS_GET_HEADFREE(fs, cip, cbp, &new_ino);
+       KASSERT(new_ino != LFS_UNUSED_INUM && new_ino != LFS_IFILE_INUM);
+
+       DLOG((DLOG_ALLOC, "lfs_valloc: allocate inode %lld\n",
+            (long long)new_ino));
+
+       /*
+        * Remove the inode from the free list and write the new start
+        * of the free list into the superblock.
+        */
+       CLR_BITMAP_FREE(fs, new_ino);
+       LFS_IENTRY(ifp, fs, new_ino, bp);
+       if (ifp->if_daddr != LFS_UNUSED_DADDR)
+               panic("lfs_valloc: inuse inode %llu on the free list",
+                   (unsigned long long)new_ino);
+       LFS_PUT_HEADFREE(fs, cip, cbp, ifp->if_nextfree);
+       DLOG((DLOG_ALLOC, "lfs_valloc: headfree %lld -> %lld\n",
+            (long long)new_ino, (long long)ifp->if_nextfree));
+
+       new_gen = ifp->if_version; /* version was updated by vfree */
+       brelse(bp, 0);
+
+       /* Extend IFILE so that the next lfs_valloc will succeed. */
+       if (fs->lfs_freehd == LFS_UNUSED_INUM) {
+               if ((error = lfs_extend_ifile(fs, cred)) != 0) {
+                       LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
+                       VOP_UNLOCK(fs->lfs_ivnode);
+                       lfs_segunlock(fs);
+                       return error;
+               }
+       }
+#ifdef DIAGNOSTIC
+       if (fs->lfs_freehd == LFS_UNUSED_INUM)
+               panic("inode 0 allocated [3]");
+#endif /* DIAGNOSTIC */
+
+       /* Set superblock modified bit and increment file count. */
+       mutex_enter(&lfs_lock);
+       fs->lfs_fmod = 1;
+       mutex_exit(&lfs_lock);
+       ++fs->lfs_nfiles;
+
+       VOP_UNLOCK(fs->lfs_ivnode);
+       lfs_segunlock(fs);
+
+       return lfs_ialloc(fs, pvp, new_ino, new_gen, vpp);
+}
+
+/*
+ * Finish allocating a new inode, given an inode and generation number.
+ */
+int
+lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,
+          struct vnode **vpp)
+{
+       struct inode *ip;
+       struct vnode *vp;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       vp = *vpp;
+       mutex_enter(&ufs_hashlock);
+       /* Create an inode to associate with the vnode. */
+       lfs_vcreate(pvp->v_mount, new_ino, vp);
+
+       ip = VTOI(vp);
+       mutex_enter(&lfs_lock);
+       LFS_SET_UINO(ip, IN_CHANGE);
+       mutex_exit(&lfs_lock);
+       /* on-disk structure has been zeroed out by lfs_vcreate */
+       ip->i_din.ffs1_din->di_inumber = new_ino;
+
+       /* Note no blocks yet */
+       ip->i_lfs_hiblk = -1;
+
+       /* Set a new generation number for this inode. */
+       if (new_gen) {
+               ip->i_gen = new_gen;
+               ip->i_ffs1_gen = new_gen;
+       }
+
+       /* Insert into the inode hash table. */
+       ufs_ihashins(ip);
+       mutex_exit(&ufs_hashlock);
+
+       ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, vpp);
+       vp = *vpp;
+       ip = VTOI(vp);
+
+       memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize));
+
+       uvm_vnp_setsize(vp, 0);
+       lfs_mark_vnode(vp);
+       genfs_node_init(vp, &lfs_genfsops);
+       vref(ip->i_devvp);
+       return (0);
+}
+
+/* Create a new vnode/inode pair and initialize what fields we can. */
+void
+lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp)
+{
+       struct inode *ip;
+       struct ufs1_dinode *dp;
+       struct ufsmount *ump;
+
+       /* Get a pointer to the private mount structure. */
+       ump = VFSTOUFS(mp);
+
+       ASSERT_NO_SEGLOCK(ump->um_lfs);
+
+       /* Initialize the inode. */
+       ip = pool_get(&lfs_inode_pool, PR_WAITOK);
+       memset(ip, 0, sizeof(*ip));
+       dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
+       memset(dp, 0, sizeof(*dp));
+       ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
+       memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
+       vp->v_data = ip;
+       ip->i_din.ffs1_din = dp;
+       ip->i_ump = ump;
+       ip->i_vnode = vp;
+       ip->i_devvp = ump->um_devvp;
+       ip->i_dev = ump->um_dev;
+       ip->i_number = dp->di_inumber = ino;
+       ip->i_lfs = ump->um_lfs;
+       ip->i_lfs_effnblks = 0;
+       SPLAY_INIT(&ip->i_lfs_lbtree);
+       ip->i_lfs_nbtree = 0;
+       LIST_INIT(&ip->i_lfs_segdhd);
+#ifdef QUOTA
+       ufsquota_init(ip);
+#endif
+}
+
+#if 0
+/*
+ * Find the highest-numbered allocated inode.
+ * This will be used to shrink the Ifile.
+ */
+static inline ino_t
+lfs_last_alloc_ino(struct lfs *fs)
+{
+       ino_t ino, maxino;
+
+       maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) -
+                 fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+       for (ino = maxino - 1; ino > LFS_UNUSED_INUM; --ino) {
+               if (ISSET_BITMAP_FREE(fs, ino) == 0)
+                       break;
+       }
+       return ino;
+}
+#endif
+
+/*
+ * Find the previous (next lowest numbered) free inode, if any.
+ * If there is none, return LFS_UNUSED_INUM.
+ */
+static inline ino_t
+lfs_freelist_prev(struct lfs *fs, ino_t ino)
+{
+       ino_t tino, bound, bb, freehdbb;
+
+       if (fs->lfs_freehd == LFS_UNUSED_INUM)   /* No free inodes at all */
+               return LFS_UNUSED_INUM;
+
+       /* Search our own word first */
+       bound = ino & ~BMMASK;
+       for (tino = ino - 1; tino >= bound && tino > LFS_UNUSED_INUM; tino--)
+               if (ISSET_BITMAP_FREE(fs, tino))
+                       return tino;
+       /* If there are no lower words to search, just return */
+       if (ino >> BMSHIFT == 0)
+               return LFS_UNUSED_INUM;
+
+       /*
+        * Find a word with a free inode in it.  We have to be a bit
+        * careful here since ino_t is unsigned.
+        */
+       freehdbb = (fs->lfs_freehd >> BMSHIFT);
+       for (bb = (ino >> BMSHIFT) - 1; bb >= freehdbb && bb > 0; --bb)
+               if (fs->lfs_ino_bitmap[bb])
+                       break;
+       if (fs->lfs_ino_bitmap[bb] == 0)
+               return LFS_UNUSED_INUM;
+
+       /* Search the word we found */
+       for (tino = (bb << BMSHIFT) | BMMASK; tino >= (bb << BMSHIFT) &&
+            tino > LFS_UNUSED_INUM; tino--)
+               if (ISSET_BITMAP_FREE(fs, tino))
+                       break;
+
+       if (tino <= LFS_IFILE_INUM)
+               tino = LFS_UNUSED_INUM;
+
+       return tino;
+}
+
+/* Free an inode. */
+/* ARGSUSED */
+/* VOP_BWRITE 2i times */
+int
+lfs_vfree(struct vnode *vp, ino_t ino, int mode)
+{
+       SEGUSE *sup;
+       CLEANERINFO *cip;
+       struct buf *cbp, *bp;
+       struct ifile *ifp;
+       struct inode *ip;
+       struct lfs *fs;
+       daddr_t old_iaddr;
+       ino_t otail;
+
+       /* Get the inode number and file system. */
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+       ino = ip->i_number;
+
+       ASSERT_NO_SEGLOCK(fs);
+       DLOG((DLOG_ALLOC, "lfs_vfree: free ino %lld\n", (long long)ino));
+
+       /* Drain of pending writes */
+       mutex_enter(vp->v_interlock);
+       while (fs->lfs_version > 1 && WRITEINPROG(vp)) {
+               cv_wait(&vp->v_cv, vp->v_interlock);
+       }
+       mutex_exit(vp->v_interlock);
+
+       lfs_seglock(fs, SEGM_PROT);
+       vn_lock(fs->lfs_ivnode, LK_EXCLUSIVE);
+
+       lfs_unmark_vnode(vp);
+       mutex_enter(&lfs_lock);
+       if (vp->v_uflag & VU_DIROP) {
+               vp->v_uflag &= ~VU_DIROP;
+               --lfs_dirvcount;
+               --fs->lfs_dirvcount;
+               TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+               wakeup(&fs->lfs_dirvcount);
+               wakeup(&lfs_dirvcount);
+               mutex_exit(&lfs_lock);
+               lfs_vunref(vp);
+
+               /*
+                * If this inode is not going to be written any more, any
+                * segment accounting left over from its truncation needs
+                * to occur at the end of the next dirops flush.  Attach
+                * them to the fs-wide list for that purpose.
+                */
+               if (LIST_FIRST(&ip->i_lfs_segdhd) != NULL) {
+                       struct segdelta *sd;
+       
+                       while((sd = LIST_FIRST(&ip->i_lfs_segdhd)) != NULL) {
+                               LIST_REMOVE(sd, list);
+                               LIST_INSERT_HEAD(&fs->lfs_segdhd, sd, list);
+                       }
+               }
+       } else {
+               /*
+                * If it's not a dirop, we can finalize right away.
+                */
+               mutex_exit(&lfs_lock);
+               lfs_finalize_ino_seguse(fs, ip);
+       }
+
+       mutex_enter(&lfs_lock);
+       LFS_CLR_UINO(ip, IN_ACCESSED|IN_CLEANING|IN_MODIFIED);
+       mutex_exit(&lfs_lock);
+       ip->i_flag &= ~IN_ALLMOD;
+       ip->i_lfs_iflags |= LFSI_DELETED;
+       
+       /*
+        * Set the ifile's inode entry to unused, increment its version number
+        * and link it onto the free chain.
+        */
+       SET_BITMAP_FREE(fs, ino);
+       LFS_IENTRY(ifp, fs, ino, bp);
+       old_iaddr = ifp->if_daddr;
+       ifp->if_daddr = LFS_UNUSED_DADDR;
+       ++ifp->if_version;
+       if (fs->lfs_version == 1) {
+               LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
+               LFS_PUT_HEADFREE(fs, cip, cbp, ino);
+               (void) LFS_BWRITE_LOG(bp); /* Ifile */
+       } else {
+               ino_t tino, onf;
+
+               ifp->if_nextfree = LFS_UNUSED_INUM;
+               (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+               tino = lfs_freelist_prev(fs, ino);
+               if (tino == LFS_UNUSED_INUM) {
+                       /* Nothing free below us, put us on the head */
+                       LFS_IENTRY(ifp, fs, ino, bp);
+                       LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
+                       LFS_PUT_HEADFREE(fs, cip, cbp, ino);
+                       DLOG((DLOG_ALLOC, "lfs_vfree: headfree %lld -> %lld\n",
+                            (long long)ifp->if_nextfree, (long long)ino));
+                       LFS_BWRITE_LOG(bp); /* Ifile */
+
+                       /* If the list was empty, set tail too */
+                       LFS_GET_TAILFREE(fs, cip, cbp, &otail);
+                       if (otail == LFS_UNUSED_INUM) {
+                               LFS_PUT_TAILFREE(fs, cip, cbp, ino);
+                               DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
+                                     "-> %lld\n", (long long)otail,
+                                     (long long)ino));
+                       }
+               } else {
+                       /*
+                        * Insert this inode into the list after tino.
+                        * We hold the segment lock so we don't have to
+                        * worry about blocks being written out of order.
+                        */
+                       DLOG((DLOG_ALLOC, "lfs_vfree: insert ino %lld "
+                             " after %lld\n", ino, tino));
+
+                       LFS_IENTRY(ifp, fs, tino, bp);
+                       onf = ifp->if_nextfree;
+                       ifp->if_nextfree = ino;
+                       LFS_BWRITE_LOG(bp);     /* Ifile */
+
+                       LFS_IENTRY(ifp, fs, ino, bp);
+                       ifp->if_nextfree = onf;
+                       LFS_BWRITE_LOG(bp);     /* Ifile */
+
+                       /* If we're last, put us on the tail */
+                       if (onf == LFS_UNUSED_INUM) {
+                               LFS_GET_TAILFREE(fs, cip, cbp, &otail);
+                               LFS_PUT_TAILFREE(fs, cip, cbp, ino);
+                               DLOG((DLOG_ALLOC, "lfs_vfree: tailfree %lld "
+                                     "-> %lld\n", (long long)otail,
+                                     (long long)ino));
+                       }
+               }
+       }
+#ifdef DIAGNOSTIC
+       if (ino == LFS_UNUSED_INUM) {
+               panic("inode 0 freed");
+       }
+#endif /* DIAGNOSTIC */
+       if (old_iaddr != LFS_UNUSED_DADDR) {
+               LFS_SEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp);
+#ifdef DIAGNOSTIC
+               if (sup->su_nbytes < sizeof (struct ufs1_dinode)) {
+                       printf("lfs_vfree: negative byte count"
+                              " (segment %" PRIu32 " short by %d)\n",
+                              dtosn(fs, old_iaddr),
+                              (int)sizeof (struct ufs1_dinode) -
+                                   sup->su_nbytes);
+                       panic("lfs_vfree: negative byte count");
+                       sup->su_nbytes = sizeof (struct ufs1_dinode);
+               }
+#endif
+               sup->su_nbytes -= sizeof (struct ufs1_dinode);
+               LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */
+       }
+
+       /* Set superblock modified bit and decrement file count. */
+       mutex_enter(&lfs_lock);
+       fs->lfs_fmod = 1;
+       mutex_exit(&lfs_lock);
+       --fs->lfs_nfiles;
+
+       VOP_UNLOCK(fs->lfs_ivnode);
+       lfs_segunlock(fs);
+
+       return (0);
+}
+
+/*
+ * Sort the freelist and set up the free-inode bitmap.
+ * To be called by lfs_mountfs().
+ */
+void
+lfs_order_freelist(struct lfs *fs)
+{
+       CLEANERINFO *cip;
+       IFILE *ifp = NULL;
+       struct buf *bp;
+       ino_t ino, firstino, lastino, maxino;
+#ifdef notyet
+       struct vnode *vp;
+#endif
+       
+       ASSERT_NO_SEGLOCK(fs);
+       lfs_seglock(fs, SEGM_PROT);
+
+       maxino = ((fs->lfs_ivnode->v_size >> fs->lfs_bshift) -
+                 fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+       fs->lfs_ino_bitmap = (lfs_bm_t *)
+               malloc(((maxino + BMMASK) >> BMSHIFT) * sizeof(lfs_bm_t),
+                      M_SEGMENT, M_WAITOK | M_ZERO);
+       KASSERT(fs->lfs_ino_bitmap != NULL);
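+       /*
+        * Sizing, roughly: the ifile holds lfs_cleansz cleaner-info and
+        * lfs_segtabsz segment-table blocks followed by lfs_ifpb inode
+        * entries per block, giving maxino entries in all; the
+        * (maxino + BMMASK) >> BMSHIFT term rounds the bitmap up to
+        * whole lfs_bm_t words.
+        */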
+
+       firstino = lastino = LFS_UNUSED_INUM;
+       for (ino = 0; ino < maxino; ino++) {
+               if (ino % fs->lfs_ifpb == 0)
+                       LFS_IENTRY(ifp, fs, ino, bp);
+               else
+                       ++ifp;
+
+               /* Don't put zero or ifile on the free list */
+               if (ino == LFS_UNUSED_INUM || ino == LFS_IFILE_INUM)
+                       continue;
+
+#ifdef notyet
+               /* Address orphaned files */
+               if (ifp->if_nextfree == LFS_ORPHAN_NEXTFREE &&
+                   VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp) == 0) {
+                       lfs_truncate(vp, 0, 0, NOCRED);
+                       vput(vp);
+                       LFS_SEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp);
+                       KASSERT(sup->su_nbytes >= DINODE1_SIZE);
+                       sup->su_nbytes -= DINODE1_SIZE;
+                       LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ifp->if_daddr), bp);
+
+                       /* Set up to fall through to next section */
+                       ifp->if_daddr = LFS_UNUSED_DADDR;
+                       LFS_BWRITE_LOG(bp);
+                       LFS_IENTRY(ifp, fs, ino, bp);
+               }
+#endif
+
+               if (ifp->if_daddr == LFS_UNUSED_DADDR) {
+                       if (firstino == LFS_UNUSED_INUM)
+                               firstino = ino;
+                       else {
+                               brelse(bp, 0);
+
+                               LFS_IENTRY(ifp, fs, lastino, bp);
+                               ifp->if_nextfree = ino;
+                               LFS_BWRITE_LOG(bp);
+                               
+                               LFS_IENTRY(ifp, fs, ino, bp);
+                       }
+                       lastino = ino;
+
+                       SET_BITMAP_FREE(fs, ino);
+               }
+
+               if ((ino + 1) % fs->lfs_ifpb == 0)
+                       brelse(bp, 0);
+       }
+
+       LFS_PUT_HEADFREE(fs, cip, bp, firstino);
+       LFS_PUT_TAILFREE(fs, cip, bp, lastino);
+
+       lfs_segunlock(fs);
+}
+
+void
+lfs_orphan(struct lfs *fs, ino_t ino)
+{
+       IFILE *ifp;
+       struct buf *bp;
+
+       LFS_IENTRY(ifp, fs, ino, bp);
+       ifp->if_nextfree = LFS_ORPHAN_NEXTFREE;
+       LFS_BWRITE_LOG(bp);
+}
diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c
new file mode 100644
index 0000000..d46ba05
--- /dev/null
@@ -0,0 +1,582 @@
+/*     $NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $  */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_balloc.c        8.4 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.70 2011/07/11 08:27:40 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/tree.h>
+#include <sys/trace.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, kauth_cred_t);
+
+u_int64_t locked_fakequeue_count;
+
+/*
+ * Allocate a block, and do inode and filesystem block accounting for it
+ * and for any indirect blocks that may need to be created in order for
+ * this block to be created.
+ *
+ * Blocks which have never been accounted for (i.e., which "do not exist")
+ * have disk address 0, which is translated by ufs_bmap to the special value
+ * UNASSIGNED == -1, as in the historical UFS.
+ *
+ * Blocks which have been accounted for but which have not yet been written
+ * to disk are given the new special disk address UNWRITTEN == -2, so that
+ * they can be differentiated from completely new blocks.
+ */
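+/*
+ * For example, roughly: looking up a never-written block, ufs_bmap sees
+ * on-disk address 0 and reports UNASSIGNED; once lfs_balloc has done the
+ * accounting for the block, it is marked UNWRITTEN until the segment
+ * writer assigns it a real disk address.
+ */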
+/* VOP_BWRITE NIADDR+2 times */
+int
+lfs_balloc(struct vnode *vp, off_t startoffset, int iosize, kauth_cred_t cred,
+    int flags, struct buf **bpp)
+{
+       int offset;
+       daddr_t daddr, idaddr;
+       struct buf *ibp, *bp;
+       struct inode *ip;
+       struct lfs *fs;
+       struct indir indirs[NIADDR+2], *idp;
+       daddr_t lbn, lastblock;
+       int bcount;
+       int error, frags, i, nsize, osize, num;
+
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+       offset = blkoff(fs, startoffset);
+       KASSERT(iosize <= fs->lfs_bsize);
+       lbn = lblkno(fs, startoffset);
+       /* (void)lfs_check(vp, lbn, 0); */
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+
+       /*
+        * Three cases: it's a block beyond the end of the file, it's a block
+        * in the file that may or may not have been assigned a disk address,
+        * or we're writing an entire block.
+        *
+        * Note, if the daddr is UNWRITTEN, the block already exists in
+        * the cache (it was read or written earlier).  If so, make sure
+        * we don't count it as a new block or zero out its contents.  If
+        * it did not already exist, make sure we allocate any necessary
+        * indirect blocks.
+        *
+        * If we are writing a block beyond the end of the file, we need to
+        * check if the old last block was a fragment.  If it was, we need
+        * to rewrite it.
+        */
+
+       if (bpp)
+               *bpp = NULL;
+
+       /* Check for block beyond end of file and fragment extension needed. */
+       lastblock = lblkno(fs, ip->i_size);
+       if (lastblock < NDADDR && lastblock < lbn) {
+               osize = blksize(fs, ip, lastblock);
+               if (osize < fs->lfs_bsize && osize > 0) {
+                       if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
+                                                   lastblock,
+                                                   (bpp ? &bp : NULL), cred)))
+                               return (error);
+                       ip->i_ffs1_size = ip->i_size =
+                           (lastblock + 1) * fs->lfs_bsize;
+                       uvm_vnp_setsize(vp, ip->i_size);
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (bpp)
+                               (void) VOP_BWRITE(bp->b_vp, bp);
+               }
+       }
+
+       /*
+        * If the block we are writing is a direct block, it's the last
+        * block in the file, and offset + iosize is less than a full
+        * block, we can write one or more fragments.  There are two cases:
+        * either the block is brand new and we should allocate it at the
+        * correct size, or it already exists, contains some fragments,
+        * and we may need to extend it.
+        */
+       if (lbn < NDADDR && lblkno(fs, ip->i_size) <= lbn) {
+               osize = blksize(fs, ip, lbn);
+               nsize = fragroundup(fs, offset + iosize);
+               if (lblktosize(fs, lbn) >= ip->i_size) {
+                       /* Brand new block or fragment */
+                       frags = numfrags(fs, nsize);
+                       if (!ISSPACE(fs, frags, cred))
+                               return ENOSPC;
+                       if (bpp) {
+                               *bpp = bp = getblk(vp, lbn, nsize, 0, 0);
+                               bp->b_blkno = UNWRITTEN;
+                               if (flags & B_CLRBUF)
+                                       clrbuf(bp);
+                       }
+                       ip->i_lfs_effnblks += frags;
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_bfree -= frags;
+                       mutex_exit(&lfs_lock);
+                       ip->i_ffs1_db[lbn] = UNWRITTEN;
+               } else {
+                       if (nsize <= osize) {
+                               /* No need to extend */
+                               if (bpp && (error = bread(vp, lbn, osize,
+                                   NOCRED, 0, &bp)))
+                                       return error;
+                       } else {
+                               /* Extend existing block */
+                               if ((error =
+                                    lfs_fragextend(vp, osize, nsize, lbn,
+                                                   (bpp ? &bp : NULL), cred)))
+                                       return error;
+                       }
+                       if (bpp)
+                               *bpp = bp;
+               }
+               return 0;
+       }
+
+       error = ufs_bmaparray(vp, lbn, &daddr, &indirs[0], &num, NULL, NULL);
+       if (error)
+               return (error);
+
+       daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
+       KASSERT(daddr <= LFS_MAX_DADDR);
+
+       /*
+        * Do byte accounting all at once, so we can gracefully fail *before*
+        * we start assigning blocks.
+        */
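+       /*
+        * A sketch of the accounting below: um_seqinc is the number of
+        * frags per file-system block, so we charge one block's worth
+        * for the data block if it is UNASSIGNED plus one for each
+        * indirect block that does not yet exist; failing the ISSPACE()
+        * check here means there is nothing to roll back.
+        */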
+       frags = VFSTOUFS(vp->v_mount)->um_seqinc;
+       bcount = 0;
+       if (daddr == UNASSIGNED) {
+               bcount = frags;
+       }
+       for (i = 1; i < num; ++i) {
+               if (!indirs[i].in_exists) {
+                       bcount += frags;
+               }
+       }
+       if (ISSPACE(fs, bcount, cred)) {
+               mutex_enter(&lfs_lock);
+               fs->lfs_bfree -= bcount;
+               mutex_exit(&lfs_lock);
+               ip->i_lfs_effnblks += bcount;
+       } else {
+               return ENOSPC;
+       }
+
+       if (daddr == UNASSIGNED) {
+               if (num > 0 && ip->i_ffs1_ib[indirs[0].in_off] == 0) {
+                       ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
+               }
+
+               /*
+                * Create new indirect blocks if necessary
+                */
+               if (num > 1) {
+                       idaddr = ip->i_ffs1_ib[indirs[0].in_off];
+                       for (i = 1; i < num; ++i) {
+                               ibp = getblk(vp, indirs[i].in_lbn,
+                                   fs->lfs_bsize, 0,0);
+                               if (!indirs[i].in_exists) {
+                                       clrbuf(ibp);
+                                       ibp->b_blkno = UNWRITTEN;
+                               } else if (!(ibp->b_oflags & (BO_DELWRI | BO_DONE))) {
+                                       ibp->b_blkno = fsbtodb(fs, idaddr);
+                                       ibp->b_flags |= B_READ;
+                                       VOP_STRATEGY(vp, ibp);
+                                       biowait(ibp);
+                               }
+                               /*
+                                * This block exists, but the next one may not.
+                                * If that is the case mark it UNWRITTEN to keep
+                                * the accounting straight.
+                                */
+                               /* XXX ondisk32 */
+                               if (((int32_t *)ibp->b_data)[indirs[i].in_off] == 0)
+                                       ((int32_t *)ibp->b_data)[indirs[i].in_off] =
+                                               UNWRITTEN;
+                               /* XXX ondisk32 */
+                               idaddr = ((int32_t *)ibp->b_data)[indirs[i].in_off];
+#ifdef DEBUG
+                               if (vp == fs->lfs_ivnode) {
+                                       LFS_ENTER_LOG("balloc", __FILE__,
+                                               __LINE__, indirs[i].in_lbn,
+                                               ibp->b_flags, curproc->p_pid);
+                               }
+#endif
+                               if ((error = VOP_BWRITE(ibp->b_vp, ibp)))
+                                       return error;
+                       }
+               }
+       }
+
+       /*
+        * Get the existing block from the cache, if requested.
+        */
+       if (bpp)
+               *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);
+
+       /*
+        * Do accounting on blocks that represent pages.
+        */
+       if (!bpp)
+               lfs_register_block(vp, lbn);
+
+       /*
+        * The block we are writing may be a brand new block,
+        * in which case we need to do accounting.
+        *
+        * We can tell a truly new block because ufs_bmaparray will say
+        * it is UNASSIGNED.  Once we allocate it we will assign it the
+        * disk address UNWRITTEN.
+        */
+       if (daddr == UNASSIGNED) {
+               if (bpp) {
+                       if (flags & B_CLRBUF)
+                               clrbuf(bp);
+
+                       /* Note the new address */
+                       bp->b_blkno = UNWRITTEN;
+               }
+
+               switch (num) {
+                   case 0:
+                       ip->i_ffs1_db[lbn] = UNWRITTEN;
+                       break;
+                   case 1:
+                       ip->i_ffs1_ib[indirs[0].in_off] = UNWRITTEN;
+                       break;
+                   default:
+                       idp = &indirs[num - 1];
+                       if (bread(vp, idp->in_lbn, fs->lfs_bsize, NOCRED,
+                                 B_MODIFY, &ibp))
+                               panic("lfs_balloc: bread bno %lld",
+                                   (long long)idp->in_lbn);
+                       /* XXX ondisk32 */
+                       ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
+#ifdef DEBUG
+                       if (vp == fs->lfs_ivnode) {
+                               LFS_ENTER_LOG("balloc", __FILE__,
+                                       __LINE__, idp->in_lbn,
+                                       ibp->b_flags, curproc->p_pid);
+                       }
+#endif
+                       VOP_BWRITE(ibp->b_vp, ibp);
+               }
+       } else if (bpp && !(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
+               /*
+                * Not a brand new block, also not in the cache;
+                * read it in from disk.
+                */
+               if (iosize == fs->lfs_bsize)
+                       /* Optimization: I/O is unnecessary. */
+                       bp->b_blkno = daddr;
+               else {
+                       /*
+                        * We need to read the block to preserve the
+                        * existing bytes.
+                        */
+                       bp->b_blkno = daddr;
+                       bp->b_flags |= B_READ;
+                       VOP_STRATEGY(vp, bp);
+                       return (biowait(bp));
+               }
+       }
+
+       return (0);
+}
+
+/* VOP_BWRITE 1 time */
+int
+lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf **bpp,
+    kauth_cred_t cred)
+{
+       struct inode *ip;
+       struct lfs *fs;
+       long frags;
+       int error;
+       extern long locked_queue_bytes;
+       size_t obufsize;
+
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+       frags = (long)numfrags(fs, nsize - osize);
+       error = 0;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       /*
+        * Get the seglock so we don't enlarge blocks while a segment
+        * is being written.  If we're called with bpp==NULL, though,
+        * we are only pretending to change a buffer, so we don't have to
+        * lock.
+        */
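+       /*
+        * Rough flow from here: with bpp set, take lfs_fraglock as
+        * reader and check for space; if extending a delayed-write
+        * buffer would overcommit lfs_avail, release the buffer and the
+        * lock, wait for the cleaner in lfs_availwait(), and restart
+        * at "top".
+        */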
+    top:
+       if (bpp) {
+               rw_enter(&fs->lfs_fraglock, RW_READER);
+               LFS_DEBUG_COUNTLOCKED("frag");
+       }
+
+       if (!ISSPACE(fs, frags, cred)) {
+               error = ENOSPC;
+               goto out;
+       }
+
+       /*
+        * If we are not asked to actually return the block, all we need
+        * to do is allocate space for it.  UBC will handle dirtying the
+        * appropriate things and making sure it all goes to disk.
+        * Don't bother to read in that case.
+        */
+       if (bpp && (error = bread(vp, lbn, osize, NOCRED, 0, bpp))) {
+               brelse(*bpp, 0);
+               goto out;
+       }
+#ifdef QUOTA
+       if ((error = chkdq(ip, frags, cred, 0))) {
+               if (bpp)
+                       brelse(*bpp, 0);
+               goto out;
+       }
+#endif
+       /*
+        * Adjust accounting for lfs_avail.  If there's not enough room,
+        * we will have to wait for the cleaner, which we can't do while
+        * holding a block busy or while holding the seglock.  In that case,
+        * release both and start over after waiting.
+        */
+
+       if (bpp && ((*bpp)->b_oflags & BO_DELWRI)) {
+               if (!lfs_fits(fs, frags)) {
+                       if (bpp)
+                               brelse(*bpp, 0);
+#ifdef QUOTA
+                       chkdq(ip, -frags, cred, 0);
+#endif
+                       rw_exit(&fs->lfs_fraglock);
+                       lfs_availwait(fs, frags);
+                       goto top;
+               }
+               fs->lfs_avail -= frags;
+       }
+
+       mutex_enter(&lfs_lock);
+       fs->lfs_bfree -= frags;
+       mutex_exit(&lfs_lock);
+       ip->i_lfs_effnblks += frags;
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+
+       if (bpp) {
+               obufsize = (*bpp)->b_bufsize;
+               allocbuf(*bpp, nsize, 1);
+
+               /* Adjust locked-list accounting */
+               if (((*bpp)->b_flags & B_LOCKED) != 0 &&
+                   (*bpp)->b_iodone == NULL) {
+                       mutex_enter(&lfs_lock);
+                       locked_queue_bytes += (*bpp)->b_bufsize - obufsize;
+                       mutex_exit(&lfs_lock);
+               }
+
+               memset((char *)((*bpp)->b_data) + osize, 0, (u_int)(nsize - osize));
+       }
+
+    out:
+       if (bpp) {
+               rw_exit(&fs->lfs_fraglock);
+       }
+       return (error);
+}
+
+static inline int
+lge(struct lbnentry *a, struct lbnentry *b)
+{
+       return a->lbn - b->lbn;
+}
+
+SPLAY_PROTOTYPE(lfs_splay, lbnentry, entry, lge);
+
+SPLAY_GENERATE(lfs_splay, lbnentry, entry, lge);
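+
+/*
+ * SPLAY_PROTOTYPE and SPLAY_GENERATE (from <sys/tree.h>) expand into
+ * the splay-tree implementation for struct lbnentry, ordered by lge()
+ * above, i.e. by logical block number.
+ */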
+
+/*
+ * Record this lbn as being "write pending".  We used to have this information
+ * on the buffer headers, but since pages don't have buffer headers we
+ * record it here instead.
+ */
+void
+lfs_register_block(struct vnode *vp, daddr_t lbn)
+{
+       struct lfs *fs;
+       struct inode *ip;
+       struct lbnentry *lbp;
+
+       ip = VTOI(vp);
+
+       /* Don't count metadata */
+       if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
+               return;
+
+       fs = ip->i_lfs;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       /* If no space, wait for the cleaner */
+       lfs_availwait(fs, btofsb(fs, 1 << fs->lfs_bshift));
+
+       lbp = (struct lbnentry *)pool_get(&lfs_lbnentry_pool, PR_WAITOK);
+       lbp->lbn = lbn;
+       mutex_enter(&lfs_lock);
+       if (SPLAY_INSERT(lfs_splay, &ip->i_lfs_lbtree, lbp) != NULL) {
+               mutex_exit(&lfs_lock);
+               /* Already there */
+               pool_put(&lfs_lbnentry_pool, lbp);
+               return;
+       }
+
+       ++ip->i_lfs_nbtree;
+       fs->lfs_favail += btofsb(fs, (1 << fs->lfs_bshift));
+       fs->lfs_pages += fs->lfs_bsize >> PAGE_SHIFT;
+       ++locked_fakequeue_count;
+       lfs_subsys_pages += fs->lfs_bsize >> PAGE_SHIFT;
+       mutex_exit(&lfs_lock);
+}
+
+static void
+lfs_do_deregister(struct lfs *fs, struct inode *ip, struct lbnentry *lbp)
+{
+       ASSERT_MAYBE_SEGLOCK(fs);
+
+       mutex_enter(&lfs_lock);
+       --ip->i_lfs_nbtree;
+       SPLAY_REMOVE(lfs_splay, &ip->i_lfs_lbtree, lbp);
+       if (fs->lfs_favail > btofsb(fs, (1 << fs->lfs_bshift)))
+               fs->lfs_favail -= btofsb(fs, (1 << fs->lfs_bshift));
+       fs->lfs_pages -= fs->lfs_bsize >> PAGE_SHIFT;
+       if (locked_fakequeue_count > 0)
+               --locked_fakequeue_count;
+       lfs_subsys_pages -= fs->lfs_bsize >> PAGE_SHIFT;
+       mutex_exit(&lfs_lock);
+
+       pool_put(&lfs_lbnentry_pool, lbp);
+}
+
+void
+lfs_deregister_block(struct vnode *vp, daddr_t lbn)
+{
+       struct lfs *fs;
+       struct inode *ip;
+       struct lbnentry *lbp;
+       struct lbnentry tmp;
+
+       ip = VTOI(vp);
+
+       /* Don't count metadata */
+       if (lbn < 0 || vp->v_type != VREG || ip->i_number == LFS_IFILE_INUM)
+               return;
+
+       fs = ip->i_lfs;
+       tmp.lbn = lbn;
+       lbp = SPLAY_FIND(lfs_splay, &ip->i_lfs_lbtree, &tmp);
+       if (lbp == NULL)
+               return;
+
+       lfs_do_deregister(fs, ip, lbp);
+}
+
+void
+lfs_deregister_all(struct vnode *vp)
+{
+       struct lbnentry *lbp, *nlbp;
+       struct lfs_splay *hd;
+       struct lfs *fs;
+       struct inode *ip;
+
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+       hd = &ip->i_lfs_lbtree;
+
+       for (lbp = SPLAY_MIN(lfs_splay, hd); lbp != NULL; lbp = nlbp) {
+               nlbp = SPLAY_NEXT(lfs_splay, hd, lbp);
+               lfs_do_deregister(fs, ip, lbp);
+       }
+}
diff --git a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c
new file mode 100644
index 0000000..fe3d4b5
--- /dev/null
@@ -0,0 +1,858 @@
+/*     $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $    */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_bio.c   8.10 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/resourcevar.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+/*
+ * LFS block write function.
+ *
+ * XXX
+ * No write cost accounting is done.
+ * This is almost certainly wrong for synchronous operations and NFS.
+ *
+ * The counters below are protected by lfs_lock.
+ */
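+/*
+ * These counters feed lfs_fits_buf() and lfs_check() below: LFS pins
+ * dirty buffers on the locked queue until the segment writer runs, so
+ * the counts bound how much unwritten data may accumulate.
+ */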
+int    locked_queue_count   = 0;       /* Count of locked-down buffers. */
+long   locked_queue_bytes   = 0L;      /* Total size of locked buffers. */
+int    lfs_subsys_pages     = 0;       /* Total number of LFS-written pages */
+int    lfs_fs_pagetrip      = 0;       /* # of pages to trip per-fs write */
+int    lfs_writing          = 0;       /* Set if already kicked off a writer
+                                          because of buffer space */
+
+/* Lock and condition variables for above. */
+kcondvar_t     locked_queue_cv;
+kcondvar_t     lfs_writing_cv;
+kmutex_t       lfs_lock;
+
+extern int lfs_dostats;
+
+/*
+ * reserved number/bytes of locked buffers
+ */
+int locked_queue_rcount = 0;
+long locked_queue_rbytes = 0L;
+
+static int lfs_fits_buf(struct lfs *, int, int);
+static int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2,
+    int, int);
+static int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2,
+    int);
+
+static int
+lfs_fits_buf(struct lfs *fs, int n, int bytes)
+{
+       int count_fit, bytes_fit;
+
+       ASSERT_NO_SEGLOCK(fs);
+       KASSERT(mutex_owned(&lfs_lock));
+
+       count_fit =
+           (locked_queue_count + locked_queue_rcount + n <= LFS_WAIT_BUFS);
+       bytes_fit =
+           (locked_queue_bytes + locked_queue_rbytes + bytes <= LFS_WAIT_BYTES);
+
+#ifdef DEBUG
+       if (!count_fit) {
+               DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n",
+                     locked_queue_count, locked_queue_rcount,
+                     n, LFS_WAIT_BUFS));
+       }
+       if (!bytes_fit) {
+               DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n",
+                     locked_queue_bytes, locked_queue_rbytes,
+                     bytes, LFS_WAIT_BYTES));
+       }
+#endif /* DEBUG */
+
+       return (count_fit && bytes_fit);
+}
+
+/* ARGSUSED */
+static int
+lfs_reservebuf(struct lfs *fs, struct vnode *vp,
+    struct vnode *vp2, int n, int bytes)
+{
+       ASSERT_MAYBE_SEGLOCK(fs);
+       KASSERT(locked_queue_rcount >= 0);
+       KASSERT(locked_queue_rbytes >= 0);
+
+       mutex_enter(&lfs_lock);
+       while (n > 0 && !lfs_fits_buf(fs, n, bytes)) {
+               int error;
+
+               lfs_flush(fs, 0, 0);
+
+               error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
+                   hz * LFS_BUFWAIT);
+               if (error && error != EWOULDBLOCK) {
+                       mutex_exit(&lfs_lock);
+                       return error;
+               }
+       }
+
+       locked_queue_rcount += n;
+       locked_queue_rbytes += bytes;
+
+       if (n < 0)
+               cv_broadcast(&locked_queue_cv);
+
+       mutex_exit(&lfs_lock);
+
+       KASSERT(locked_queue_rcount >= 0);
+       KASSERT(locked_queue_rbytes >= 0);
+
+       return 0;
+}
+
+/*
+ * Try to reserve some blocks, prior to performing a sensitive operation that
+ * requires the vnode lock to be honored.  If there is not enough space, give
+ * up the vnode lock temporarily and wait for the space to become available.
+ *
+ * Called with vp locked.  (Note however that if fsb < 0, vp is ignored.)
+ *
+ * XXX YAMT - it isn't safe to unlock vp here
+ * because the node might be modified while we sleep.
+ * (eg. cached states like i_offset might be stale,
+ *  the vnode might be truncated, etc..)
+ * maybe we should have a way to restart the vnodeop (EVOPRESTART?)
+ * or rearrange vnodeop interface to leave vnode locking to file system
+ * specific code so that each file system can have its own vnode locking and
+ * vnode re-using strategies.
+ */
+static int
+lfs_reserveavail(struct lfs *fs, struct vnode *vp,
+    struct vnode *vp2, int fsb)
+{
+       CLEANERINFO *cip;
+       struct buf *bp;
+       int error, slept;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       slept = 0;
+       mutex_enter(&lfs_lock);
+       while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) {
+               mutex_exit(&lfs_lock);
+#if 0
+               /*
+                * XXX ideally, we should unlock vnodes here
+                * because we might sleep very long time.
+                */
+               VOP_UNLOCK(vp);
+               if (vp2 != NULL) {
+                       VOP_UNLOCK(vp2);
+               }
+#else
+               /*
+                * XXX since we'll sleep waiting for the cleaner while holding
+                * the vnode lock, a deadlock will occur if the cleaner tries
+                * to lock the vnode.
+                * (eg. lfs_markv -> lfs_fastvget -> getnewvnode -> vclean)
+                */
+#endif
+
+               if (!slept) {
+                       DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %d,"
+                             " est_bfree = %d)\n",
+                             fsb + fs->lfs_ravail + fs->lfs_favail,
+                             fs->lfs_bfree, LFS_EST_BFREE(fs)));
+               }
+               ++slept;
+
+               /* Wake up the cleaner */
+               LFS_CLEANERINFO(cip, fs, bp);
+               LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
+               lfs_wakeup_cleaner(fs);
+
+               mutex_enter(&lfs_lock);
+               /* Cleaner might have run while we were reading, check again */
+               if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail))
+                       break;
+
+               error = mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve",
+                               0, &lfs_lock);
+#if 0
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
+               vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
+#endif
+               if (error) {
+                       mutex_exit(&lfs_lock);
+                       return error;
+               }
+       }
+#ifdef DEBUG
+       if (slept) {
+               DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n"));
+       }
+#endif
+       fs->lfs_ravail += fsb;
+       mutex_exit(&lfs_lock);
+
+       return 0;
+}
+
+#ifdef DIAGNOSTIC
+int lfs_rescount;
+int lfs_rescountdirop;
+#endif
+
+int
+lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb)
+{
+       int error;
+       int cantwait;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       if (vp2) {
+               /* Make sure we're not in the process of reclaiming vp2 */
+               mutex_enter(&lfs_lock);
+               while ((fs->lfs_flags & LFS_UNDIROP) != 0) {
+                       mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0,
+                           &lfs_lock);
+               }
+               mutex_exit(&lfs_lock);
+       }
+
+       KASSERT(fsb < 0 || VOP_ISLOCKED(vp));
+       KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2));
+       KASSERT(vp2 == NULL || !(VTOI(vp2)->i_flag & IN_ADIROP));
+       KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp);
+
+       cantwait = (VTOI(vp)->i_flag & IN_ADIROP) || fs->lfs_unlockvp == vp;
+#ifdef DIAGNOSTIC
+       if (cantwait) {
+               if (fsb > 0)
+                       lfs_rescountdirop++;
+               else if (fsb < 0)
+                       lfs_rescountdirop--;
+               if (lfs_rescountdirop < 0)
+                       panic("lfs_rescountdirop");
+       }
+       else {
+               if (fsb > 0)
+                       lfs_rescount++;
+               else if (fsb < 0)
+                       lfs_rescount--;
+               if (lfs_rescount < 0)
+                       panic("lfs_rescount");
+       }
+#endif
+       if (cantwait)
+               return 0;
+
+       /*
+        * XXX
+        * hold vnodes here (vhold) so that the cleaner doesn't try to reuse them.
+        * (see XXX comment in lfs_reserveavail)
+        */
+       vhold(vp);
+       if (vp2 != NULL) {
+               vhold(vp2);
+       }
+
+       error = lfs_reserveavail(fs, vp, vp2, fsb);
+       if (error)
+               goto done;
+
+       /*
+        * XXX just a guess. should be more precise.
+        */
+       error = lfs_reservebuf(fs, vp, vp2, fsb, fsbtob(fs, fsb));
+       if (error)
+               lfs_reserveavail(fs, vp, vp2, -fsb);
+
+done:
+       holdrele(vp);
+       if (vp2 != NULL) {
+               holdrele(vp2);
+       }
+
+       return error;
+}
+
+int
+lfs_bwrite(void *v)
+{
+       struct vop_bwrite_args /* {
+               struct vnode *a_vp;
+               struct buf *a_bp;
+       } */ *ap = v;
+       struct buf *bp = ap->a_bp;
+
+#ifdef DIAGNOSTIC
+       if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) {
+               panic("bawrite LFS buffer");
+       }
+#endif /* DIAGNOSTIC */
+       return lfs_bwrite_ext(bp, 0);
+}
+
+/*
+ * Determine if there is enough room currently available to write fsb
+ * blocks.  We need enough blocks for the new blocks, the current
+ * inode blocks (including potentially the ifile inode), a summary block,
+ * and the segment usage table, plus an ifile block.
+ */
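+/*
+ * A rough reading of the computation below: "needed" is fsb plus one
+ * summary block plus, converted from file-system blocks to frags via
+ * (lfs_bshift - lfs_ffshift), the blocks needed for the dirty inodes
+ * (howmany(lfs_uinodes + 1, INOPB)), the segment table, and one more
+ * ifile block.
+ */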
+int
+lfs_fits(struct lfs *fs, int fsb)
+{
+       int needed;
+
+       ASSERT_NO_SEGLOCK(fs);
+       needed = fsb + btofsb(fs, fs->lfs_sumsize) +
+                ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz +
+                  1) << (fs->lfs_bshift - fs->lfs_ffshift));
+
+       if (needed >= fs->lfs_avail) {
+#ifdef DEBUG
+               DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, "
+                     "needed = %ld, avail = %ld\n",
+                     (long)fsb, (long)fs->lfs_uinodes, (long)needed,
+                     (long)fs->lfs_avail));
+#endif
+               return 0;
+       }
+       return 1;
+}
+
+int
+lfs_availwait(struct lfs *fs, int fsb)
+{
+       int error;
+       CLEANERINFO *cip;
+       struct buf *cbp;
+
+       ASSERT_NO_SEGLOCK(fs);
+       /* Push cleaner blocks through regardless */
+       mutex_enter(&lfs_lock);
+       if (LFS_SEGLOCK_HELD(fs) &&
+           fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) {
+               mutex_exit(&lfs_lock);
+               return 0;
+       }
+       mutex_exit(&lfs_lock);
+
+       while (!lfs_fits(fs, fsb)) {
+               /*
+                * Out of space, need cleaner to run.
+                * Update the cleaner info, then wake it up.
+                * Note the cleanerinfo block is on the ifile
+                * so it CANT_WAIT.
+                */
+               LFS_CLEANERINFO(cip, fs, cbp);
+               LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0);
+
+#ifdef DEBUG
+               DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, "
+                     "waiting on cleaner\n"));
+#endif
+
+               lfs_wakeup_cleaner(fs);
+#ifdef DIAGNOSTIC
+               if (LFS_SEGLOCK_HELD(fs))
+                       panic("lfs_availwait: deadlock");
+#endif
+               error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0);
+               if (error)
+                       return (error);
+       }
+       return 0;
+}
+
+int
+lfs_bwrite_ext(struct buf *bp, int flags)
+{
+       struct lfs *fs;
+       struct inode *ip;
+       struct vnode *vp;
+       int fsb;
+
+       vp = bp->b_vp;
+       fs = VFSTOUFS(vp->v_mount)->um_lfs;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       KASSERT(bp->b_cflags & BC_BUSY);
+       KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp));
+       KASSERT(((bp->b_oflags | bp->b_flags) & (BO_DELWRI|B_LOCKED))
+           != BO_DELWRI);
+
+       /*
+        * Don't write *any* blocks if we're mounted read-only, or
+        * if we are "already unmounted".
+        *
+        * In particular the cleaner can't write blocks either.
+        */
+       if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
+               bp->b_oflags &= ~BO_DELWRI;
+               bp->b_flags |= B_READ;
+               bp->b_error = 0;
+               mutex_enter(&bufcache_lock);
+               LFS_UNLOCK_BUF(bp);
+               if (LFS_IS_MALLOC_BUF(bp))
+                       bp->b_cflags &= ~BC_BUSY;
+               else
+                       brelsel(bp, 0);
+               mutex_exit(&bufcache_lock);
+               return (fs->lfs_ronly ? EROFS : 0);
+       }
+
+       /*
+        * Set the delayed write flag and use reassignbuf to move the buffer
+        * from the clean list to the dirty one.
+        *
+        * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
+        * the buffer onto the LOCKED free list.  This is necessary, otherwise
+        * getnewbuf() would try to reclaim the buffers using bawrite, which
+        * isn't going to work.
+        *
+        * XXX we don't let meta-data writes run out of space because they can
+        * come from the segment writer.  We need to make sure that there is
+        * enough space reserved so that there's room to write meta-data
+        * blocks.
+        */
+       if ((bp->b_flags & B_LOCKED) == 0) {
+               fsb = numfrags(fs, bp->b_bcount);
+
+               ip = VTOI(vp);
+               mutex_enter(&lfs_lock);
+               if (flags & BW_CLEAN) {
+                       LFS_SET_UINO(ip, IN_CLEANING);
+               } else {
+                       LFS_SET_UINO(ip, IN_MODIFIED);
+               }
+               mutex_exit(&lfs_lock);
+               fs->lfs_avail -= fsb;
+
+               mutex_enter(&bufcache_lock);
+               mutex_enter(vp->v_interlock);
+               bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE;
+               LFS_LOCK_BUF(bp);
+               bp->b_flags &= ~B_READ;
+               bp->b_error = 0;
+               reassignbuf(bp, bp->b_vp);
+               mutex_exit(vp->v_interlock);
+       } else {
+               mutex_enter(&bufcache_lock);
+       }
+
+       if (bp->b_iodone != NULL)
+               bp->b_cflags &= ~BC_BUSY;
+       else
+               brelsel(bp, 0);
+       mutex_exit(&bufcache_lock);
+
+       return (0);
+}
+
+/*
+ * Called and return with the lfs_lock held.
+ */
+void
+lfs_flush_fs(struct lfs *fs, int flags)
+{
+       ASSERT_NO_SEGLOCK(fs);
+       KASSERT(mutex_owned(&lfs_lock));
+       if (fs->lfs_ronly)
+               return;
+
+       if (lfs_dostats)
+               ++lfs_stats.flush_invoked;
+
+       mutex_exit(&lfs_lock);
+       lfs_writer_enter(fs, "fldirop");
+       lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
+       lfs_writer_leave(fs);
+       mutex_enter(&lfs_lock);
+       fs->lfs_favail = 0; /* XXX */
+}
+
+/*
+ * This routine initiates segment writes when LFS is consuming too many
+ * resources.  Ideally the pageout daemon would be able to direct LFS
+ * more subtly.
+ * XXX We have one static count of locked buffers;
+ * XXX need to think more about the multiple filesystem case.
+ *
+ * Called and returns with lfs_lock held.
+ * If fs != NULL, we hold the segment lock for fs.
+ */
+void
+lfs_flush(struct lfs *fs, int flags, int only_onefs)
+{
+       extern u_int64_t locked_fakequeue_count;
+       struct mount *mp, *nmp;
+       struct lfs *tfs;
+
+       KASSERT(mutex_owned(&lfs_lock));
+       KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs));
+
+       if (lfs_dostats)
+               ++lfs_stats.write_exceeded;
+       /* XXX should we include SEGM_CKP here? */
+       if (lfs_writing && !(flags & SEGM_SYNC)) {
+               DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n"));
+               return;
+       }
+       while (lfs_writing)
+               cv_wait(&lfs_writing_cv, &lfs_lock);
+       lfs_writing = 1;
+
+       mutex_exit(&lfs_lock);
+
+       if (only_onefs) {
+               KASSERT(fs != NULL);
+               if (vfs_busy(fs->lfs_ivnode->v_mount, NULL))
+                       goto errout;
+               mutex_enter(&lfs_lock);
+               lfs_flush_fs(fs, flags);
+               mutex_exit(&lfs_lock);
+               vfs_unbusy(fs->lfs_ivnode->v_mount, false, NULL);
+       } else {
+               locked_fakequeue_count = 0;
+               mutex_enter(&mountlist_lock);
+               for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
+                    mp = nmp) {
+                       if (vfs_busy(mp, &nmp)) {
+                               DLOG((DLOG_FLUSH, "lfs_flush: fs vfs_busy\n"));
+                               continue;
+                       }
+                       if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS,
+                           sizeof(mp->mnt_stat.f_fstypename)) == 0) {
+                               tfs = VFSTOUFS(mp)->um_lfs;
+                               mutex_enter(&lfs_lock);
+                               lfs_flush_fs(tfs, flags);
+                               mutex_exit(&lfs_lock);
+                       }
+                       vfs_unbusy(mp, false, &nmp);
+               }
+               mutex_exit(&mountlist_lock);
+       }
+       LFS_DEBUG_COUNTLOCKED("flush");
+       wakeup(&lfs_subsys_pages);
+
+    errout:
+       mutex_enter(&lfs_lock);
+       KASSERT(lfs_writing);
+       lfs_writing = 0;
+       wakeup(&lfs_writing);
+}
+
+#define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs))
+#define INOBYTES(fs) ((fs)->lfs_uinodes * sizeof (struct ufs1_dinode))
+
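+/*
+ * INOCOUNT() estimates how many buffers the dirty inodes will take
+ * (one per inode block) and INOBYTES() their byte count; both are
+ * added to the locked-queue totals when checking the thresholds below.
+ */
+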
+/*
+ * Make sure that we don't have too many locked buffers.
+ * Flush buffers if needed.
+ */
+int
+lfs_check(struct vnode *vp, daddr_t blkno, int flags)
+{
+       int error;
+       struct lfs *fs;
+       struct inode *ip;
+       extern pid_t lfs_writer_daemon;
+
+       error = 0;
+       ip = VTOI(vp);
+
+       /* If out of buffers, wait on writer */
+       /* XXX KS - if it's the Ifile, we're probably the cleaner! */
+       if (ip->i_number == LFS_IFILE_INUM)
+               return 0;
+       /* If we're being called from inside a dirop, don't sleep */
+       if (ip->i_flag & IN_ADIROP)
+               return 0;
+
+       fs = ip->i_lfs;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       /*
+        * If we would flush below, but dirops are active, sleep.
+        * Note that a dirop cannot ever reach this code!
+        */
+       mutex_enter(&lfs_lock);
+       while (fs->lfs_dirops > 0 &&
+              (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
+               locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
+               lfs_subsys_pages > LFS_MAX_PAGES ||
+               fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+               lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0))
+       {
+               ++fs->lfs_diropwait;
+               mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0,
+                       &lfs_lock);
+               --fs->lfs_diropwait;
+       }
+
+#ifdef DEBUG
+       if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
+               DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n",
+                     locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS));
+       if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
+               DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n",
+                     locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES));
+       if (lfs_subsys_pages > LFS_MAX_PAGES)
+               DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n",
+                     lfs_subsys_pages, LFS_MAX_PAGES));
+       if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip)
+               DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n",
+                     fs->lfs_pages, lfs_fs_pagetrip));
+       if (lfs_dirvcount > LFS_MAX_DIROP)
+               DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n",
+                     lfs_dirvcount, LFS_MAX_DIROP));
+       if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))
+               DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n",
+                     fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs)));
+       if (fs->lfs_diropwait > 0)
+               DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n",
+                     fs->lfs_diropwait));
+#endif
+
+       /* If there are too many pending dirops, we have to flush them. */
+       if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+           lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
+               flags |= SEGM_CKP;
+       }
+
+       if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
+           locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
+           lfs_subsys_pages > LFS_MAX_PAGES ||
+           fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+           lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
+               lfs_flush(fs, flags, 0);
+       } else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) {
+               /*
+                * If we didn't flush the whole thing, some filesystems
+                * still might want to be flushed.
+                */
+               ++fs->lfs_pdflush;
+               wakeup(&lfs_writer_daemon);
+       }
+
+       while (locked_queue_count + INOCOUNT(fs) >= LFS_WAIT_BUFS ||
+               locked_queue_bytes + INOBYTES(fs) >= LFS_WAIT_BYTES ||
+               lfs_subsys_pages > LFS_WAIT_PAGES ||
+               fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+               lfs_dirvcount > LFS_MAX_DIROP) {
+
+               if (lfs_dostats)
+                       ++lfs_stats.wait_exceeded;
+               DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
+                     locked_queue_count, locked_queue_bytes));
+               error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
+                   hz * LFS_BUFWAIT);
+               if (error != EWOULDBLOCK)
+                       break;
+
+               /*
+                * lfs_flush might not flush all the buffers, if some of the
+                * inodes were locked or if most of them were Ifile blocks
+                * and we weren't asked to checkpoint.  Try flushing again
+                * to keep us from blocking indefinitely.
+                */
+               if (locked_queue_count + INOCOUNT(fs) >= LFS_MAX_BUFS ||
+                   locked_queue_bytes + INOBYTES(fs) >= LFS_MAX_BYTES) {
+                       lfs_flush(fs, flags | SEGM_CKP, 0);
+               }
+       }
+       mutex_exit(&lfs_lock);
+       return (error);
+}
+
+/*
+ * Allocate a new buffer header.
+ */
+struct buf *
+lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type)
+{
+       struct buf *bp;
+       size_t nbytes;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       nbytes = roundup(size, fsbtob(fs, 1));
+
+       bp = getiobuf(NULL, true);
+       if (nbytes) {
+               bp->b_data = lfs_malloc(fs, nbytes, type);
+               /* memset(bp->b_data, 0, nbytes); */
+       }
+#ifdef DIAGNOSTIC
+       if (vp == NULL)
+               panic("vp is NULL in lfs_newbuf");
+       if (bp == NULL)
+               panic("bp is NULL after malloc in lfs_newbuf");
+#endif
+
+       bp->b_bufsize = size;
+       bp->b_bcount = size;
+       bp->b_lblkno = daddr;
+       bp->b_blkno = daddr;
+       bp->b_error = 0;
+       bp->b_resid = 0;
+       bp->b_iodone = lfs_callback;
+       bp->b_cflags = BC_BUSY | BC_NOCACHE;
+       bp->b_private = fs;
+
+       mutex_enter(&bufcache_lock);
+       mutex_enter(vp->v_interlock);
+       bgetvp(vp, bp);
+       mutex_exit(vp->v_interlock);
+       mutex_exit(&bufcache_lock);
+
+       return (bp);
+}
+
+void
+lfs_freebuf(struct lfs *fs, struct buf *bp)
+{
+       struct vnode *vp;
+
+       if ((vp = bp->b_vp) != NULL) {
+               mutex_enter(&bufcache_lock);
+               mutex_enter(vp->v_interlock);
+               brelvp(bp);
+               mutex_exit(vp->v_interlock);
+               mutex_exit(&bufcache_lock);
+       }
+       if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */
+               lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN);
+               bp->b_data = NULL;
+       }
+       putiobuf(bp);
+}
+
+/*
+ * Count buffers on the "locked" queue, and compare it to a pro-forma count.
+ * Don't count malloced buffers, since they don't detract from the total.
+ */
+void
+lfs_countlocked(int *count, long *bytes, const char *msg)
+{
+       struct buf *bp;
+       int n = 0;
+       long int size = 0L;
+
+       mutex_enter(&bufcache_lock);
+       TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist) {
+               KASSERT(bp->b_iodone == NULL);
+               n++;
+               size += bp->b_bufsize;
+#ifdef DIAGNOSTIC
+               if (n > nbuf)
+                       panic("lfs_countlocked: this can't happen: more"
+                             " buffers locked than exist");
+#endif
+       }
+       /*
+        * Theoretically this function never really does anything.
+        * Give a warning if we have to fix the accounting.
+        */
+       if (n != *count) {
+               DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted buf count"
+                     " from %d to %d\n", msg, *count, n));
+       }
+       if (size != *bytes) {
+               DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted byte count"
+                     " from %ld to %ld\n", msg, *bytes, size));
+       }
+       *count = n;
+       *bytes = size;
+       mutex_exit(&bufcache_lock);
+       return;
+}
+
+int
+lfs_wait_pages(void)
+{
+       int active, inactive;
+
+       uvm_estimatepageable(&active, &inactive);
+       return LFS_WAIT_RESOURCE(active + inactive + uvmexp.free, 1);
+}
+
+int
+lfs_max_pages(void)
+{
+       int active, inactive;
+
+       uvm_estimatepageable(&active, &inactive);
+       return LFS_MAX_RESOURCE(active + inactive + uvmexp.free, 1);
+}
diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c
new file mode 100644 (file)
index 0000000..a3f0fb9
--- /dev/null
@@ -0,0 +1,110 @@
+/*     $NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $    */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_cksum.c 8.2 (Berkeley) 10/9/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.27 2008/04/28 20:24:11 martin Exp $");
+
+#include <sys/param.h>
+#ifdef _KERNEL
+# include <sys/systm.h>
+# include <sys/lock.h>
+#else
+# include <stddef.h>
+#endif
+#include <sys/mount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+/*
+ * Simple, general purpose, fast checksum.  Data must be short-aligned.
+ * Returns a u_int32_t in case we ever want to do something more rigorous.
+ *
+ * XXX
+ * Use the TCP/IP checksum instead.
+ */
+u_int32_t
+lfs_cksum_part(void *str, size_t len, u_int32_t sum)
+{
+
+       len &= ~(sizeof(u_int16_t) - 1);
+       for (; len; len -= sizeof(u_int16_t)) {
+               sum ^= *(u_int16_t *)str;
+               str = (void *)((u_int16_t *)str + 1);
+       }
+       return (sum);
+}
+
+u_int32_t
+cksum(void *str, size_t len)
+{
+
+       return lfs_cksum_fold(lfs_cksum_part(str, len, 0));
+}
+
+u_int32_t
+lfs_sb_cksum(struct dlfs *fs)
+{
+       size_t size;
+
+       size = (size_t)offsetof(struct dlfs, dlfs_cksum);
+       return cksum(fs, size);
+}
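
To make the checksum above concrete, here is a standalone version: XOR the buffer together 16 bits at a time, then fold to a final value. lfs_cksum_fold() is defined in lfs.h rather than in this file, so the fold below (XOR of the two 16-bit halves) is an assumption for illustration only.

#include <stdint.h>
#include <stdio.h>

/* Mirror of lfs_cksum_part(): XOR the buffer together 16 bits at a time. */
static uint32_t
cksum_part(const void *str, size_t len, uint32_t sum)
{
        const uint16_t *p = str;

        len &= ~(sizeof(uint16_t) - 1);         /* drop a trailing odd byte */
        for (; len; len -= sizeof(uint16_t))
                sum ^= *p++;
        return sum;
}

/* Assumed fold; the real lfs_cksum_fold() lives in lfs.h. */
static uint32_t
cksum_fold(uint32_t sum)
{
        return sum ^ (sum >> 16);
}

int
main(void)
{
        uint16_t data[4] = { 0x1234, 0xabcd, 0x00ff, 0xff00 };

        printf("cksum = 0x%08x\n",
            cksum_fold(cksum_part(data, sizeof(data), 0)));
        return 0;
}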
diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c
new file mode 100644 (file)
index 0000000..ecad772
--- /dev/null
@@ -0,0 +1,325 @@
+/*     $NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $     */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_debug.c 8.1 (Berkeley) 6/11/93
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.39 2011/07/17 20:54:54 joerg Exp $");
+
+#ifdef DEBUG
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+#include <sys/syslog.h>
+#include <sys/proc.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+int lfs_lognum;
+struct lfs_log_entry lfs_log[LFS_LOGLENGTH];
+
+int
+lfs_bwrite_log(struct buf *bp, const char *file, int line)
+{
+       struct vop_bwrite_args a;
+
+       a.a_desc = VDESC(vop_bwrite);
+       a.a_bp = bp;
+
+       if (!(bp->b_flags & B_GATHERED) && !(bp->b_oflags & BO_DELWRI)) {
+               LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags,
+                       curproc->p_pid);
+       }
+       return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a));
+}
+
+void
+lfs_dumplog(void)
+{
+       int i;
+       const char *cp;
+
+       for (i = lfs_lognum; i != (lfs_lognum - 1) % LFS_LOGLENGTH;
+            i = (i + 1) % LFS_LOGLENGTH)
+               if (lfs_log[i].file) {
+                       /* Only print out basename, for readability */
+                       cp = lfs_log[i].file;
+                       while (*cp)
+                               ++cp;
+                       while (*cp != '/' && cp > lfs_log[i].file)
+                               --cp;
+
+                       printf("lbn %" PRId64 " %s %lx %d, %d %s\n",
+                               lfs_log[i].block,
+                               lfs_log[i].op,
+                               lfs_log[i].flags,
+                               lfs_log[i].pid,
+                               lfs_log[i].line,
+                               cp);
+               }
+}
+
+void
+lfs_dump_super(struct lfs *lfsp)
+{
+       int i;
+
+       printf("%s%x\t%s%x\t%s%d\t%s%d\n",
+              "magic    ", lfsp->lfs_magic,
+              "version  ", lfsp->lfs_version,
+              "size     ", lfsp->lfs_size,
+              "ssize    ", lfsp->lfs_ssize);
+       printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+              "dsize    ", lfsp->lfs_dsize,
+              "bsize    ", lfsp->lfs_bsize,
+              "fsize    ", lfsp->lfs_fsize,
+              "frag     ", lfsp->lfs_frag);
+
+       printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+              "minfree  ", lfsp->lfs_minfree,
+              "inopb    ", lfsp->lfs_inopb,
+              "ifpb     ", lfsp->lfs_ifpb,
+              "nindir   ", lfsp->lfs_nindir);
+
+       printf("%s%d\t%s%d\t%s%d\t%s%d\n",
+              "nseg     ", lfsp->lfs_nseg,
+              "nspf     ", lfsp->lfs_nspf,
+              "cleansz  ", lfsp->lfs_cleansz,
+              "segtabsz ", lfsp->lfs_segtabsz);
+
+       printf("%s%x\t%s%d\t%s%lx\t%s%d\n",
+              "segmask  ", lfsp->lfs_segmask,
+              "segshift ", lfsp->lfs_segshift,
+              "bmask    ", (unsigned long)lfsp->lfs_bmask,
+              "bshift   ", lfsp->lfs_bshift);
+
+       printf("%s%lu\t%s%d\t%s%lx\t%s%u\n",
+              "ffmask   ", (unsigned long)lfsp->lfs_ffmask,
+              "ffshift  ", lfsp->lfs_ffshift,
+              "fbmask   ", (unsigned long)lfsp->lfs_fbmask,
+              "fbshift  ", lfsp->lfs_fbshift);
+
+       printf("%s%d\t%s%d\t%s%x\t%s%qx\n",
+              "sushift  ", lfsp->lfs_sushift,
+              "fsbtodb  ", lfsp->lfs_fsbtodb,
+              "cksum    ", lfsp->lfs_cksum,
+              "maxfilesize ", (long long)lfsp->lfs_maxfilesize);
+
+       printf("Superblock disk addresses:");
+       for (i = 0; i < LFS_MAXNUMSB; i++)
+               printf(" %x", lfsp->lfs_sboffs[i]);
+       printf("\n");
+
+       printf("Checkpoint Info\n");
+       printf("%s%d\t%s%x\t%s%d\n",
+              "freehd   ", lfsp->lfs_freehd,
+              "idaddr   ", lfsp->lfs_idaddr,
+              "ifile    ", lfsp->lfs_ifile);
+       printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n",
+              "bfree    ", lfsp->lfs_bfree,
+              "nfiles   ", lfsp->lfs_nfiles,
+              "lastseg  ", lfsp->lfs_lastseg,
+              "nextseg  ", lfsp->lfs_nextseg,
+              "curseg   ", lfsp->lfs_curseg,
+              "offset   ", lfsp->lfs_offset);
+       printf("tstamp   %llx\n", (long long)lfsp->lfs_tstamp);
+}
+
+void
+lfs_dump_dinode(struct ufs1_dinode *dip)
+{
+       int i;
+
+       printf("%s%u\t%s%d\t%s%u\t%s%u\t%s%qu\t%s%d\n",
+              "mode   ", dip->di_mode,
+              "nlink  ", dip->di_nlink,
+              "uid    ", dip->di_uid,
+              "gid    ", dip->di_gid,
+              "size   ", (long long)dip->di_size,
+              "blocks ", dip->di_blocks);
+       printf("inum  %d\n", dip->di_inumber);
+       printf("Direct Addresses\n");
+       for (i = 0; i < NDADDR; i++) {
+               printf("\t%x", dip->di_db[i]);
+               if ((i % 6) == 5)
+                       printf("\n");
+       }
+       for (i = 0; i < NIADDR; i++)
+               printf("\t%x", dip->di_ib[i]);
+       printf("\n");
+}
+
+void
+lfs_check_segsum(struct lfs *fs, struct segment *sp, char *file, int line)
+{
+       int actual;
+#if 0
+       static int offset;
+#endif
+
+       if ((actual = 1) == 1)
+               return; /* XXXX not checking this anymore, really */
+
+       if (sp->sum_bytes_left >= FINFOSIZE
+          && sp->fip->fi_nblocks > 512) {
+               printf("%s:%d: fi_nblocks = %d\n", file, line,
+                      sp->fip->fi_nblocks);
+#ifdef DDB
+               Debugger();
+#endif
+       }
+
+       if (sp->sum_bytes_left > 484) {
+               printf("%s:%d: bad value (%d = -%d) for sum_bytes_left\n",
+                      file, line, sp->sum_bytes_left, fs->lfs_sumsize-sp->sum_bytes_left);
+               panic("too many bytes");
+       }
+
+       actual = fs->lfs_sumsize
+               /* amount taken up by FINFOs */
+               - ((char *)&(sp->fip->fi_blocks[sp->fip->fi_nblocks]) - (char *)(sp->segsum))
+                       /* amount taken up by inode blocks */
+                       - sizeof(int32_t)*((sp->ninodes+INOPB(fs)-1) / INOPB(fs));
+#if 0
+       if (actual - sp->sum_bytes_left < offset)
+       {
+               printf("%s:%d: offset changed %d -> %d\n", file, line,
+                      offset, actual-sp->sum_bytes_left);
+               offset = actual - sp->sum_bytes_left;
+               /* panic("byte mismatch"); */
+       }
+#endif
+#if 0
+       if (actual != sp->sum_bytes_left)
+               printf("%s:%d: warning: segsum miscalc at %d (-%d => %d)\n",
+                      file, line, sp->sum_bytes_left,
+                      fs->lfs_sumsize-sp->sum_bytes_left,
+                      actual);
+#endif
+       if (sp->sum_bytes_left > 0
+          && ((char *)(sp->segsum))[fs->lfs_sumsize
+                                    - sizeof(int32_t) * ((sp->ninodes+INOPB(fs)-1) / INOPB(fs))
+                                    - sp->sum_bytes_left] != '\0') {
+               printf("%s:%d: warning: segsum overwrite at %d (-%d => %d)\n",
+                      file, line, sp->sum_bytes_left,
+                      fs->lfs_sumsize-sp->sum_bytes_left,
+                      actual);
+#ifdef DDB
+               Debugger();
+#endif
+       }
+}
+
+void
+lfs_check_bpp(struct lfs *fs, struct segment *sp, char *file, int line)
+{
+       daddr_t blkno;
+       struct buf **bpp;
+       struct vnode *devvp;
+
+       devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+       blkno = (*(sp->bpp))->b_blkno;
+       for (bpp = sp->bpp; bpp < sp->cbpp; bpp++) {
+               if ((*bpp)->b_blkno != blkno) {
+                       if ((*bpp)->b_vp == devvp) {
+                               printf("Oops, would misplace raw block "
+                                      "0x%" PRIx64 " at 0x%" PRIx64 "\n",
+                                      (*bpp)->b_blkno,
+                                      blkno);
+                       } else {
+                               printf("%s:%d: misplace ino %llu lbn %" PRId64
+                                      " at 0x%" PRIx64 " instead of "
+                                      "0x%" PRIx64 "\n",
+                                      file, line,
+                                      (unsigned long long)
+                                      VTOI((*bpp)->b_vp)->i_number,
+                                      (*bpp)->b_lblkno,
+                                      blkno,
+                                      (*bpp)->b_blkno);
+                       }
+               }
+               blkno += fsbtodb(fs, btofsb(fs, (*bpp)->b_bcount));
+       }
+}
+
+int lfs_debug_log_subsys[DLOG_MAX];
+
+/*
+ * Log events from various debugging areas of LFS, depending on what
+ * the user has enabled.
+ */
+void
+lfs_debug_log(int subsys, const char *fmt, ...)
+{
+       va_list ap;
+
+       /* If not debugging this subsys, exit */
+       if (lfs_debug_log_subsys[subsys] == 0)
+               return;
+
+       va_start(ap, fmt);
+       vlog(LOG_DEBUG, fmt, ap);
+       va_end(ap);
+}
+#endif /* DEBUG */
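
lfs_dumplog() above walks a fixed ring of LFS_LOGLENGTH entries, starting at lfs_lognum so the oldest entry prints first. A minimal userland analogue of that ring buffer (all names here are illustrative, not the kernel's):

#include <stdio.h>

#define LOGLEN 8                        /* cf. LFS_LOGLENGTH */

struct log_entry {
        const char *op;
        int line;
};

static struct log_entry logbuf[LOGLEN];
static int lognum;                      /* next slot to overwrite */

static void
log_event(const char *op, int line)
{
        logbuf[lognum].op = op;
        logbuf[lognum].line = line;
        lognum = (lognum + 1) % LOGLEN;
}

static void
dump_log(void)
{
        /* Start at lognum, the oldest slot once the ring has wrapped. */
        for (int n = 0, i = lognum; n < LOGLEN; n++, i = (i + 1) % LOGLEN)
                if (logbuf[i].op != NULL)
                        printf("%s at line %d\n", logbuf[i].op, logbuf[i].line);
}

int
main(void)
{
        for (int i = 0; i < 10; i++)    /* overflow the ring on purpose */
                log_event("write", i);
        dump_log();                     /* prints entries 2..9, oldest first */
        return 0;
}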
diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c
new file mode 100644 (file)
index 0000000..06bb9c1
--- /dev/null
@@ -0,0 +1,902 @@
+/*     $NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $   */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1986, 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_inode.c 8.9 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.126 2011/11/23 19:42:10 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/trace.h>
+#include <sys/resourcevar.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+static int lfs_update_seguse(struct lfs *, struct inode *ip, long, size_t);
+static int lfs_indirtrunc (struct inode *, daddr_t, daddr_t,
+                          daddr_t, int, long *, long *, long *, size_t *);
+static int lfs_blkfree (struct lfs *, struct inode *, daddr_t, size_t, long *, size_t *);
+static int lfs_vtruncbuf(struct vnode *, daddr_t, bool, int);
+
+/* Search a block for a specific dinode. */
+struct ufs1_dinode *
+lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp)
+{
+       struct ufs1_dinode *dip = (struct ufs1_dinode *)bp->b_data;
+       struct ufs1_dinode *ldip, *fin;
+
+       ASSERT_NO_SEGLOCK(fs);
+       /*
+        * Read the inode block backwards, since later versions of the
+        * inode supersede earlier ones.  Though it is unlikely, the same
+        * inode can appear more than once in the same inode block.
+        */
+       fin = dip + INOPB(fs);
+       for (ldip = fin - 1; ldip >= dip; --ldip)
+               if (ldip->di_inumber == ino)
+                       return (ldip);
+
+       printf("searched %d entries\n", (int)(fin - dip));
+       printf("offset is 0x%x (seg %d)\n", fs->lfs_offset,
+              dtosn(fs, fs->lfs_offset));
+       printf("block is 0x%llx (seg %lld)\n",
+              (unsigned long long)dbtofsb(fs, bp->b_blkno),
+              (long long)dtosn(fs, dbtofsb(fs, bp->b_blkno)));
+
+       return NULL;
+}
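
Because the log can hold more than one copy of an inode in a single inode block, the scan above runs back to front so the most recently written copy wins. A simplified sketch of that backward scan (struct dinode here is a stand-in for struct ufs1_dinode):

#include <stddef.h>

struct dinode { unsigned inumber; };    /* stand-in for struct ufs1_dinode */

struct dinode *
find_latest(struct dinode *dip, size_t n, unsigned ino)
{
        /* Scan back to front so the most recently written copy is found. */
        for (size_t i = n; i-- > 0; )
                if (dip[i].inumber == ino)
                        return &dip[i];
        return NULL;
}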
+
+int
+lfs_update(struct vnode *vp, const struct timespec *acc,
+    const struct timespec *mod, int updflags)
+{
+       struct inode *ip;
+       struct lfs *fs = VFSTOUFS(vp->v_mount)->um_lfs;
+       int flags;
+
+       ASSERT_NO_SEGLOCK(fs);
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+               return (0);
+       ip = VTOI(vp);
+
+       /*
+        * If we are called from vinvalbuf, and the file's blocks have
+        * already been scheduled for writing, but the writes have not
+        * yet completed, lfs_vflush will not be called, and vinvalbuf
+        * will cause a panic.  So, we must wait until any pending write
+        * for our inode completes, if we are called with UPDATE_WAIT set.
+        */
+       mutex_enter(vp->v_interlock);
+       while ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT &&
+           WRITEINPROG(vp)) {
+               DLOG((DLOG_SEG, "lfs_update: sleeping on ino %d"
+                     " (in progress)\n", ip->i_number));
+               cv_wait(&vp->v_cv, vp->v_interlock);
+       }
+       mutex_exit(vp->v_interlock);
+       LFS_ITIMES(ip, acc, mod, NULL);
+       if (updflags & UPDATE_CLOSE)
+               flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED | IN_CLEANING);
+       else
+               flags = ip->i_flag & (IN_MODIFIED | IN_CLEANING);
+       if (flags == 0)
+               return (0);
+
+       /* If sync, push back the vnode and any dirty blocks it may have. */
+       if ((updflags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT) {
+               /* Avoid flushing VU_DIROP. */
+               mutex_enter(&lfs_lock);
+               ++fs->lfs_diropwait;
+               while (vp->v_uflag & VU_DIROP) {
+                       DLOG((DLOG_DIROP, "lfs_update: sleeping on inode %d"
+                             " (dirops)\n", ip->i_number));
+                       DLOG((DLOG_DIROP, "lfs_update: vflags 0x%x, iflags"
+                             " 0x%x\n",
+                             vp->v_iflag | vp->v_vflag | vp->v_uflag,
+                             ip->i_flag));
+                       if (fs->lfs_dirops == 0)
+                               lfs_flush_fs(fs, SEGM_SYNC);
+                       else
+                               mtsleep(&fs->lfs_writer, PRIBIO+1, "lfs_fsync",
+                                       0, &lfs_lock);
+                       /* XXX KS - by falling out here, are we writing the
+                        * vn twice? */
+               }
+               --fs->lfs_diropwait;
+               mutex_exit(&lfs_lock);
+               return lfs_vflush(vp);
+       }
+       return 0;
+}
+
+#define        SINGLE  0       /* index of single indirect block */
+#define        DOUBLE  1       /* index of double indirect block */
+#define        TRIPLE  2       /* index of triple indirect block */
+/*
+ * Truncate the inode oip to at most length size, freeing the
+ * disk blocks.
+ */
+/* VOP_BWRITE 1 + NIADDR + lfs_balloc == 2 + 2*NIADDR times */
+
+int
+lfs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
+{
+       daddr_t lastblock;
+       struct inode *oip = VTOI(ovp);
+       daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
+       /* XXX ondisk32 */
+       int32_t newblks[NDADDR + NIADDR];
+       struct lfs *fs;
+       struct buf *bp;
+       int offset, size, level;
+       long count, rcount, blocksreleased = 0, real_released = 0;
+       int i, nblocks;
+       int aflags, error, allerror = 0;
+       off_t osize;
+       long lastseg;
+       size_t bc;
+       int obufsize, odb;
+       int usepc;
+       struct ufsmount *ump = oip->i_ump;
+
+       if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
+           ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
+               KASSERT(oip->i_size == 0);
+               return 0;
+       }
+
+       if (length < 0)
+               return (EINVAL);
+
+       /*
+        * Just return without updating the modification times.
+        */
+       if (oip->i_size == length) {
+               /* still do a uvm_vnp_setsize() as writesize may be larger */
+               uvm_vnp_setsize(ovp, length);
+               return (0);
+       }
+
+       if (ovp->v_type == VLNK &&
+           (oip->i_size < ump->um_maxsymlinklen ||
+            (ump->um_maxsymlinklen == 0 &&
+             oip->i_ffs1_blocks == 0))) {
+#ifdef DIAGNOSTIC
+               if (length != 0)
+                       panic("lfs_truncate: partial truncate of symlink");
+#endif
+               memset((char *)SHORTLINK(oip), 0, (u_int)oip->i_size);
+               oip->i_size = oip->i_ffs1_size = 0;
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (lfs_update(ovp, NULL, NULL, 0));
+       }
+       if (oip->i_size == length) {
+               oip->i_flag |= IN_CHANGE | IN_UPDATE;
+               return (lfs_update(ovp, NULL, NULL, 0));
+       }
+       fs = oip->i_lfs;
+       lfs_imtime(fs);
+       osize = oip->i_size;
+       usepc = (ovp->v_type == VREG && ovp != fs->lfs_ivnode);
+
+       ASSERT_NO_SEGLOCK(fs);
+       /*
+        * Lengthen the size of the file. We must ensure that the
+        * last byte of the file is allocated. Since the smallest
+        * value of osize is 0, length will be at least 1.
+        */
+       if (osize < length) {
+               if (length > ump->um_maxfilesize)
+                       return (EFBIG);
+               aflags = B_CLRBUF;
+               if (ioflag & IO_SYNC)
+                       aflags |= B_SYNC;
+               if (usepc) {
+                       if (lblkno(fs, osize) < NDADDR &&
+                           lblkno(fs, osize) != lblkno(fs, length) &&
+                           blkroundup(fs, osize) != osize) {
+                               off_t eob;
+
+                               eob = blkroundup(fs, osize);
+                               uvm_vnp_setwritesize(ovp, eob);
+                               error = ufs_balloc_range(ovp, osize,
+                                   eob - osize, cred, aflags);
+                               if (error) {
+                                       (void) lfs_truncate(ovp, osize,
+                                                   ioflag & IO_SYNC, cred);
+                                       return error;
+                               }
+                               if (ioflag & IO_SYNC) {
+                                       mutex_enter(ovp->v_interlock);
+                                       VOP_PUTPAGES(ovp,
+                                           trunc_page(osize & fs->lfs_bmask),
+                                           round_page(eob),
+                                           PGO_CLEANIT | PGO_SYNCIO);
+                               }
+                       }
+                       uvm_vnp_setwritesize(ovp, length);
+                       error = ufs_balloc_range(ovp, length - 1, 1, cred,
+                                                aflags);
+                       if (error) {
+                               (void) lfs_truncate(ovp, osize,
+                                                   ioflag & IO_SYNC, cred);
+                               return error;
+                       }
+                       uvm_vnp_setsize(ovp, length);
+                       oip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       KASSERT(ovp->v_size == oip->i_size);
+                       oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+                       return (lfs_update(ovp, NULL, NULL, 0));
+               } else {
+                       error = lfs_reserve(fs, ovp, NULL,
+                           btofsb(fs, (NIADDR + 2) << fs->lfs_bshift));
+                       if (error)
+                               return (error);
+                       error = lfs_balloc(ovp, length - 1, 1, cred,
+                                          aflags, &bp);
+                       lfs_reserve(fs, ovp, NULL,
+                           -btofsb(fs, (NIADDR + 2) << fs->lfs_bshift));
+                       if (error)
+                               return (error);
+                       oip->i_ffs1_size = oip->i_size = length;
+                       uvm_vnp_setsize(ovp, length);
+                       (void) VOP_BWRITE(bp->b_vp, bp);
+                       oip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+                       return (lfs_update(ovp, NULL, NULL, 0));
+               }
+       }
+
+       if ((error = lfs_reserve(fs, ovp, NULL,
+           btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0)
+               return (error);
+
+       /*
+        * Shorten the size of the file. If the file is not being
+        * truncated to a block boundary, the contents of the
+        * partial block following the end of the file must be
+        * zero'ed in case it ever becomes accessible again because
+        * of subsequent file growth. Directories however are not
+        * zero'ed as they should grow back initialized to empty.
+        */
+       offset = blkoff(fs, length);
+       lastseg = -1;
+       bc = 0;
+
+       if (ovp != fs->lfs_ivnode)
+               lfs_seglock(fs, SEGM_PROT);
+       if (offset == 0) {
+               oip->i_size = oip->i_ffs1_size = length;
+       } else if (!usepc) {
+               lbn = lblkno(fs, length);
+               aflags = B_CLRBUF;
+               if (ioflag & IO_SYNC)
+                       aflags |= B_SYNC;
+               error = lfs_balloc(ovp, length - 1, 1, cred, aflags, &bp);
+               if (error) {
+                       lfs_reserve(fs, ovp, NULL,
+                           -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+                       goto errout;
+               }
+               obufsize = bp->b_bufsize;
+               odb = btofsb(fs, bp->b_bcount);
+               oip->i_size = oip->i_ffs1_size = length;
+               size = blksize(fs, oip, lbn);
+               if (ovp->v_type != VDIR)
+                       memset((char *)bp->b_data + offset, 0,
+                              (u_int)(size - offset));
+               allocbuf(bp, size, 1);
+               if ((bp->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) {
+                       mutex_enter(&lfs_lock);
+                       locked_queue_bytes -= obufsize - bp->b_bufsize;
+                       mutex_exit(&lfs_lock);
+               }
+               if (bp->b_oflags & BO_DELWRI)
+                       fs->lfs_avail += odb - btofsb(fs, size);
+               (void) VOP_BWRITE(bp->b_vp, bp);
+       } else { /* vp->v_type == VREG && length < osize && offset != 0 */
+               /*
+                * When truncating a regular file down to a non-block-aligned
+                * size, we must zero the part of last block which is past
+                * the new EOF.  We must synchronously flush the zeroed pages
+                * to disk since the new pages will be invalidated as soon
+                * as we inform the VM system of the new, smaller size.
+                * We must do this before acquiring the GLOCK, since fetching
+                * the pages will acquire the GLOCK internally.
+                * So there is a window where another thread could see a whole
+                * zeroed page past EOF, but that's life.
+                */
+               daddr_t xlbn;
+               voff_t eoz;
+
+               aflags = ioflag & IO_SYNC ? B_SYNC : 0;
+               error = ufs_balloc_range(ovp, length - 1, 1, cred, aflags);
+               if (error) {
+                       lfs_reserve(fs, ovp, NULL,
+                                   -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+                       goto errout;
+               }
+               xlbn = lblkno(fs, length);
+               size = blksize(fs, oip, xlbn);
+               eoz = MIN(lblktosize(fs, xlbn) + size, osize);
+               ubc_zerorange(&ovp->v_uobj, length, eoz - length,
+                   UBC_UNMAP_FLAG(ovp));
+               if (round_page(eoz) > round_page(length)) {
+                       mutex_enter(ovp->v_interlock);
+                       error = VOP_PUTPAGES(ovp, round_page(length),
+                           round_page(eoz),
+                           PGO_CLEANIT | PGO_DEACTIVATE |
+                           ((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
+                       if (error) {
+                               lfs_reserve(fs, ovp, NULL,
+                                           -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+                               goto errout;
+                       }
+               }
+       }
+
+       genfs_node_wrlock(ovp);
+
+       oip->i_size = oip->i_ffs1_size = length;
+       uvm_vnp_setsize(ovp, length);
+
+       /*
+        * Calculate index into inode's block list of
+        * last direct and indirect blocks (if any)
+        * which we want to keep.  Lastblock is -1 when
+        * the file is truncated to 0.
+        */
+       /* Avoid sign overflow - XXX assumes that off_t is a quad_t. */
+       if (length > QUAD_MAX - fs->lfs_bsize)
+               lastblock = lblkno(fs, QUAD_MAX - fs->lfs_bsize);
+       else
+               lastblock = lblkno(fs, length + fs->lfs_bsize - 1) - 1;
+       lastiblock[SINGLE] = lastblock - NDADDR;
+       lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
+       lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
+       nblocks = btofsb(fs, fs->lfs_bsize);
+       /*
+        * Record changed file and block pointers before we start
+        * freeing blocks.  lastiblock values are also normalized to -1
+        * for calls to lfs_indirtrunc below.
+        */
+       memcpy((void *)newblks, (void *)&oip->i_ffs1_db[0], sizeof newblks);
+       for (level = TRIPLE; level >= SINGLE; level--)
+               if (lastiblock[level] < 0) {
+                       newblks[NDADDR+level] = 0;
+                       lastiblock[level] = -1;
+               }
+       for (i = NDADDR - 1; i > lastblock; i--)
+               newblks[i] = 0;
+
+       oip->i_size = oip->i_ffs1_size = osize;
+       error = lfs_vtruncbuf(ovp, lastblock + 1, false, 0);
+       if (error && !allerror)
+               allerror = error;
+
+       /*
+        * Indirect blocks first.
+        */
+       indir_lbn[SINGLE] = -NDADDR;
+       indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1;
+       indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1;
+       for (level = TRIPLE; level >= SINGLE; level--) {
+               bn = oip->i_ffs1_ib[level];
+               if (bn != 0) {
+                       error = lfs_indirtrunc(oip, indir_lbn[level],
+                                              bn, lastiblock[level],
+                                              level, &count, &rcount,
+                                              &lastseg, &bc);
+                       if (error)
+                               allerror = error;
+                       real_released += rcount;
+                       blocksreleased += count;
+                       if (lastiblock[level] < 0) {
+                               if (oip->i_ffs1_ib[level] > 0)
+                                       real_released += nblocks;
+                               blocksreleased += nblocks;
+                               oip->i_ffs1_ib[level] = 0;
+                               lfs_blkfree(fs, oip, bn, fs->lfs_bsize,
+                                           &lastseg, &bc);
+                               lfs_deregister_block(ovp, bn);
+                       }
+               }
+               if (lastiblock[level] >= 0)
+                       goto done;
+       }
+
+       /*
+        * All whole direct blocks or frags.
+        */
+       for (i = NDADDR - 1; i > lastblock; i--) {
+               long bsize, obsize;
+
+               bn = oip->i_ffs1_db[i];
+               if (bn == 0)
+                       continue;
+               bsize = blksize(fs, oip, i);
+               if (oip->i_ffs1_db[i] > 0) {
+                       /* Check for fragment size changes */
+                       obsize = oip->i_lfs_fragsize[i];
+                       real_released += btofsb(fs, obsize);
+                       oip->i_lfs_fragsize[i] = 0;
+               } else
+                       obsize = 0;
+               blocksreleased += btofsb(fs, bsize);
+               oip->i_ffs1_db[i] = 0;
+               lfs_blkfree(fs, oip, bn, obsize, &lastseg, &bc);
+               lfs_deregister_block(ovp, bn);
+       }
+       if (lastblock < 0)
+               goto done;
+
+       /*
+        * Finally, look for a change in size of the
+        * last direct block; release any frags.
+        */
+       bn = oip->i_ffs1_db[lastblock];
+       if (bn != 0) {
+               long oldspace, newspace;
+#if 0
+               long olddspace;
+#endif
+
+               /*
+                * Calculate amount of space we're giving
+                * back as old block size minus new block size.
+                */
+               oldspace = blksize(fs, oip, lastblock);
+#if 0
+               olddspace = oip->i_lfs_fragsize[lastblock];
+#endif
+
+               oip->i_size = oip->i_ffs1_size = length;
+               newspace = blksize(fs, oip, lastblock);
+               if (newspace == 0)
+                       panic("itrunc: newspace");
+               if (oldspace - newspace > 0) {
+                       blocksreleased += btofsb(fs, oldspace - newspace);
+               }
+#if 0
+               if (bn > 0 && olddspace - newspace > 0) {
+                       /* No segment accounting here, just vnode */
+                       real_released += btofsb(fs, olddspace - newspace);
+               }
+#endif
+       }
+
+done:
+       /* Finish segment accounting corrections */
+       lfs_update_seguse(fs, oip, lastseg, bc);
+#ifdef DIAGNOSTIC
+       for (level = SINGLE; level <= TRIPLE; level++)
+               if ((newblks[NDADDR + level] == 0) !=
+                   ((oip->i_ffs1_ib[level]) == 0)) {
+                       panic("lfs itrunc1");
+               }
+       for (i = 0; i < NDADDR; i++)
+               if ((newblks[i] == 0) != (oip->i_ffs1_db[i] == 0)) {
+                       panic("lfs itrunc2");
+               }
+       if (length == 0 &&
+           (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
+               panic("lfs itrunc3");
+#endif /* DIAGNOSTIC */
+       /*
+        * Put back the real size.
+        */
+       oip->i_size = oip->i_ffs1_size = length;
+       oip->i_lfs_effnblks -= blocksreleased;
+       oip->i_ffs1_blocks -= real_released;
+       mutex_enter(&lfs_lock);
+       fs->lfs_bfree += blocksreleased;
+       mutex_exit(&lfs_lock);
+#ifdef DIAGNOSTIC
+       if (oip->i_size == 0 &&
+           (oip->i_ffs1_blocks != 0 || oip->i_lfs_effnblks != 0)) {
+               printf("lfs_truncate: truncate to 0 but %d blks/%d effblks\n",
+                      oip->i_ffs1_blocks, oip->i_lfs_effnblks);
+               panic("lfs_truncate: persistent blocks");
+       }
+#endif
+
+       /*
+        * If we truncated to zero, take us off the paging queue.
+        */
+       mutex_enter(&lfs_lock);
+       if (oip->i_size == 0 && oip->i_flags & IN_PAGING) {
+               oip->i_flags &= ~IN_PAGING;
+               TAILQ_REMOVE(&fs->lfs_pchainhd, oip, i_lfs_pchain);
+       }
+       mutex_exit(&lfs_lock);
+
+       oip->i_flag |= IN_CHANGE;
+#ifdef QUOTA
+       (void) chkdq(oip, -blocksreleased, NOCRED, 0);
+#endif
+       lfs_reserve(fs, ovp, NULL,
+           -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+       genfs_node_unlock(ovp);
+  errout:
+       oip->i_lfs_hiblk = lblkno(fs, oip->i_size + fs->lfs_bsize - 1) - 1;
+       if (ovp != fs->lfs_ivnode)
+               lfs_segunlock(fs);
+       return (allerror ? allerror : error);
+}
+
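For concreteness, the lastiblock[] arithmetic in lfs_truncate() above, worked through with illustrative geometry (12 direct pointers and 2048 pointers per indirect block; the real values come from the superblock via NDADDR and NINDIR(fs)):

#include <stdio.h>

#define EX_NDADDR 12            /* illustrative, cf. NDADDR */
#define EX_NINDIR 2048          /* illustrative, cf. NINDIR(fs) */

int
main(void)
{
        long lastblock = 5000;  /* last logical block kept after truncation */
        long single = lastblock - EX_NDADDR;
        long dbl    = single - EX_NINDIR;
        long triple = dbl - (long)EX_NINDIR * EX_NINDIR;

        /* single = 4988, dbl = 2940, triple < 0 */
        printf("single %ld double %ld triple %ld\n", single, dbl, triple);
        /*
         * A negative index is normalized to -1 in lfs_truncate(), meaning
         * the whole indirect tree at that level can be freed.
         */
        return 0;
}
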
+/* Update segment and avail usage information when removing a block. */
+static int
+lfs_blkfree(struct lfs *fs, struct inode *ip, daddr_t daddr,
+           size_t bsize, long *lastseg, size_t *num)
+{
+       long seg;
+       int error = 0;
+
+       ASSERT_SEGLOCK(fs);
+       bsize = fragroundup(fs, bsize);
+       if (daddr > 0) {
+               if (*lastseg != (seg = dtosn(fs, daddr))) {
+                       error = lfs_update_seguse(fs, ip, *lastseg, *num);
+                       *num = bsize;
+                       *lastseg = seg;
+               } else
+                       *num += bsize;
+       }
+
+       return error;
+}
+
+/* Finish the accounting updates for a segment. */
+static int
+lfs_update_seguse(struct lfs *fs, struct inode *ip, long lastseg, size_t num)
+{
+       struct segdelta *sd;
+       struct vnode *vp;
+
+       ASSERT_SEGLOCK(fs);
+       if (lastseg < 0 || num == 0)
+               return 0;
+
+       vp = ITOV(ip);
+       LIST_FOREACH(sd, &ip->i_lfs_segdhd, list)
+               if (sd->segnum == lastseg)
+                       break;
+       if (sd == NULL) {
+               sd = malloc(sizeof(*sd), M_SEGMENT, M_WAITOK);
+               sd->segnum = lastseg;
+               sd->num = 0;
+               LIST_INSERT_HEAD(&ip->i_lfs_segdhd, sd, list);
+       }
+       sd->num += num;
+
+       return 0;
+}
+
+static void
+lfs_finalize_seguse(struct lfs *fs, void *v)
+{
+       SEGUSE *sup;
+       struct buf *bp;
+       struct segdelta *sd;
+       LIST_HEAD(, segdelta) *hd = v;
+
+       ASSERT_SEGLOCK(fs);
+       while ((sd = LIST_FIRST(hd)) != NULL) {
+               LIST_REMOVE(sd, list);
+               LFS_SEGENTRY(sup, fs, sd->segnum, bp);
+               if (sd->num > sup->su_nbytes) {
+                       printf("lfs_finalize_seguse: segment %ld short by %ld\n",
+                               sd->segnum, (long)(sd->num - sup->su_nbytes));
+                       panic("lfs_finalize_seguse: negative bytes");
+                       sup->su_nbytes = sd->num;
+               }
+               sup->su_nbytes -= sd->num;
+               LFS_WRITESEGENTRY(sup, fs, sd->segnum, bp);
+               free(sd, M_SEGMENT);
+       }
+}
+
+/* Finish the accounting updates for a segment. */
+void
+lfs_finalize_ino_seguse(struct lfs *fs, struct inode *ip)
+{
+       ASSERT_SEGLOCK(fs);
+       lfs_finalize_seguse(fs, &ip->i_lfs_segdhd);
+}
+
+/* Finish the accounting updates for a segment. */
+void
+lfs_finalize_fs_seguse(struct lfs *fs)
+{
+       ASSERT_SEGLOCK(fs);
+       lfs_finalize_seguse(fs, &fs->lfs_segdhd);
+}
+
+/*
+ * Release blocks associated with the inode ip and stored in the indirect
+ * block bn.  Blocks are free'd in LIFO order up to (but not including)
+ * lastbn.  If level is greater than SINGLE, the block is an indirect block
+ * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * blocks.
+ *
+ * NB: triple indirect blocks are untested.
+ */
+static int
+lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn,
+              daddr_t lastbn, int level, long *countp,
+              long *rcountp, long *lastsegp, size_t *bcp)
+{
+       int i;
+       struct buf *bp;
+       struct lfs *fs = ip->i_lfs;
+       int32_t *bap;   /* XXX ondisk32 */
+       struct vnode *vp;
+       daddr_t nb, nlbn, last;
+       int32_t *copy = NULL;   /* XXX ondisk32 */
+       long blkcount, rblkcount, factor;
+       int nblocks, blocksreleased = 0, real_released = 0;
+       int error = 0, allerror = 0;
+
+       ASSERT_SEGLOCK(fs);
+       /*
+        * Calculate index in current block of last
+        * block to be kept.  -1 indicates the entire
+        * block so we need not calculate the index.
+        */
+       factor = 1;
+       for (i = SINGLE; i < level; i++)
+               factor *= NINDIR(fs);
+       last = lastbn;
+       if (lastbn > 0)
+               last /= factor;
+       nblocks = btofsb(fs, fs->lfs_bsize);
+       /*
+        * Get the buffer of block pointers, zero the entries corresponding
+        * to blocks to be free'd, and update the on-disk copy first.  Since
+        * double (triple) indirect blocks are freed before single (double)
+        * indirect blocks, calls to bmap on them would fail.  However, we
+        * already have the on-disk address, so we set the b_blkno field
+        * explicitly instead of letting bread do everything for us.
+        */
+       vp = ITOV(ip);
+       bp = getblk(vp, lbn, (int)fs->lfs_bsize, 0, 0);
+       if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+               /* Braces must be here in case trace evaluates to nothing. */
+               trace(TR_BREADHIT, pack(vp, fs->lfs_bsize), lbn);
+       } else {
+               trace(TR_BREADMISS, pack(vp, fs->lfs_bsize), lbn);
+               curlwp->l_ru.ru_inblock++; /* pay for read */
+               bp->b_flags |= B_READ;
+               if (bp->b_bcount > bp->b_bufsize)
+                       panic("lfs_indirtrunc: bad buffer size");
+               bp->b_blkno = fsbtodb(fs, dbn);
+               VOP_STRATEGY(vp, bp);
+               error = biowait(bp);
+       }
+       if (error) {
+               brelse(bp, 0);
+               *countp = *rcountp = 0;
+               return (error);
+       }
+
+       bap = (int32_t *)bp->b_data;    /* XXX ondisk32 */
+       if (lastbn >= 0) {
+               copy = (int32_t *)lfs_malloc(fs, fs->lfs_bsize, LFS_NB_IBLOCK);
+               memcpy((void *)copy, (void *)bap, (u_int)fs->lfs_bsize);
+               memset((void *)&bap[last + 1], 0,
+               /* XXX ondisk32 */
+                 (u_int)(NINDIR(fs) - (last + 1)) * sizeof (int32_t));
+               error = VOP_BWRITE(bp->b_vp, bp);
+               if (error)
+                       allerror = error;
+               bap = copy;
+       }
+
+       /*
+        * Recursively free totally unused blocks.
+        */
+       for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
+           i--, nlbn += factor) {
+               nb = bap[i];
+               if (nb == 0)
+                       continue;
+               if (level > SINGLE) {
+                       error = lfs_indirtrunc(ip, nlbn, nb,
+                                              (daddr_t)-1, level - 1,
+                                              &blkcount, &rblkcount,
+                                              lastsegp, bcp);
+                       if (error)
+                               allerror = error;
+                       blocksreleased += blkcount;
+                       real_released += rblkcount;
+               }
+               lfs_blkfree(fs, ip, nb, fs->lfs_bsize, lastsegp, bcp);
+               if (bap[i] > 0)
+                       real_released += nblocks;
+               blocksreleased += nblocks;
+       }
+
+       /*
+        * Recursively free last partial block.
+        */
+       if (level > SINGLE && lastbn >= 0) {
+               last = lastbn % factor;
+               nb = bap[i];
+               if (nb != 0) {
+                       error = lfs_indirtrunc(ip, nlbn, nb,
+                                              last, level - 1, &blkcount,
+                                              &rblkcount, lastsegp, bcp);
+                       if (error)
+                               allerror = error;
+                       real_released += rblkcount;
+                       blocksreleased += blkcount;
+               }
+       }
+
+       if (copy != NULL) {
+               lfs_free(fs, copy, LFS_NB_IBLOCK);
+       } else {
+               mutex_enter(&bufcache_lock);
+               if (bp->b_oflags & BO_DELWRI) {
+                       LFS_UNLOCK_BUF(bp);
+                       fs->lfs_avail += btofsb(fs, bp->b_bcount);
+                       wakeup(&fs->lfs_avail);
+               }
+               brelsel(bp, BC_INVAL);
+               mutex_exit(&bufcache_lock);
+       }
+
+       *countp = blocksreleased;
+       *rcountp = real_released;
+       return (allerror);
+}
+
+/*
+ * Destroy any in core blocks past the truncation length.
+ * Inlined from vtruncbuf, so that lfs_avail could be updated.
+ * We take the seglock to prevent cleaning from occurring while we are
+ * invalidating blocks.
+ */
+static int
+lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch, int slptimeo)
+{
+       struct buf *bp, *nbp;
+       int error;
+       struct lfs *fs;
+       voff_t off;
+
+       off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
+       mutex_enter(vp->v_interlock);
+       error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
+       if (error)
+               return error;
+
+       fs = VTOI(vp)->i_lfs;
+
+       ASSERT_SEGLOCK(fs);
+
+       mutex_enter(&bufcache_lock);
+restart:
+       for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
+               nbp = LIST_NEXT(bp, b_vnbufs);
+               if (bp->b_lblkno < lbn)
+                       continue;
+               error = bbusy(bp, catch, slptimeo, NULL);
+               if (error == EPASSTHROUGH)
+                       goto restart;
+               if (error != 0) {
+                       mutex_exit(&bufcache_lock);
+                       return (error);
+               }
+               mutex_enter(bp->b_objlock);
+               if (bp->b_oflags & BO_DELWRI) {
+                       bp->b_oflags &= ~BO_DELWRI;
+                       fs->lfs_avail += btofsb(fs, bp->b_bcount);
+                       wakeup(&fs->lfs_avail);
+               }
+               mutex_exit(bp->b_objlock);
+               LFS_UNLOCK_BUF(bp);
+               brelsel(bp, BC_INVAL | BC_VFLUSH);
+       }
+
+       for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+               nbp = LIST_NEXT(bp, b_vnbufs);
+               if (bp->b_lblkno < lbn)
+                       continue;
+               error = bbusy(bp, catch, slptimeo, NULL);
+               if (error == EPASSTHROUGH)
+                       goto restart;
+               if (error != 0) {
+                       mutex_exit(&bufcache_lock);
+                       return (error);
+               }
+               mutex_enter(bp->b_objlock);
+               if (bp->b_oflags & BO_DELWRI) {
+                       bp->b_oflags &= ~BO_DELWRI;
+                       fs->lfs_avail += btofsb(fs, bp->b_bcount);
+                       wakeup(&fs->lfs_avail);
+               }
+               mutex_exit(bp->b_objlock);
+               LFS_UNLOCK_BUF(bp);
+               brelsel(bp, BC_INVAL | BC_VFLUSH);
+       }
+       mutex_exit(&bufcache_lock);
+
+       return (0);
+}
+
diff --git a/sys/ufs/lfs/lfs_itimes.c b/sys/ufs/lfs/lfs_itimes.c
new file mode 100644 (file)
index 0000000..3ef9f86
--- /dev/null
@@ -0,0 +1,118 @@
+/*     $NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $   */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_itimes.c,v 1.12 2008/04/28 20:24:11 martin Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/mount.h>
+#include <sys/buf.h>
+
+#include <ufs/ufs/inode.h>
+
+#ifndef _KERNEL
+#include "bufcache.h"
+#include "vnode.h"
+#include "lfs_user.h"
+#define vnode uvnode
+#define buf ubuf
+#define panic call_panic
+#else
+#include <ufs/lfs/lfs_extern.h>
+#include <sys/kauth.h>
+#endif
+
+#include <ufs/lfs/lfs.h>
+
+void
+lfs_itimes(struct inode *ip, const struct timespec *acc,
+    const struct timespec *mod, const struct timespec *cre)
+{
+#ifdef _KERNEL
+       struct timespec now;
+
+       KASSERT(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY));
+
+       vfs_timestamp(&now);
+#endif
+
+       if (ip->i_flag & IN_ACCESS) {
+#ifdef _KERNEL
+               if (acc == NULL)
+                       acc = &now;
+#endif
+               ip->i_ffs1_atime = acc->tv_sec;
+               ip->i_ffs1_atimensec = acc->tv_nsec;
+               if (ip->i_lfs->lfs_version > 1) {
+                       struct lfs *fs = ip->i_lfs;
+                       struct buf *ibp;
+                       IFILE *ifp;
+
+                       LFS_IENTRY(ifp, ip->i_lfs, ip->i_number, ibp);
+                       ifp->if_atime_sec = acc->tv_sec;
+                       ifp->if_atime_nsec = acc->tv_nsec;
+                       LFS_BWRITE_LOG(ibp);
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_flags |= LFS_IFDIRTY;
+                       mutex_exit(&lfs_lock);
+               } else {
+                       mutex_enter(&lfs_lock);
+                       LFS_SET_UINO(ip, IN_ACCESSED);
+                       mutex_exit(&lfs_lock);
+               }
+       }
+       if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFY)) {
+               if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
+#ifdef _KERNEL
+                       if (mod == NULL)
+                               mod = &now;
+#endif
+                       ip->i_ffs1_mtime = mod->tv_sec;
+                       ip->i_ffs1_mtimensec = mod->tv_nsec;
+                       ip->i_modrev++;
+               }
+               if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
+#ifdef _KERNEL
+                       if (cre == NULL)
+                               cre = &now;
+#endif
+                       ip->i_ffs1_ctime = cre->tv_sec;
+                       ip->i_ffs1_ctimensec = cre->tv_nsec;
+               }
+               mutex_enter(&lfs_lock);
+               if (ip->i_flag & (IN_CHANGE | IN_UPDATE))
+                       LFS_SET_UINO(ip, IN_MODIFIED);
+               if (ip->i_flag & IN_MODIFY)
+                       LFS_SET_UINO(ip, IN_ACCESSED);
+               mutex_exit(&lfs_lock);
+       }
+       ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
+}
diff --git a/sys/ufs/lfs/lfs_rfw.c b/sys/ufs/lfs/lfs_rfw.c
new file mode 100644 (file)
index 0000000..60d926e
--- /dev/null
@@ -0,0 +1,702 @@
+/*     $NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $  */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_rfw.c,v 1.12 2009/02/22 20:28:07 ad Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kthread.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <uvm/uvm_extern.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+#include <uvm/uvm_pdaemon.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+
+/*
+ * Roll-forward code.
+ */
+static daddr_t check_segsum(struct lfs *, daddr_t, u_int64_t,
+    kauth_cred_t, int, int *, struct lwp *);
+
+extern int lfs_do_rfw;
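+
+/*
+ * Roll-forward is attempted at mount time only when lfs_do_rfw,
+ * defined elsewhere, is set; see lfs_roll_forward() below.
+ */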
+
+/*
+ * Allocate a particular inode with a particular version number, freeing
+ * any previous versions of this inode that may have gone before.
+ * Used by the roll-forward code.
+ *
+ * XXX this function does not have appropriate locking to be used on a live fs;
+ * XXX but something similar could probably be used for an "undelete" call.
+ *
+ * Called with the Ifile inode locked.
+ */
+int
+lfs_rf_valloc(struct lfs *fs, ino_t ino, int vers, struct lwp *l,
+             struct vnode **vpp)
+{
+       IFILE *ifp;
+       struct buf *bp, *cbp;
+       struct vnode *vp;
+       struct inode *ip;
+       ino_t tino, oldnext;
+       int error;
+       CLEANERINFO *cip;
+
+       ASSERT_SEGLOCK(fs); /* XXX it doesn't, really */
+
+       /*
+        * First, just try a vget. If the version number is the one we want,
+        * we don't have to do anything else.  If the version number is wrong,
+        * take appropriate action.
+        */
+       error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp);
+       if (error == 0) {
+               DLOG((DLOG_RF, "lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp));
+
+               *vpp = vp;
+               ip = VTOI(vp);
+               if (ip->i_gen == vers)
+                       return 0;
+               else if (ip->i_gen < vers) {
+                       lfs_truncate(vp, (off_t)0, 0, NOCRED);
+                       ip->i_gen = ip->i_ffs1_gen = vers;
+                       LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
+                       return 0;
+               } else {
+                       DLOG((DLOG_RF, "ino %d: sought version %d, got %d\n",
+                              ino, vers, ip->i_ffs1_gen));
+                       vput(vp);
+                       *vpp = NULLVP;
+                       return EEXIST;
+               }
+       }
+
+       /*
+        * The inode is not in use.  Find it on the free list.
+        */
+       /* If the Ifile is too short to contain this inum, extend it */
+       while (VTOI(fs->lfs_ivnode)->i_size <= (ino /
+               fs->lfs_ifpb + fs->lfs_cleansz + fs->lfs_segtabsz)
+               << fs->lfs_bshift) {
+               lfs_extend_ifile(fs, NOCRED);
+       }
+
+       LFS_IENTRY(ifp, fs, ino, bp);
+       oldnext = ifp->if_nextfree;
+       ifp->if_version = vers;
+       brelse(bp, 0);
+
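+       /*
+        * Unlink this inode from the Ifile free list: either it is at
+        * the head of the list, or a walk down the chain finds its
+        * predecessor, which is then pointed at oldnext.
+        */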
+       LFS_GET_HEADFREE(fs, cip, cbp, &ino);
+       if (ino) {
+               LFS_PUT_HEADFREE(fs, cip, cbp, oldnext);
+       } else {
+               tino = ino;
+               while (1) {
+                       LFS_IENTRY(ifp, fs, tino, bp);
+                       if (ifp->if_nextfree == ino ||
+                           ifp->if_nextfree == LFS_UNUSED_INUM)
+                               break;
+                       tino = ifp->if_nextfree;
+                       brelse(bp, 0);
+               }
+               if (ifp->if_nextfree == LFS_UNUSED_INUM) {
+                       brelse(bp, 0);
+                       return ENOENT;
+               }
+               ifp->if_nextfree = oldnext;
+               LFS_BWRITE_LOG(bp);
+       }
+
+       error = lfs_ialloc(fs, fs->lfs_ivnode, ino, vers, &vp);
+       if (error == 0) {
+               /*
+                * Make it VREG so we can put blocks on it.  We will change
+                * this later if it turns out to be some other kind of file.
+                */
+               ip = VTOI(vp);
+               ip->i_mode = ip->i_ffs1_mode = IFREG;
+               ip->i_nlink = ip->i_ffs1_nlink = 1;
+               ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp);
+               ip = VTOI(vp);
+
+               DLOG((DLOG_RF, "lfs_rf_valloc: ino %d vp %p\n", ino, vp));
+
+               /* The dirop-nature of this vnode is past */
+               lfs_unmark_vnode(vp);
+               (void)lfs_vunref(vp);
+               vp->v_uflag &= ~VU_DIROP;
+               mutex_enter(&lfs_lock);
+               --lfs_dirvcount;
+               --fs->lfs_dirvcount;
+               TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+               wakeup(&lfs_dirvcount);
+               wakeup(&fs->lfs_dirvcount);
+               mutex_exit(&lfs_lock);
+       }
+       *vpp = vp;
+       return error;
+}
+
+/*
+ * Load the appropriate indirect block, and change the appropriate pointer.
+ * Mark the block dirty.  Do segment and avail accounting.
+ */
+static int
+update_meta(struct lfs *fs, ino_t ino, int vers, daddr_t lbn,
+           daddr_t ndaddr, size_t size, struct lwp *l)
+{
+       int error;
+       struct vnode *vp;
+       struct inode *ip;
+#ifdef DEBUG
+       daddr_t odaddr;
+       struct indir a[NIADDR];
+       int num;
+       int i;
+#endif /* DEBUG */
+       struct buf *bp;
+       SEGUSE *sup;
+
+       KASSERT(lbn >= 0);      /* no indirect blocks */
+
+       if ((error = lfs_rf_valloc(fs, ino, vers, l, &vp)) != 0) {
+               DLOG((DLOG_RF, "update_meta: ino %d: lfs_rf_valloc"
+                     " returned %d\n", ino, error));
+               return error;
+       }
+
+       if ((error = lfs_balloc(vp, (lbn << fs->lfs_bshift), size,
+                               NOCRED, 0, &bp)) != 0) {
+               vput(vp);
+               return (error);
+       }
+       /* No need to write, the block is already on disk */
+       if (bp->b_oflags & BO_DELWRI) {
+               LFS_UNLOCK_BUF(bp);
+               fs->lfs_avail += btofsb(fs, bp->b_bcount);
+       }
+       brelse(bp, BC_INVAL);
+
+       /*
+        * Extend the file, if it is not large enough already.
+        * XXX this is not exactly right, we don't know how much of the
+        * XXX last block is actually used.  We hope that an inode will
+        * XXX appear later to give the correct size.
+        */
+       ip = VTOI(vp);
+       if (ip->i_size <= (lbn << fs->lfs_bshift)) {
+               u_int64_t newsize;
+
+               if (lbn < NDADDR)
+                       newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) +
+                               (size - fs->lfs_fsize) + 1;
+               else
+                       newsize = ip->i_ffs1_size = (lbn << fs->lfs_bshift) + 1;
+
+               if (ip->i_size < newsize) {
+                       ip->i_size = newsize;
+                       /*
+                        * tell vm our new size for the case the inode won't
+                        * appear later.
+                        */
+                       uvm_vnp_setsize(vp, newsize);
+               }
+       }
+
+       lfs_update_single(fs, NULL, vp, lbn, ndaddr, size);
+
+       LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
+       sup->su_nbytes += size;
+       LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
+
+       /* differences here should be due to UNWRITTEN indirect blocks. */
+       KASSERT((lblkno(fs, ip->i_size) > NDADDR &&
+           ip->i_lfs_effnblks == ip->i_ffs1_blocks) ||
+           ip->i_lfs_effnblks >= ip->i_ffs1_blocks);
+
+#ifdef DEBUG
+       /* Now look again to make sure it worked */
+       ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL, NULL);
+       for (i = num; i > 0; i--) {
+               if (!a[i].in_exists)
+                       panic("update_meta: absent %d lv indirect block", i);
+       }
+       if (dbtofsb(fs, odaddr) != ndaddr)
+               DLOG((DLOG_RF, "update_meta: failed setting ino %d lbn %"
+                     PRId64 " to %" PRId64 "\n", ino, lbn, ndaddr));
+#endif /* DEBUG */
+       vput(vp);
+       return 0;
+}
+
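+/*
+ * Read the inode block at `offset' and, for each inode it holds,
+ * refresh the in-core inode (times, mode, link count, size and flags,
+ * but not the block pointers), record the block's new disk address in
+ * the Ifile, and move the inode's bytes between segment accountings.
+ */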
+static int
+update_inoblk(struct lfs *fs, daddr_t offset, kauth_cred_t cred,
+             struct lwp *l)
+{
+       struct vnode *devvp, *vp;
+       struct inode *ip;
+       struct ufs1_dinode *dip;
+       struct buf *dbp, *ibp;
+       int error;
+       daddr_t daddr;
+       IFILE *ifp;
+       SEGUSE *sup;
+
+       devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+
+       /*
+        * Get the inode, update times and perms.
+        * DO NOT update disk blocks, we do that separately.
+        */
+       error = bread(devvp, fsbtodb(fs, offset), fs->lfs_ibsize,
+           cred, 0, &dbp);
+       if (error) {
+               DLOG((DLOG_RF, "update_inoblk: bread returned %d\n", error));
+               return error;
+       }
+       dip = ((struct ufs1_dinode *)(dbp->b_data)) + INOPB(fs);
+       while (--dip >= (struct ufs1_dinode *)dbp->b_data) {
+               if (dip->di_inumber > LFS_IFILE_INUM) {
+                       error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen,
+                                             l, &vp);
+                       if (error) {
+                               DLOG((DLOG_RF, "update_inoblk: lfs_rf_valloc"
+                                     " returned %d\n", error));
+                               continue;
+                       }
+                       ip = VTOI(vp);
+                       if (dip->di_size != ip->i_size)
+                               lfs_truncate(vp, dip->di_size, 0, NOCRED);
+                       /* Get mode, link count, size, and times */
+                       memcpy(ip->i_din.ffs1_din, dip,
+                              offsetof(struct ufs1_dinode, di_db[0]));
+
+                       /* Then the rest, except di_blocks */
+                       ip->i_flags = ip->i_ffs1_flags = dip->di_flags;
+                       ip->i_gen = ip->i_ffs1_gen = dip->di_gen;
+                       ip->i_uid = ip->i_ffs1_uid = dip->di_uid;
+                       ip->i_gid = ip->i_ffs1_gid = dip->di_gid;
+
+                       ip->i_mode = ip->i_ffs1_mode;
+                       ip->i_nlink = ip->i_ffs1_nlink;
+                       ip->i_size = ip->i_ffs1_size;
+
+                       LFS_SET_UINO(ip, IN_CHANGE | IN_UPDATE);
+
+                       /* Re-initialize to get type right */
+                       ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
+                                 &vp);
+                       vput(vp);
+
+                       /* Record change in location */
+                       LFS_IENTRY(ifp, fs, dip->di_inumber, ibp);
+                       daddr = ifp->if_daddr;
+                       ifp->if_daddr = dbtofsb(fs, dbp->b_blkno);
+                       error = LFS_BWRITE_LOG(ibp); /* Ifile */
+                       /* And do segment accounting */
+                       if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) {
+                               if (daddr > 0) {
+                                       LFS_SEGENTRY(sup, fs, dtosn(fs, daddr),
+                                                    ibp);
+                                       sup->su_nbytes -= sizeof (struct ufs1_dinode);
+                                       LFS_WRITESEGENTRY(sup, fs,
+                                                         dtosn(fs, daddr),
+                                                         ibp);
+                               }
+                               LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
+                                            ibp);
+                               sup->su_nbytes += sizeof (struct ufs1_dinode);
+                               LFS_WRITESEGENTRY(sup, fs,
+                                                 dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
+                                                 ibp);
+                       }
+               }
+       }
+       brelse(dbp, BC_AGE);
+
+       return 0;
+}
+
+#define CHECK_CKSUM   0x0001  /* Check the checksum to make sure it's valid */
+#define CHECK_UPDATE  0x0002  /* Update Ifile for new data blocks / inodes */
+
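+/*
+ * Examine the partial segment at `offset'.  With CHECK_CKSUM, verify
+ * the summary and data checksums; with CHECK_UPDATE, replay the
+ * blocks and inodes it describes into the filesystem.  Returns the
+ * offset of the next partial segment, or -1 if this one is invalid.
+ */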
+static daddr_t
+check_segsum(struct lfs *fs, daddr_t offset, u_int64_t nextserial,
+            kauth_cred_t cred, int flags, int *pseg_flags, struct lwp *l)
+{
+       struct vnode *devvp;
+       struct buf *bp, *dbp;
+       int error, nblocks = 0, ninos, i, j; /* XXX: gcc */
+       SEGSUM *ssp;
+       u_long *dp = NULL, *datap = NULL; /* XXX u_int32_t */
+       daddr_t oldoffset;
+       int32_t *iaddr; /* XXX ondisk32 */
+       FINFO *fip;
+       SEGUSE *sup;
+       size_t size;
+
+       devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+       /*
+        * If the segment has a superblock and we're at the top
+        * of the segment, skip the superblock.
+        */
+       if (sntod(fs, dtosn(fs, offset)) == offset) {
+               LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp);
+               if (sup->su_flags & SEGUSE_SUPERBLOCK)
+                       offset += btofsb(fs, LFS_SBPAD);
+               brelse(bp, 0);
+       }
+
+       /* Read in the segment summary */
+       error = bread(devvp, fsbtodb(fs, offset), fs->lfs_sumsize,
+           cred, 0, &bp);
+       if (error)
+               return -1;
+
+       /* Check summary checksum */
+       ssp = (SEGSUM *)bp->b_data;
+       if (flags & CHECK_CKSUM) {
+               if (ssp->ss_sumsum != cksum(&ssp->ss_datasum,
+                                          fs->lfs_sumsize -
+                                          sizeof(ssp->ss_sumsum))) {
+                       DLOG((DLOG_RF, "Sumsum error at 0x%" PRIx64 "\n", offset));
+                       offset = -1;
+                       goto err1;
+               }
+               if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) {
+                       DLOG((DLOG_RF, "Empty pseg at 0x%" PRIx64 "\n", offset));
+                       offset = -1;
+                       goto err1;
+               }
+               if (ssp->ss_create < fs->lfs_tstamp) {
+                       DLOG((DLOG_RF, "Old data at 0x%" PRIx64 "\n", offset));
+                       offset = -1;
+                       goto err1;
+               }
+       }
+       if (fs->lfs_version > 1) {
+               if (ssp->ss_serial != nextserial) {
+                       DLOG((DLOG_RF, "Unexpected serial number at 0x%" PRIx64
+                             "\n", offset));
+                       offset = -1;
+                       goto err1;
+               }
+               if (ssp->ss_ident != fs->lfs_ident) {
+                       DLOG((DLOG_RF, "Incorrect fsid (0x%x vs 0x%x) at 0x%"
+                             PRIx64 "\n", ssp->ss_ident, fs->lfs_ident, offset));
+                       offset = -1;
+                       goto err1;
+               }
+       }
+       if (pseg_flags)
+               *pseg_flags = ssp->ss_flags;
+       oldoffset = offset;
+       offset += btofsb(fs, fs->lfs_sumsize);
+
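+       /*
+        * Disk addresses of inode blocks are stored at the tail of the
+        * summary block, growing downward; iaddr is walked backwards as
+        * inode blocks turn up in the pass below.
+        */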
+       ninos = howmany(ssp->ss_ninos, INOPB(fs));
+       /* XXX ondisk32 */
+       iaddr = (int32_t *)((char*)bp->b_data + fs->lfs_sumsize - sizeof(int32_t));
+       if (flags & CHECK_CKSUM) {
+               /* Count blocks */
+               nblocks = 0;
+               fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs));
+               for (i = 0; i < ssp->ss_nfinfo; ++i) {
+                       nblocks += fip->fi_nblocks;
+                       if (fip->fi_nblocks <= 0)
+                               break;
+                       /* XXX ondisk32 */
+                       fip = (FINFO *)(((char *)fip) + FINFOSIZE +
+                                       (fip->fi_nblocks * sizeof(int32_t)));
+               }
+               nblocks += ninos;
+               /* Create the sum array */
+               datap = dp = (u_long *)malloc(nblocks * sizeof(u_long),
+                                             M_SEGMENT, M_WAITOK);
+       }
+
+       /* Handle individual blocks */
+       fip = (FINFO *)((char*)bp->b_data + SEGSUM_SIZE(fs));
+       for (i = 0; i < ssp->ss_nfinfo || ninos; ++i) {
+               /* Inode block? */
+               if (ninos && *iaddr == offset) {
+                       if (flags & CHECK_CKSUM) {
+                               /* Read in the head and add to the buffer */
+                               error = bread(devvp, fsbtodb(fs, offset), fs->lfs_bsize,
+                                             cred, 0, &dbp);
+                               if (error) {
+                                       offset = -1;
+                                       goto err2;
+                               }
+                               (*dp++) = ((u_long *)(dbp->b_data))[0];
+                               brelse(dbp, BC_AGE);
+                       }
+                       if (flags & CHECK_UPDATE) {
+                               if ((error = update_inoblk(fs, offset, cred, l))
+                                   != 0) {
+                                       offset = -1;
+                                       goto err2;
+                               }
+                       }
+                       offset += btofsb(fs, fs->lfs_ibsize);
+                       --iaddr;
+                       --ninos;
+                       --i; /* compensate */
+                       continue;
+               }
+               size = fs->lfs_bsize;
+               for (j = 0; j < fip->fi_nblocks; ++j) {
+                       if (j == fip->fi_nblocks - 1)
+                               size = fip->fi_lastlength;
+                       if (flags & CHECK_CKSUM) {
+                               error = bread(devvp, fsbtodb(fs, offset), size,
+                                   cred, 0, &dbp);
+                               if (error) {
+                                       offset = -1;
+                                       goto err2;
+                               }
+                               (*dp++) = ((u_long *)(dbp->b_data))[0];
+                               brelse(dbp, BC_AGE);
+                       }
+                       /* Account for and update any direct blocks */
+                       if ((flags & CHECK_UPDATE) &&
+                          fip->fi_ino > LFS_IFILE_INUM &&
+                          fip->fi_blocks[j] >= 0) {
+                               update_meta(fs, fip->fi_ino, fip->fi_version,
+                                           fip->fi_blocks[j], offset, size, l);
+                       }
+                       offset += btofsb(fs, size);
+               }
+               /* XXX ondisk32 */
+               fip = (FINFO *)(((char *)fip) + FINFOSIZE
+                               + fip->fi_nblocks * sizeof(int32_t));
+       }
+       /* Checksum the array, compare */
+       if ((flags & CHECK_CKSUM) &&
+          ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long)))
+       {
+               DLOG((DLOG_RF, "Datasum error at 0x%" PRIx64
+                     " (wanted %x got %x)\n",
+                     offset, ssp->ss_datasum, cksum(datap, nblocks *
+                                                    sizeof(u_long))));
+               offset = -1;
+               goto err2;
+       }
+
+       /* If we're at the end of the segment, move to the next */
+       if (dtosn(fs, offset + btofsb(fs, fs->lfs_sumsize + fs->lfs_bsize)) !=
+          dtosn(fs, offset)) {
+               if (dtosn(fs, offset) == dtosn(fs, ssp->ss_next)) {
+                       offset = -1;
+                       goto err2;
+               }
+               offset = ssp->ss_next;
+               DLOG((DLOG_RF, "LFS roll forward: moving to offset 0x%" PRIx64
+                      " -> segment %d\n", offset, dtosn(fs,offset)));
+       }
+
+       if (flags & CHECK_UPDATE) {
+               fs->lfs_avail -= (offset - oldoffset);
+               /* Don't clog the buffer queue */
+               mutex_enter(&lfs_lock);
+               if (locked_queue_count > LFS_MAX_BUFS ||
+                   locked_queue_bytes > LFS_MAX_BYTES) {
+                       lfs_flush(fs, SEGM_CKP, 0);
+               }
+               mutex_exit(&lfs_lock);
+       }
+
+    err2:
+       if (flags & CHECK_CKSUM)
+               free(datap, M_SEGMENT);
+    err1:
+       brelse(bp, BC_AGE);
+
+       /* XXX should we update the serial number even for bad psegs? */
+       if ((flags & CHECK_UPDATE) && offset > 0 && fs->lfs_version > 1)
+               fs->lfs_serial = nextserial;
+       return offset;
+}
+
+void
+lfs_roll_forward(struct lfs *fs, struct mount *mp, struct lwp *l)
+{
+       int flags, dirty;
+       daddr_t offset, oldoffset, lastgoodpseg;
+       int sn, curseg, do_rollforward;
+       struct proc *p;
+       kauth_cred_t cred;
+       SEGUSE *sup;
+       struct buf *bp;
+
+       p = l ? l->l_proc : NULL;
+       cred = p ? p->p_cred : NOCRED;
+
+       /*
+        * Roll forward.
+        *
+        * We don't roll forward for v1 filesystems, because
+        * of the danger that the clock was turned back between the last
+        * checkpoint and the crash.  This would roll forward garbage.
+        *
+        * v2 filesystems don't have this problem because they use a
+        * monotonically increasing serial number instead of a timestamp.
+        */
+       do_rollforward = (!(fs->lfs_pflags & LFS_PF_CLEAN) &&
+                         lfs_do_rfw && fs->lfs_version > 1 && p != NULL);
+       if (do_rollforward) {
+               u_int64_t nextserial;
+               /*
+                * Phase I: Find the address of the last good partial
+                * segment that was written after the checkpoint.  Mark
+                * the segments in question dirty, so they won't be
+                * reallocated.
+                */
+               lastgoodpseg = oldoffset = offset = fs->lfs_offset;
+               flags = 0x0;
+               DLOG((DLOG_RF, "LFS roll forward phase 1: start at offset 0x%"
+                     PRIx64 "\n", offset));
+               LFS_SEGENTRY(sup, fs, dtosn(fs, offset), bp);
+               if (!(sup->su_flags & SEGUSE_DIRTY))
+                       --fs->lfs_nclean;
+               sup->su_flags |= SEGUSE_DIRTY;
+               LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp);
+               nextserial = fs->lfs_serial + 1;
+               while ((offset = check_segsum(fs, offset, nextserial,
+                   cred, CHECK_CKSUM, &flags, l)) > 0) {
+                       nextserial++;
+                       if (sntod(fs, oldoffset) != sntod(fs, offset)) {
+                               LFS_SEGENTRY(sup, fs, dtosn(fs, oldoffset),
+                                            bp);
+                               if (!(sup->su_flags & SEGUSE_DIRTY))
+                                       --fs->lfs_nclean;
+                               sup->su_flags |= SEGUSE_DIRTY;
+                               LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset),
+                                            bp);
+                       }
+
+                       DLOG((DLOG_RF, "LFS roll forward phase 1: offset=0x%"
+                             PRIx64 "\n", offset));
+                       if (flags & SS_DIROP) {
+                               DLOG((DLOG_RF, "lfs_mountfs: dirops at 0x%"
+                                     PRIx64 "\n", oldoffset));
+                               if (!(flags & SS_CONT)) {
+                                    DLOG((DLOG_RF, "lfs_mountfs: dirops end "
+                                          "at 0x%" PRIx64 "\n", oldoffset));
+                               }
+                       }
+                       if (!(flags & SS_CONT))
+                               lastgoodpseg = offset;
+                       oldoffset = offset;
+               }
+               if (flags & SS_CONT) {
+                       DLOG((DLOG_RF, "LFS roll forward: warning: incomplete "
+                             "dirops discarded\n"));
+               }
+               DLOG((DLOG_RF, "LFS roll forward phase 1: completed: "
+                     "lastgoodpseg=0x%" PRIx64 "\n", lastgoodpseg));
+               oldoffset = fs->lfs_offset;
+               if (fs->lfs_offset != lastgoodpseg) {
+                       /* Don't overwrite what we're trying to preserve */
+                       offset = fs->lfs_offset;
+                       fs->lfs_offset = lastgoodpseg;
+                       fs->lfs_curseg = sntod(fs, dtosn(fs, fs->lfs_offset));
+                       for (sn = curseg = dtosn(fs, fs->lfs_curseg);;) {
+                               sn = (sn + 1) % fs->lfs_nseg;
+                               if (sn == curseg)
+                                       panic("lfs_mountfs: no clean segments");
+                               LFS_SEGENTRY(sup, fs, sn, bp);
+                               dirty = (sup->su_flags & SEGUSE_DIRTY);
+                               brelse(bp, 0);
+                               if (!dirty)
+                                       break;
+                       }
+                       fs->lfs_nextseg = sntod(fs, sn);
+
+                       /*
+                        * Phase II: Roll forward from the first superblock.
+                        */
+                       while (offset != lastgoodpseg) {
+                               DLOG((DLOG_RF, "LFS roll forward phase 2: 0x%"
+                                     PRIx64 "\n", offset));
+                               offset = check_segsum(fs, offset,
+                                   fs->lfs_serial + 1, cred, CHECK_UPDATE,
+                                   NULL, l);
+                       }
+
+                       /*
+                        * Finish: flush our changes to disk.
+                        */
+                       lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+                       DLOG((DLOG_RF, "lfs_mountfs: roll forward "
+                             "recovered %lld blocks\n",
+                             (long long)(lastgoodpseg - oldoffset)));
+               }
+               DLOG((DLOG_RF, "LFS roll forward complete\n"));
+       }
+}
diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c
new file mode 100644 (file)
index 0000000..aea143a
--- /dev/null
@@ -0,0 +1,2829 @@
+/*     $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $        */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_segment.c       8.10 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
+
+#ifdef DEBUG
+# define vndebug(vp, str) do {                                         \
+       if (VTOI(vp)->i_flag & IN_CLEANING)                             \
+               DLOG((DLOG_WVNODE, "not writing ino %d because %s (op %d)\n", \
+                    VTOI(vp)->i_number, (str), op));                   \
+} while(0)
+#else
+# define vndebug(vp, str)
+#endif
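+/*
+ * vndebug() notes why a vnode is not being written; ivndebug() below
+ * traces per-inode write activity.  Both are DLOG debugging aids.
+ */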
+#define ivndebug(vp, str) \
+       DLOG((DLOG_WVNODE, "ino %d: %s\n", VTOI(vp)->i_number, (str)))
+
+#if defined(_KERNEL_OPT)
+#include "opt_ddb.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/resourcevar.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+#include <sys/syslog.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_extern.h>
+
+MALLOC_JUSTDEFINE(M_SEGMENT, "LFS segment", "Segment for LFS");
+
+static void lfs_generic_callback(struct buf *, void (*)(struct buf *));
+static void lfs_free_aiodone(struct buf *);
+static void lfs_super_aiodone(struct buf *);
+static void lfs_cluster_aiodone(struct buf *);
+static void lfs_cluster_callback(struct buf *);
+
+/*
+ * Determine if it's OK to start a partial in this segment, or if we need
+ * to go on to a new segment.
+ */
+#define        LFS_PARTIAL_FITS(fs) \
+       ((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
+       (fs)->lfs_frag)
+
+/*
+ * Figure out whether we should do a checkpoint write or go ahead with
+ * an ordinary write.
+ */
+#define LFS_SHOULD_CHECKPOINT(fs, flags) \
+        ((flags & SEGM_CLEAN) == 0 &&                                  \
+         ((fs->lfs_nactive > LFS_MAX_ACTIVE ||                         \
+           (flags & SEGM_CKP) ||                                       \
+           fs->lfs_nclean < LFS_MAX_ACTIVE)))
+
+int     lfs_match_fake(struct lfs *, struct buf *);
+void    lfs_newseg(struct lfs *);
+/* XXX ondisk32 */
+void    lfs_shellsort(struct buf **, int32_t *, int, int);
+void    lfs_supercallback(struct buf *);
+void    lfs_updatemeta(struct segment *);
+void    lfs_writesuper(struct lfs *, daddr_t);
+int     lfs_writevnodes(struct lfs *fs, struct mount *mp,
+           struct segment *sp, int dirops);
+
+int    lfs_allclean_wakeup;            /* Cleaner wakeup address. */
+int    lfs_writeindir = 1;             /* whether to flush indir on non-ckp */
+int    lfs_clean_vnhead = 0;           /* Allow freeing to head of vn list */
+int    lfs_dirvcount = 0;              /* # active dirops */
+
+/* Statistics Counters */
+int lfs_dostats = 1;
+struct lfs_stats lfs_stats;
+
+/* op values to lfs_writevnodes */
+#define        VN_REG          0
+#define        VN_DIROP        1
+#define        VN_EMPTY        2
+#define VN_CLEAN       3
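+
+/*
+ * VN_REG writes ordinary vnodes, VN_DIROP only those marked VU_DIROP,
+ * VN_EMPTY only those with no dirty data buffers, and VN_CLEAN those
+ * carrying blocks passed in by the cleaner; see lfs_writevnodes().
+ */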
+
+/*
+ * XXX KS - Set modification time on the Ifile, so the cleaner can
+ * read the fs mod time off of it.  We don't set IN_UPDATE here,
+ * since we don't really need this to be flushed to disk (and in any
+ * case that wouldn't happen to the Ifile until we checkpoint).
+ */
+void
+lfs_imtime(struct lfs *fs)
+{
+       struct timespec ts;
+       struct inode *ip;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       vfs_timestamp(&ts);
+       ip = VTOI(fs->lfs_ivnode);
+       ip->i_ffs1_mtime = ts.tv_sec;
+       ip->i_ffs1_mtimensec = ts.tv_nsec;
+}
+
+/*
+ * Ifile and meta data blocks are not marked busy, so segment writes MUST be
+ * single threaded.  Currently, there are two paths into lfs_segwrite, sync()
+ * and getnewbuf().  They both mark the file system busy.  Lfs_vflush()
+ * explicitly marks the file system busy.  So lfs_segwrite is safe.  I think.
+ */
+
+#define IS_FLUSHING(fs,vp)  ((fs)->lfs_flushvp == (vp))
+
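+/*
+ * Flush all of a vnode's dirty buffers and its inode to disk in a
+ * single segment write, waiting until the vnode has no dirty buffers
+ * and no pending output before returning.
+ */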
+int
+lfs_vflush(struct vnode *vp)
+{
+       struct inode *ip;
+       struct lfs *fs;
+       struct segment *sp;
+       struct buf *bp, *nbp, *tbp, *tnbp;
+       int error;
+       int flushed;
+       int relock;
+       int loopcount;
+
+       ip = VTOI(vp);
+       fs = VFSTOUFS(vp->v_mount)->um_lfs;
+       relock = 0;
+
+    top:
+       ASSERT_NO_SEGLOCK(fs);
+       if (ip->i_flag & IN_CLEANING) {
+               ivndebug(vp,"vflush/in_cleaning");
+               mutex_enter(&lfs_lock);
+               LFS_CLR_UINO(ip, IN_CLEANING);
+               LFS_SET_UINO(ip, IN_MODIFIED);
+               mutex_exit(&lfs_lock);
+
+               /*
+                * Toss any cleaning buffers that have real counterparts
+                * to avoid losing new data.
+                */
+               mutex_enter(vp->v_interlock);
+               for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+                       nbp = LIST_NEXT(bp, b_vnbufs);
+                       if (!LFS_IS_MALLOC_BUF(bp))
+                               continue;
+                       /*
+                        * Look for pages matching the range covered
+                        * by cleaning blocks.  It's okay if more dirty
+                        * pages appear, so long as none disappear out
+                        * from under us.
+                        */
+                       if (bp->b_lblkno > 0 && vp->v_type == VREG &&
+                           vp != fs->lfs_ivnode) {
+                               struct vm_page *pg;
+                               voff_t off;
+
+                               for (off = lblktosize(fs, bp->b_lblkno);
+                                    off < lblktosize(fs, bp->b_lblkno + 1);
+                                    off += PAGE_SIZE) {
+                                       pg = uvm_pagelookup(&vp->v_uobj, off);
+                                       if (pg == NULL)
+                                               continue;
+                                       if ((pg->flags & PG_CLEAN) == 0 ||
+                                           pmap_is_modified(pg)) {
+                                               fs->lfs_avail += btofsb(fs,
+                                                       bp->b_bcount);
+                                               wakeup(&fs->lfs_avail);
+                                               mutex_exit(vp->v_interlock);
+                                               lfs_freebuf(fs, bp);
+                                               mutex_enter(vp->v_interlock);
+                                               bp = NULL;
+                                               break;
+                                       }
+                               }
+                       }
+                       for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp;
+                           tbp = tnbp)
+                       {
+                               tnbp = LIST_NEXT(tbp, b_vnbufs);
+                               if (tbp->b_vp == bp->b_vp
+                                  && tbp->b_lblkno == bp->b_lblkno
+                                  && tbp != bp)
+                               {
+                                       fs->lfs_avail += btofsb(fs,
+                                               bp->b_bcount);
+                                       wakeup(&fs->lfs_avail);
+                                       mutex_exit(vp->v_interlock);
+                                       lfs_freebuf(fs, bp);
+                                       mutex_enter(vp->v_interlock);
+                                       bp = NULL;
+                                       break;
+                               }
+                       }
+               }
+       } else {
+               mutex_enter(vp->v_interlock);
+       }
+
+       /* If the node is being written, wait until that is done */
+       while (WRITEINPROG(vp)) {
+               ivndebug(vp,"vflush/writeinprog");
+               cv_wait(&vp->v_cv, vp->v_interlock);
+       }
+       mutex_exit(vp->v_interlock);
+
+       /* Protect against VI_XLOCK deadlock in vinvalbuf() */
+       lfs_seglock(fs, SEGM_SYNC);
+
+       /* If we're supposed to flush a freed inode, just toss it */
+       if (ip->i_lfs_iflags & LFSI_DELETED) {
+               DLOG((DLOG_VNODE, "lfs_vflush: ino %d freed, not flushing\n",
+                     ip->i_number));
+               /* Drain v_numoutput */
+               mutex_enter(vp->v_interlock);
+               while (vp->v_numoutput > 0) {
+                       cv_wait(&vp->v_cv, vp->v_interlock);
+               }
+               KASSERT(vp->v_numoutput == 0);
+               mutex_exit(vp->v_interlock);
+       
+               mutex_enter(&bufcache_lock);
+               for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+                       nbp = LIST_NEXT(bp, b_vnbufs);
+
+                       KASSERT((bp->b_flags & B_GATHERED) == 0);
+                       if (bp->b_oflags & BO_DELWRI) { /* XXX always true? */
+                               fs->lfs_avail += btofsb(fs, bp->b_bcount);
+                               wakeup(&fs->lfs_avail);
+                       }
+                       /* Copied from lfs_writeseg */
+                       if (bp->b_iodone != NULL) {
+                               mutex_exit(&bufcache_lock);
+                               biodone(bp);
+                               mutex_enter(&bufcache_lock);
+                       } else {
+                               bremfree(bp);
+                               LFS_UNLOCK_BUF(bp);
+                               mutex_enter(vp->v_interlock);
+                               bp->b_flags &= ~(B_READ | B_GATHERED);
+                               bp->b_oflags = (bp->b_oflags & ~BO_DELWRI) | BO_DONE;
+                               bp->b_error = 0;
+                               reassignbuf(bp, vp);
+                               mutex_exit(vp->v_interlock);
+                               brelse(bp, 0);
+                       }
+               }
+               mutex_exit(&bufcache_lock);
+               LFS_CLR_UINO(ip, IN_CLEANING);
+               LFS_CLR_UINO(ip, IN_MODIFIED | IN_ACCESSED);
+               ip->i_flag &= ~IN_ALLMOD;
+               DLOG((DLOG_VNODE, "lfs_vflush: done not flushing ino %d\n",
+                     ip->i_number));
+               lfs_segunlock(fs);
+
+               KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+
+               return 0;
+       }
+
+       fs->lfs_flushvp = vp;
+       if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
+               error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
+               fs->lfs_flushvp = NULL;
+               KASSERT(fs->lfs_flushvp_fakevref == 0);
+               lfs_segunlock(fs);
+
+               /* Make sure that any pending buffers get written */
+               mutex_enter(vp->v_interlock);
+               while (vp->v_numoutput > 0) {
+                       cv_wait(&vp->v_cv, vp->v_interlock);
+               }
+               KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+               KASSERT(vp->v_numoutput == 0);
+               mutex_exit(vp->v_interlock);
+
+               return error;
+       }
+       sp = fs->lfs_sp;
+
+       flushed = 0;
+       if (VPISEMPTY(vp)) {
+               lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY);
+               ++flushed;
+       } else if ((ip->i_flag & IN_CLEANING) &&
+                 (fs->lfs_sp->seg_flags & SEGM_CLEAN)) {
+               ivndebug(vp,"vflush/clean");
+               lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
+               ++flushed;
+       } else if (lfs_dostats) {
+               if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD))
+                       ++lfs_stats.vflush_invoked;
+               ivndebug(vp,"vflush");
+       }
+
+#ifdef DIAGNOSTIC
+       if (vp->v_uflag & VU_DIROP) {
+               DLOG((DLOG_VNODE, "lfs_vflush: flushing VU_DIROP\n"));
+               /* panic("lfs_vflush: VU_DIROP being flushed...this can\'t happen"); */
+       }
+#endif
+
+       do {
+               loopcount = 0;
+               do {
+                       if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
+                               relock = lfs_writefile(fs, sp, vp);
+                               if (relock) {
+                                       /*
+                                        * Might have to wait for the
+                                        * cleaner to run; but we're
+                                        * still not done with this vnode.
+                                        */
+                                       KDASSERT(ip->i_number != LFS_IFILE_INUM);
+                                       lfs_writeinode(fs, sp, ip);
+                                       mutex_enter(&lfs_lock);
+                                       LFS_SET_UINO(ip, IN_MODIFIED);
+                                       mutex_exit(&lfs_lock);
+                                       lfs_writeseg(fs, sp);
+                                       lfs_segunlock(fs);
+                                       lfs_segunlock_relock(fs);
+                                       goto top;
+                               }
+                       }
+                       /*
+                        * If we begin a new segment in the middle of writing
+                        * the Ifile, it creates an inconsistent checkpoint,
+                        * since the Ifile information for the new segment
+                        * is not up-to-date.  Take care of this here by
+                        * sending the Ifile through again in case there
+                        * are newly dirtied blocks.  But wait, there's more!
+                        * This second Ifile write could *also* cross a segment
+                        * boundary, if the first one was large.  The second
+                        * one is guaranteed to be no more than 8 blocks,
+                        * though (two segment blocks and supporting indirects)
+                        * so the third write *will not* cross the boundary.
+                        */
+                       if (vp == fs->lfs_ivnode) {
+                               lfs_writefile(fs, sp, vp);
+                               lfs_writefile(fs, sp, vp);
+                       }
+#ifdef DEBUG
+                       if (++loopcount > 2)
+                               log(LOG_NOTICE, "lfs_vflush: looping count=%d\n", loopcount);
+#endif
+               } while (lfs_writeinode(fs, sp, ip));
+       } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
+
+       if (lfs_dostats) {
+               ++lfs_stats.nwrites;
+               if (sp->seg_flags & SEGM_SYNC)
+                       ++lfs_stats.nsync_writes;
+               if (sp->seg_flags & SEGM_CKP)
+                       ++lfs_stats.ncheckpoints;
+       }
+       /*
+        * If we were called from somewhere that has already held the seglock
+        * (e.g., lfs_markv()), the lfs_segunlock will not wait for
+        * the write to complete because we are still locked.
+        * Since lfs_vflush() must return the vnode with no dirty buffers,
+        * we must explicitly wait, if that is the case.
+        *
+        * We compare the iocount against 1, not 0, because it is
+        * artificially incremented by lfs_seglock().
+        */
+       mutex_enter(&lfs_lock);
+       if (fs->lfs_seglock > 1) {
+               while (fs->lfs_iocount > 1)
+                       (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+                                    "lfs_vflush", 0, &lfs_lock);
+       }
+       mutex_exit(&lfs_lock);
+
+       lfs_segunlock(fs);
+
+       /* Wait for these buffers to be recovered by aiodoned */
+       mutex_enter(vp->v_interlock);
+       while (vp->v_numoutput > 0) {
+               cv_wait(&vp->v_cv, vp->v_interlock);
+       }
+       KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
+       KASSERT(vp->v_numoutput == 0);
+       mutex_exit(vp->v_interlock);
+
+       fs->lfs_flushvp = NULL;
+       KASSERT(fs->lfs_flushvp_fakevref == 0);
+
+       return (0);
+}
+
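+/*
+ * Walk the mount point's vnode list, newest first, writing the file
+ * data and inode of each vnode selected by `op'.  If a vnode moves to
+ * another mount point mid-walk, the scan starts over.
+ */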
+int
+lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
+{
+       struct inode *ip;
+       struct vnode *vp;
+       int inodes_written = 0, only_cleaning;
+       int error = 0;
+
+       ASSERT_SEGLOCK(fs);
+ loop:
+       /* start at last (newest) vnode. */
+       mutex_enter(&mntvnode_lock);
+       TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
+               /*
+                * If the vnode that we are about to sync is no longer
+                * associated with this mount point, start over.
+                */
+               if (vp->v_mount != mp) {
+                       DLOG((DLOG_VNODE, "lfs_writevnodes: starting over\n"));
+                       /*
+                        * After this, pages might be busy
+                        * due to our own previous putpages.
+                        * Start actual segment write here to avoid deadlock.
+                        */
+                       mutex_exit(&mntvnode_lock);
+                       (void)lfs_writeseg(fs, sp);
+                       goto loop;
+               }
+
+               mutex_enter(vp->v_interlock);
+               if (vp->v_type == VNON || vismarker(vp) ||
+                   (vp->v_iflag & VI_CLEAN) != 0) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+
+               ip = VTOI(vp);
+               if ((op == VN_DIROP && !(vp->v_uflag & VU_DIROP)) ||
+                   (op != VN_DIROP && op != VN_CLEAN &&
+                   (vp->v_uflag & VU_DIROP))) {
+                       mutex_exit(vp->v_interlock);
+                       vndebug(vp,"dirop");
+                       continue;
+               }
+
+               if (op == VN_EMPTY && !VPISEMPTY(vp)) {
+                       mutex_exit(vp->v_interlock);
+                       vndebug(vp,"empty");
+                       continue;
+               }
+
+               if (op == VN_CLEAN && ip->i_number != LFS_IFILE_INUM
+                  && vp != fs->lfs_flushvp
+                  && !(ip->i_flag & IN_CLEANING)) {
+                       mutex_exit(vp->v_interlock);
+                       vndebug(vp,"cleaning");
+                       continue;
+               }
+
+               mutex_exit(&mntvnode_lock);
+               if (lfs_vref(vp)) {
+                       vndebug(vp,"vref");
+                       mutex_enter(&mntvnode_lock);
+                       continue;
+               }
+
+               only_cleaning = 0;
+               /*
+                * Write the inode/file if dirty and it's not the IFILE.
+                */
+               if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) {
+                       only_cleaning =
+                           ((ip->i_flag & IN_ALLMOD) == IN_CLEANING);
+
+                       if (ip->i_number != LFS_IFILE_INUM) {
+                               error = lfs_writefile(fs, sp, vp);
+                               if (error) {
+                                       lfs_vunref(vp);
+                                       if (error == EAGAIN) {
+                                               /*
+                                                * This error from lfs_putpages
+                                                * indicates we need to drop
+                                                * the segment lock and start
+                                                * over after the cleaner has
+                                                * had a chance to run.
+                                                */
+                                               lfs_writeinode(fs, sp, ip);
+                                               lfs_writeseg(fs, sp);
+                                               if (!VPISEMPTY(vp) &&
+                                                   !WRITEINPROG(vp) &&
+                                                   !(ip->i_flag & IN_ALLMOD)) {
+                                                       mutex_enter(&lfs_lock);
+                                                       LFS_SET_UINO(ip, IN_MODIFIED);
+                                                       mutex_exit(&lfs_lock);
+                                               }
+                                               mutex_enter(&mntvnode_lock);
+                                               break;
+                                       }
+                                       error = 0; /* XXX not quite right */
+                                       mutex_enter(&mntvnode_lock);
+                                       continue;
+                               }
+                               
+                               if (!VPISEMPTY(vp)) {
+                                       if (WRITEINPROG(vp)) {
+                                               ivndebug(vp,"writevnodes/write2");
+                                       } else if (!(ip->i_flag & IN_ALLMOD)) {
+                                               mutex_enter(&lfs_lock);
+                                               LFS_SET_UINO(ip, IN_MODIFIED);
+                                               mutex_exit(&lfs_lock);
+                                       }
+                               }
+                               (void) lfs_writeinode(fs, sp, ip);
+                               inodes_written++;
+                       }
+               }
+
+               if (lfs_clean_vnhead && only_cleaning)
+                       lfs_vunref_head(vp);
+               else
+                       lfs_vunref(vp);
+
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       return error;
+}
+
+/*
+ * Do a checkpoint.
+ */
+int
+lfs_segwrite(struct mount *mp, int flags)
+{
+       struct buf *bp;
+       struct inode *ip;
+       struct lfs *fs;
+       struct segment *sp;
+       struct vnode *vp;
+       SEGUSE *segusep;
+       int do_ckp, did_ckp, error;
+       unsigned n, segleft, maxseg, sn, i, curseg;
+       int writer_set = 0;
+       int dirty;
+       int redo;
+       int um_error;
+       int loopcount;
+
+       fs = VFSTOUFS(mp)->um_lfs;
+       ASSERT_MAYBE_SEGLOCK(fs);
+
+       if (fs->lfs_ronly)
+               return EROFS;
+
+       lfs_imtime(fs);
+
+       /*
+        * Allocate a segment structure and enough space to hold pointers to
+        * the maximum possible number of buffers which can be described in a
+        * single summary block.
+        */
+       do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
+
+       lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
+       sp = fs->lfs_sp;
+       if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
+               do_ckp = 1;
+
+       /*
+        * If lfs_flushvp is non-NULL, we are called from lfs_vflush,
+        * in which case we have to flush *all* buffers off of this vnode.
+        * We don't care about other nodes, but write any non-dirop nodes
+        * anyway in anticipation of another getnewvnode().
+        *
+        * If we're cleaning we only write cleaning and ifile blocks, and
+        * no dirops, since otherwise we'd risk corruption in a crash.
+        */
+       if (sp->seg_flags & SEGM_CLEAN)
+               lfs_writevnodes(fs, mp, sp, VN_CLEAN);
+       else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
+               do {
+                       um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
+
+                       if (do_ckp || fs->lfs_dirops == 0) {
+                               if (!writer_set) {
+                                       lfs_writer_enter(fs, "lfs writer");
+                                       writer_set = 1;
+                               }
+                               error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
+                               if (um_error == 0)
+                                       um_error = error;
+                               /* In case writevnodes errored out */
+                               lfs_flush_dirops(fs);
+                               ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
+                               lfs_finalize_fs_seguse(fs);
+                       }
+                       if (do_ckp && um_error) {
+                               lfs_segunlock_relock(fs);
+                               sp = fs->lfs_sp;
+                       }
+               } while (do_ckp && um_error != 0);
+       }
+
+       /*
+        * If we are doing a checkpoint, mark everything since the
+        * last checkpoint as no longer ACTIVE.
+        */
+       if (do_ckp || fs->lfs_doifile) {
+               segleft = fs->lfs_nseg;
+               curseg = 0;
+               for (n = 0; n < fs->lfs_segtabsz; n++) {
+                       dirty = 0;
+                       if (bread(fs->lfs_ivnode, fs->lfs_cleansz + n,
+                           fs->lfs_bsize, NOCRED, B_MODIFY, &bp))
+                               panic("lfs_segwrite: ifile read");
+                       segusep = (SEGUSE *)bp->b_data;
+                       maxseg = min(segleft, fs->lfs_sepb);
+                       for (i = 0; i < maxseg; i++) {
+                               sn = curseg + i;
+                               if (sn != dtosn(fs, fs->lfs_curseg) &&
+                                   segusep->su_flags & SEGUSE_ACTIVE) {
+                                       segusep->su_flags &= ~SEGUSE_ACTIVE;
+                                       --fs->lfs_nactive;
+                                       ++dirty;
+                               }
+                               fs->lfs_suflags[fs->lfs_activesb][sn] =
+                                       segusep->su_flags;
+                               if (fs->lfs_version > 1)
+                                       ++segusep;
+                               else
+                                       segusep = (SEGUSE *)
+                                               ((SEGUSE_V1 *)segusep + 1);
+                       }
+
+                       if (dirty)
+                               error = LFS_BWRITE_LOG(bp); /* Ifile */
+                       else
+                               brelse(bp, 0);
+                       segleft -= fs->lfs_sepb;
+                       curseg += fs->lfs_sepb;
+               }
+       }
+
+       KASSERT(LFS_SEGLOCK_HELD(fs));
+
+       did_ckp = 0;
+       if (do_ckp || fs->lfs_doifile) {
+               vp = fs->lfs_ivnode;
+               vn_lock(vp, LK_EXCLUSIVE);
+               loopcount = 0;
+               do {
+#ifdef DEBUG
+                       LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_flags &= ~LFS_IFDIRTY;
+                       mutex_exit(&lfs_lock);
+
+                       ip = VTOI(vp);
+
+                       if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
+                               /*
+                                * Ifile has no pages, so we don't need
+                                * to check error return here.
+                                */
+                               lfs_writefile(fs, sp, vp);
+                               /*
+                                * Ensure the Ifile takes the current segment
+                                * into account.  See comment in lfs_vflush.
+                                */
+                               lfs_writefile(fs, sp, vp);
+                               lfs_writefile(fs, sp, vp);
+                       }
+
+                       if (ip->i_flag & IN_ALLMOD)
+                               ++did_ckp;
+#if 0
+                       redo = (do_ckp ? lfs_writeinode(fs, sp, ip) : 0);
+#else
+                       redo = lfs_writeinode(fs, sp, ip);
+#endif
+                       redo += lfs_writeseg(fs, sp);
+                       mutex_enter(&lfs_lock);
+                       redo += (fs->lfs_flags & LFS_IFDIRTY);
+                       mutex_exit(&lfs_lock);
+#ifdef DEBUG
+                       if (++loopcount > 2)
+                               log(LOG_NOTICE, "lfs_segwrite: looping count=%d\n",
+                                       loopcount);
+#endif
+               } while (redo && do_ckp);
+
+               /*
+                * Unless we are unmounting, the Ifile may continue to have
+                * dirty blocks even after a checkpoint, due to changes to
+                * inodes' atime.  If we're checkpointing, it's "impossible"
+                * for other parts of the Ifile to be dirty after the loop
+                * above, since we hold the segment lock.
+                */
+               mutex_enter(vp->v_interlock);
+               if (LIST_EMPTY(&vp->v_dirtyblkhd)) {
+                       LFS_CLR_UINO(ip, IN_ALLMOD);
+               }
+#ifdef DIAGNOSTIC
+               else if (do_ckp) {
+                       int do_panic = 0;
+                       LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
+                               if (bp->b_lblkno < fs->lfs_cleansz +
+                                   fs->lfs_segtabsz &&
+                                   !(bp->b_flags & B_GATHERED)) {
+                                       printf("ifile lbn %ld still dirty (flags %lx)\n",
+                                               (long)bp->b_lblkno,
+                                               (long)bp->b_flags);
+                                       ++do_panic;
+                               }
+                       }
+                       if (do_panic)
+                               panic("dirty blocks");
+               }
+#endif
+               mutex_exit(vp->v_interlock);
+               VOP_UNLOCK(vp);
+       } else {
+               (void) lfs_writeseg(fs, sp);
+       }
+
+       /* Note Ifile no longer needs to be written */
+       fs->lfs_doifile = 0;
+       if (writer_set)
+               lfs_writer_leave(fs);
+
+       /*
+        * If we didn't write the Ifile, we didn't really do anything.
+        * That means that (1) there is a checkpoint on disk and (2)
+        * nothing has changed since it was written.
+        *
+        * Take the flags off of the segment so that lfs_segunlock
+        * doesn't have to write the superblock either.
+        */
+       if (do_ckp && !did_ckp) {
+               sp->seg_flags &= ~SEGM_CKP;
+       }
+
+       if (lfs_dostats) {
+               ++lfs_stats.nwrites;
+               if (sp->seg_flags & SEGM_SYNC)
+                       ++lfs_stats.nsync_writes;
+               if (sp->seg_flags & SEGM_CKP)
+                       ++lfs_stats.ncheckpoints;
+       }
+       lfs_segunlock(fs);
+       return (0);
+}
+
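+/*
+ * Illustrative call (not from this file): a caller that wants a full
+ * synchronous checkpoint would use something like
+ *
+ *	error = lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+ *
+ * whereas a caller that only wants a routine partial-segment write can
+ * pass flags == 0 and let LFS_SHOULD_CHECKPOINT() decide whether this
+ * write must become a checkpoint.
+ */
+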
+/*
+ * Write the dirty blocks associated with a vnode.
+ */
+int
+lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
+{
+       struct finfo *fip;
+       struct inode *ip;
+       int i, frag;
+       int error;
+
+       ASSERT_SEGLOCK(fs);
+       error = 0;
+       ip = VTOI(vp);
+
+       fip = sp->fip;
+       lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+
+       if (vp->v_uflag & VU_DIROP)
+               ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+       if (sp->seg_flags & SEGM_CLEAN) {
+               lfs_gather(fs, sp, vp, lfs_match_fake);
+               /*
+                * For a file being flushed, we need to write *all* blocks.
+                * This means writing the cleaning blocks first, and then
+                * immediately following with any non-cleaning blocks.
+                * The same is true of the Ifile since checkpoints assume
+                * that all valid Ifile blocks are written.
+                */
+               if (IS_FLUSHING(fs, vp) || vp == fs->lfs_ivnode) {
+                       lfs_gather(fs, sp, vp, lfs_match_data);
+                       /*
+                        * Don't call VOP_PUTPAGES: if we're flushing,
+                        * we've already done it, and the Ifile doesn't
+                        * use the page cache.
+                        */
+               }
+       } else {
+               lfs_gather(fs, sp, vp, lfs_match_data);
+               /*
+                * If we're flushing, we've already called VOP_PUTPAGES
+                * so don't do it again.  Otherwise, we want to write
+                * everything we've got.
+                */
+               if (!IS_FLUSHING(fs, vp)) {
+                       mutex_enter(vp->v_interlock);
+                       error = VOP_PUTPAGES(vp, 0, 0,
+                               PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
+               }
+       }
+
+       /*
+        * It may not be necessary to write the meta-data blocks at this point,
+        * as the roll-forward recovery code should be able to reconstruct the
+        * list.
+        *
+        * We have to write them anyway, though, under two conditions: (1) the
+        * vnode is being flushed (for reuse by vinvalbuf); or (2) we are
+        * checkpointing.
+        *
+        * BUT if we are cleaning, we might have indirect blocks that refer to
+        * new blocks not being written yet, in addition to fragments being
+        * moved out of a cleaned segment.  If that is the case, don't
+        * write the indirect blocks, or the finfo will have a small block
+        * in the middle of it!
+        * XXX in this case isn't the inode size wrong too?
+        */
+       frag = 0;
+       if (sp->seg_flags & SEGM_CLEAN) {
+               for (i = 0; i < NDADDR; i++)
+                       if (ip->i_lfs_fragsize[i] > 0 &&
+                           ip->i_lfs_fragsize[i] < fs->lfs_bsize)
+                               ++frag;
+       }
+#ifdef DIAGNOSTIC
+       if (frag > 1)
+               panic("lfs_writefile: more than one fragment!");
+#endif
+       if (IS_FLUSHING(fs, vp) ||
+           (frag == 0 && (lfs_writeindir || (sp->seg_flags & SEGM_CKP)))) {
+               lfs_gather(fs, sp, vp, lfs_match_indir);
+               lfs_gather(fs, sp, vp, lfs_match_dindir);
+               lfs_gather(fs, sp, vp, lfs_match_tindir);
+       }
+       fip = sp->fip;
+       lfs_release_finfo(fs);
+
+       return error;
+}
+
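+/*
+ * In lfs_writefile above, the order of the lfs_gather() passes is what
+ * enforces the block ordering within the partial segment: for a
+ * cleaning write, lfs_match_fake selects the cleaner's fake buffers
+ * first and lfs_match_data then appends any real dirty blocks; the
+ * indirect blocks (single, double, triple) are gathered last, matching
+ * the assumption in lfs_updatemeta that indirect blocks sort after any
+ * fragment.
+ */
+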
+/*
+ * Update segment accounting to reflect this inode's change of address.
+ */
+static int
+lfs_update_iaddr(struct lfs *fs, struct segment *sp, struct inode *ip, daddr_t ndaddr)
+{
+       struct buf *bp;
+       daddr_t daddr;
+       IFILE *ifp;
+       SEGUSE *sup;
+       ino_t ino;
+       int redo_ifile, error;
+       u_int32_t sn;
+
+       redo_ifile = 0;
+
+       /*
+        * If updating the ifile, update the super-block.  Update the disk
+        * address and access times for this inode in the ifile.
+        */
+       ino = ip->i_number;
+       if (ino == LFS_IFILE_INUM) {
+               daddr = fs->lfs_idaddr;
+               fs->lfs_idaddr = dbtofsb(fs, ndaddr);
+       } else {
+               LFS_IENTRY(ifp, fs, ino, bp);
+               daddr = ifp->if_daddr;
+               ifp->if_daddr = dbtofsb(fs, ndaddr);
+               error = LFS_BWRITE_LOG(bp); /* Ifile */
+       }
+
+       /*
+        * If this is the Ifile and lfs_offset is set to the first block
+        * in the segment, dirty the new segment's accounting block
+        * (XXX should already be dirty?) and tell the caller to do it again.
+        */
+       if (ip->i_number == LFS_IFILE_INUM) {
+               sn = dtosn(fs, fs->lfs_offset);
+               if (sntod(fs, sn) + btofsb(fs, fs->lfs_sumsize) ==
+                   fs->lfs_offset) {
+                       LFS_SEGENTRY(sup, fs, sn, bp);
+                       KASSERT(bp->b_oflags & BO_DELWRI);
+                       LFS_WRITESEGENTRY(sup, fs, sn, bp);
+                       /* fs->lfs_flags |= LFS_IFDIRTY; */
+                       redo_ifile |= 1;
+               }
+       }
+
+       /*
+        * The inode's last address should not be in the current partial
+        * segment, except under exceptional circumstances (lfs_writevnodes
+        * had to start over, and in the meantime more blocks were written
+        * to a vnode).  Both inodes will be accounted to this segment
+        * in lfs_writeseg so we need to subtract the earlier version
+        * here anyway.  The segment byte count can temporarily dip below
+        * zero here; keep track of how many duplicates we have in
+        * sp->ndupino so we don't panic below.
+        */
+       if (daddr >= fs->lfs_lastpseg && daddr <= fs->lfs_offset) {
+               ++sp->ndupino;
+               DLOG((DLOG_SEG, "lfs_writeinode: last inode addr in current pseg "
+                     "(ino %d daddr 0x%llx) ndupino=%d\n", ino,
+                     (long long)daddr, sp->ndupino));
+       }
+       /*
+        * Account the inode: it no longer belongs to its former segment,
+        * though it will not belong to the new segment until that segment
+        * is actually written.
+        */
+       if (daddr != LFS_UNUSED_DADDR) {
+               u_int32_t oldsn = dtosn(fs, daddr);
+#ifdef DIAGNOSTIC
+               int ndupino = (sp->seg_number == oldsn) ? sp->ndupino : 0;
+#endif
+               LFS_SEGENTRY(sup, fs, oldsn, bp);
+#ifdef DIAGNOSTIC
+               if (sup->su_nbytes +
+                   sizeof (struct ufs1_dinode) * ndupino
+                     < sizeof (struct ufs1_dinode)) {
+                       printf("lfs_writeinode: negative bytes "
+                              "(segment %" PRIu32 " short by %d, "
+                              "oldsn=%" PRIu32 ", cursn=%" PRIu32
+                              ", daddr=%" PRId64 ", su_nbytes=%u, "
+                              "ndupino=%d)\n",
+                              dtosn(fs, daddr),
+                              (int)sizeof (struct ufs1_dinode) *
+                                  (1 - sp->ndupino) - sup->su_nbytes,
+                              oldsn, sp->seg_number, daddr,
+                              (unsigned int)sup->su_nbytes,
+                              sp->ndupino);
+                       panic("lfs_writeinode: negative bytes");
+                       sup->su_nbytes = sizeof (struct ufs1_dinode);
+               }
+#endif
+               DLOG((DLOG_SU, "seg %d -= %d for ino %d inode\n",
+                     dtosn(fs, daddr), sizeof (struct ufs1_dinode), ino));
+               sup->su_nbytes -= sizeof (struct ufs1_dinode);
+               redo_ifile |=
+                       (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
+               if (redo_ifile) {
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_flags |= LFS_IFDIRTY;
+                       mutex_exit(&lfs_lock);
+                       /* Don't double-account */
+                       fs->lfs_idaddr = 0x0;
+               }
+               LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */
+       }
+
+       return redo_ifile;
+}
+
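+/*
+ * A worked example of the lfs_update_iaddr() accounting above, purely
+ * for illustration, assuming UFS1 on-disk inodes of
+ * sizeof(struct ufs1_dinode) == 128 bytes: rewriting an inode whose
+ * previous copy lived in segment N subtracts 128 bytes from segment
+ * N's su_nbytes.  If the previous copy sits in the *current* partial
+ * segment, the byte count can briefly go negative, which is what
+ * ndupino compensates for in the DIAGNOSTIC check.
+ */
+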
+int
+lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
+{
+       struct buf *bp;
+       struct ufs1_dinode *cdp;
+       daddr_t daddr;
+       int32_t *daddrp;        /* XXX ondisk32 */
+       int i, ndx;
+       int redo_ifile = 0;
+       int gotblk = 0;
+       int count;
+
+       ASSERT_SEGLOCK(fs);
+       if (!(ip->i_flag & IN_ALLMOD))
+               return (0);
+
+       /* Can't write ifile when writer is not set */
+       KASSERT(ip->i_number != LFS_IFILE_INUM || fs->lfs_writer > 0 ||
+               (sp->seg_flags & SEGM_CLEAN));
+
+       /*
+        * If this is the Ifile, see if writing it here will generate a
+        * temporary misaccounting.  If it will, do the accounting and write
+        * the blocks, postponing the inode write until the accounting is
+        * solid.
+        */
+       count = 0;
+       while (ip->i_number == LFS_IFILE_INUM) {
+               int redo = 0;
+
+               if (sp->idp == NULL && sp->ibp == NULL &&
+                   (sp->seg_bytes_left < fs->lfs_ibsize ||
+                    sp->sum_bytes_left < sizeof(int32_t))) {
+                       (void) lfs_writeseg(fs, sp);
+                       continue;
+               }
+
+               /* Look for dirty Ifile blocks */
+               LIST_FOREACH(bp, &fs->lfs_ivnode->v_dirtyblkhd, b_vnbufs) {
+                       if (!(bp->b_flags & B_GATHERED)) {
+                               redo = 1;
+                               break;
+                       }
+               }
+
+               if (redo == 0)
+                       redo = lfs_update_iaddr(fs, sp, ip, 0x0);
+               if (redo == 0)
+                       break;
+
+               if (sp->idp) {
+                       sp->idp->di_inumber = 0;
+                       sp->idp = NULL;
+               }
+               ++count;
+               if (count > 2)
+                       log(LOG_NOTICE, "lfs_writeinode: looping count=%d\n", count);
+               lfs_writefile(fs, sp, fs->lfs_ivnode);
+       }
+
+       /* Allocate a new inode block if necessary. */
+       if ((ip->i_number != LFS_IFILE_INUM || sp->idp == NULL) &&
+           sp->ibp == NULL) {
+               /* Allocate a new segment if necessary. */
+               if (sp->seg_bytes_left < fs->lfs_ibsize ||
+                   sp->sum_bytes_left < sizeof(int32_t))
+                       (void) lfs_writeseg(fs, sp);
+
+               /* Get next inode block. */
+               daddr = fs->lfs_offset;
+               fs->lfs_offset += btofsb(fs, fs->lfs_ibsize);
+               sp->ibp = *sp->cbpp++ =
+                       getblk(VTOI(fs->lfs_ivnode)->i_devvp,
+                           fsbtodb(fs, daddr), fs->lfs_ibsize, 0, 0);
+               gotblk++;
+
+               /* Zero out inode numbers */
+               for (i = 0; i < INOPB(fs); ++i)
+                       ((struct ufs1_dinode *)sp->ibp->b_data)[i].di_inumber =
+                           0;
+
+               ++sp->start_bpp;
+               fs->lfs_avail -= btofsb(fs, fs->lfs_ibsize);
+               /* Set remaining space counters. */
+               sp->seg_bytes_left -= fs->lfs_ibsize;
+               sp->sum_bytes_left -= sizeof(int32_t);
+               ndx = fs->lfs_sumsize / sizeof(int32_t) -
+                       sp->ninodes / INOPB(fs) - 1;
+               ((int32_t *)(sp->segsum))[ndx] = daddr;
+       }
+
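+       /*
+        * Note on the index math above: the disk addresses of inode
+        * blocks are stored as int32_t entries growing *down* from the
+        * end of the summary block.  For illustration, with
+        * lfs_sumsize == 8192 there are 2048 slots; the first inode
+        * block's address lands at index 2047, the next at 2046, and
+        * so on, while the FINFOs grow up from the front.
+        */
+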
+       /* Check VU_DIROP in case there is a new file with no data blocks */
+       if (ITOV(ip)->v_uflag & VU_DIROP)
+               ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+       /* Update the inode times and copy the inode onto the inode page. */
+       /* XXX kludge --- don't redirty the ifile just to put times on it */
+       if (ip->i_number != LFS_IFILE_INUM)
+               LFS_ITIMES(ip, NULL, NULL, NULL);
+
+       /*
+        * If this is the Ifile, and we've already written the Ifile in this
+        * partial segment, just overwrite it (it's not on disk yet) and
+        * continue.
+        *
+        * XXX we know that the bp that we get the second time around has
+        * already been gathered.
+        */
+       if (ip->i_number == LFS_IFILE_INUM && sp->idp) {
+               *(sp->idp) = *ip->i_din.ffs1_din;
+               ip->i_lfs_osize = ip->i_size;
+               return 0;
+       }
+
+       bp = sp->ibp;
+       cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
+       *cdp = *ip->i_din.ffs1_din;
+
+       /*
+        * If cleaning, link counts and directory file sizes cannot change,
+        * since those would be directory operations; even if the file
+        * we are writing is marked VU_DIROP we should write the old values.
+        * If we're not cleaning, of course, update the values so we get
+        * current values the next time we clean.
+        */
+       if (sp->seg_flags & SEGM_CLEAN) {
+               if (ITOV(ip)->v_uflag & VU_DIROP) {
+                       cdp->di_nlink = ip->i_lfs_odnlink;
+                       /* if (ITOV(ip)->v_type == VDIR) */
+                       cdp->di_size = ip->i_lfs_osize;
+               }
+       } else {
+               ip->i_lfs_odnlink = cdp->di_nlink;
+               ip->i_lfs_osize = ip->i_size;
+       }
+
+       /* We can finish the segment accounting for truncations now */
+       lfs_finalize_ino_seguse(fs, ip);
+
+       /*
+        * If we are cleaning, ensure that we don't write UNWRITTEN disk
+        * addresses to disk; possibly change the on-disk record of
+        * the inode size, either by reverting to the previous size
+        * (in the case of cleaning) or by verifying the inode's block
+        * holdings (in the case of files being allocated as they are being
+        * written).
+        * XXX By not writing UNWRITTEN blocks, we are making the lfs_avail
+        * XXX count on disk wrong by the same amount.  We should be
+        * XXX able to "borrow" from lfs_avail and return it after the
+        * XXX Ifile is written.  See also in lfs_writeseg.
+        */
+
+       /* Check file size based on highest allocated block */
+       if (((ip->i_ffs1_mode & IFMT) == IFREG ||
+            (ip->i_ffs1_mode & IFMT) == IFDIR) &&
+           ip->i_size > ((ip->i_lfs_hiblk + 1) << fs->lfs_bshift)) {
+               cdp->di_size = (ip->i_lfs_hiblk + 1) << fs->lfs_bshift;
+               DLOG((DLOG_SEG, "lfs_writeinode: ino %d size %" PRId64 " -> %"
+                     PRId64 "\n", (int)ip->i_number, ip->i_size, cdp->di_size));
+       }
+       if (ip->i_lfs_effnblks != ip->i_ffs1_blocks) {
+               DLOG((DLOG_SEG, "lfs_writeinode: cleansing ino %d eff %d != nblk %d)"
+                     " at %x\n", ip->i_number, ip->i_lfs_effnblks,
+                     ip->i_ffs1_blocks, fs->lfs_offset));
+               for (daddrp = cdp->di_db; daddrp < cdp->di_ib + NIADDR;
+                    daddrp++) {
+                       if (*daddrp == UNWRITTEN) {
+                               DLOG((DLOG_SEG, "lfs_writeinode: wiping UNWRITTEN\n"));
+                               *daddrp = 0;
+                       }
+               }
+       }
+
+#ifdef DIAGNOSTIC
+       /*
+        * Check dinode held blocks against dinode size.
+        * This should be identical to the check in lfs_vget().
+        */
+       for (i = (cdp->di_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
+            i < NDADDR; i++) {
+               KASSERT(i >= 0);
+               if ((cdp->di_mode & IFMT) == IFLNK)
+                       continue;
+               if (((cdp->di_mode & IFMT) == IFBLK ||
+                    (cdp->di_mode & IFMT) == IFCHR) && i == 0)
+                       continue;
+               if (cdp->di_db[i] != 0) {
+# ifdef DEBUG
+                       lfs_dump_dinode(cdp);
+# endif
+                       panic("writing inconsistent inode");
+               }
+       }
+#endif /* DIAGNOSTIC */
+
+       if (ip->i_flag & IN_CLEANING)
+               LFS_CLR_UINO(ip, IN_CLEANING);
+       else {
+               /* XXX IN_ALLMOD */
+               LFS_CLR_UINO(ip, IN_ACCESSED | IN_ACCESS | IN_CHANGE |
+                            IN_UPDATE | IN_MODIFY);
+               if (ip->i_lfs_effnblks == ip->i_ffs1_blocks)
+                       LFS_CLR_UINO(ip, IN_MODIFIED);
+               else {
+                       DLOG((DLOG_VNODE, "lfs_writeinode: ino %d: real "
+                           "blks=%d, eff=%d\n", ip->i_number,
+                           ip->i_ffs1_blocks, ip->i_lfs_effnblks));
+               }
+       }
+
+       if (ip->i_number == LFS_IFILE_INUM) {
+               /* We know sp->idp == NULL */
+               sp->idp = ((struct ufs1_dinode *)bp->b_data) +
+                       (sp->ninodes % INOPB(fs));
+
+               /* Not dirty any more */
+               mutex_enter(&lfs_lock);
+               fs->lfs_flags &= ~LFS_IFDIRTY;
+               mutex_exit(&lfs_lock);
+       }
+
+       if (gotblk) {
+               mutex_enter(&bufcache_lock);
+               LFS_LOCK_BUF(bp);
+               brelsel(bp, 0);
+               mutex_exit(&bufcache_lock);
+       }
+
+       /* Increment inode count in segment summary block. */
+       ++((SEGSUM *)(sp->segsum))->ss_ninos;
+
+       /* If this page is full, set flag to allocate a new page. */
+       if (++sp->ninodes % INOPB(fs) == 0)
+               sp->ibp = NULL;
+
+       redo_ifile = lfs_update_iaddr(fs, sp, ip, bp->b_blkno);
+
+       KASSERT(redo_ifile == 0);
+       return (redo_ifile);
+}
+
+int
+lfs_gatherblock(struct segment *sp, struct buf *bp, kmutex_t *mptr)
+{
+       struct lfs *fs;
+       int vers;
+       int j, blksinblk;
+
+       ASSERT_SEGLOCK(sp->fs);
+       /*
+        * If full, finish this segment.  We may be doing I/O, so
+        * release and reacquire the caller's buffer-cache mutex (mptr),
+        * if one was passed in.
+        */
+#ifdef DIAGNOSTIC
+       if (sp->vp == NULL)
+               panic ("lfs_gatherblock: Null vp in segment");
+#endif
+       fs = sp->fs;
+       blksinblk = howmany(bp->b_bcount, fs->lfs_bsize);
+       if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk ||
+           sp->seg_bytes_left < bp->b_bcount) {
+               if (mptr)
+                       mutex_exit(mptr);
+               lfs_updatemeta(sp);
+
+               vers = sp->fip->fi_version;
+               (void) lfs_writeseg(fs, sp);
+
+               /* Add the current file to the segment summary. */
+               lfs_acquire_finfo(fs, VTOI(sp->vp)->i_number, vers);
+
+               if (mptr)
+                       mutex_enter(mptr);
+               return (1);
+       }
+
+       if (bp->b_flags & B_GATHERED) {
+               DLOG((DLOG_SEG, "lfs_gatherblock: already gathered! Ino %d,"
+                     " lbn %" PRId64 "\n",
+                     sp->fip->fi_ino, bp->b_lblkno));
+               return (0);
+       }
+
+       /* Insert into the buffer list, update the FINFO block. */
+       bp->b_flags |= B_GATHERED;
+
+       *sp->cbpp++ = bp;
+       for (j = 0; j < blksinblk; j++) {
+               sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + j;
+               /* This block's accounting moves from lfs_favail to lfs_avail */
+               lfs_deregister_block(sp->vp, bp->b_lblkno + j);
+       }
+
+       sp->sum_bytes_left -= sizeof(int32_t) * blksinblk;
+       sp->seg_bytes_left -= bp->b_bcount;
+       return (0);
+}
+
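+/*
+ * Accounting sketch for lfs_gatherblock() above: each logical block
+ * charged to the partial segment costs sizeof(int32_t) bytes of
+ * sum_bytes_left (its FINFO entry) plus its b_bcount of seg_bytes_left;
+ * an oversized buffer from lfs_gop_write counts as blksinblk FINFO
+ * entries, one per lfs_bsize subblock.
+ */
+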
+int
+lfs_gather(struct lfs *fs, struct segment *sp, struct vnode *vp,
+    int (*match)(struct lfs *, struct buf *))
+{
+       struct buf *bp, *nbp;
+       int count = 0;
+
+       ASSERT_SEGLOCK(fs);
+       if (vp->v_type == VBLK)
+               return 0;
+       KASSERT(sp->vp == NULL);
+       sp->vp = vp;
+       mutex_enter(&bufcache_lock);
+
+#ifndef LFS_NO_BACKBUF_HACK
+/* This is a hack to see if ordering the blocks in LFS makes a difference. */
+# define       BUF_OFFSET      \
+       (((char *)&LIST_NEXT(bp, b_vnbufs)) - (char *)bp)
+# define       BACK_BUF(BP)    \
+       ((struct buf *)(((char *)(BP)->b_vnbufs.le_prev) - BUF_OFFSET))
+# define       BEG_OF_LIST     \
+       ((struct buf *)(((char *)&LIST_FIRST(&vp->v_dirtyblkhd)) - BUF_OFFSET))
+
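+/*
+ * How the hack works: LIST_* lists are linked forward, with a back
+ * pointer (le_prev) to the *previous element's le_next field*.
+ * BUF_OFFSET is the byte offset of the b_vnbufs link inside struct buf,
+ * so BACK_BUF() recovers the previous struct buf from le_prev, and
+ * BEG_OF_LIST is the fake "buffer" whose link field is the list head,
+ * used as the loop sentinel.
+ */
+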
+loop:
+       /* Find last buffer. */
+       for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
+            bp && LIST_NEXT(bp, b_vnbufs) != NULL;
+            bp = LIST_NEXT(bp, b_vnbufs))
+               /* nothing */;
+       for (; bp && bp != BEG_OF_LIST; bp = nbp) {
+               nbp = BACK_BUF(bp);
+#else /* LFS_NO_BACKBUF_HACK */
+loop:
+       for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+               nbp = LIST_NEXT(bp, b_vnbufs);
+#endif /* LFS_NO_BACKBUF_HACK */
+               if ((bp->b_cflags & BC_BUSY) != 0 ||
+                   (bp->b_flags & B_GATHERED) != 0 || !match(fs, bp)) {
+#ifdef DEBUG
+                       if (vp == fs->lfs_ivnode &&
+                           (bp->b_cflags & BC_BUSY) != 0 &&
+                           (bp->b_flags & B_GATHERED) == 0)
+                               log(LOG_NOTICE, "lfs_gather: ifile lbn %"
+                                     PRId64 " busy (%x) at 0x%x",
+                                     bp->b_lblkno, bp->b_flags,
+                                     (unsigned)fs->lfs_offset);
+#endif
+                       continue;
+               }
+#ifdef DIAGNOSTIC
+# ifdef LFS_USE_B_INVAL
+               if ((bp->b_cflags & BC_INVAL) != 0 && bp->b_iodone == NULL) {
+                       DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
+                             " is BC_INVAL\n", bp->b_lblkno));
+                       VOP_PRINT(bp->b_vp);
+               }
+# endif /* LFS_USE_B_INVAL */
+               if (!(bp->b_oflags & BO_DELWRI))
+                       panic("lfs_gather: bp not BO_DELWRI");
+               if (!(bp->b_flags & B_LOCKED)) {
+                       DLOG((DLOG_SEG, "lfs_gather: lbn %" PRId64
+                             " blk %" PRId64 " not B_LOCKED\n",
+                             bp->b_lblkno,
+                             dbtofsb(fs, bp->b_blkno)));
+                       VOP_PRINT(bp->b_vp);
+                       panic("lfs_gather: bp not B_LOCKED");
+               }
+#endif
+               if (lfs_gatherblock(sp, bp, &bufcache_lock)) {
+                       goto loop;
+               }
+               count++;
+       }
+       mutex_exit(&bufcache_lock);
+       lfs_updatemeta(sp);
+       KASSERT(sp->vp == vp);
+       sp->vp = NULL;
+       return count;
+}
+
+#if DEBUG
+# define DEBUG_OOFF(n) do {                                            \
+       if (ooff == 0) {                                                \
+               DLOG((DLOG_SEG, "lfs_updatemeta[%d]: warning: writing " \
+                       "ino %d lbn %" PRId64 " at 0x%" PRIx32          \
+                       ", was 0x0 (or %" PRId64 ")\n",                 \
+                       (n), ip->i_number, lbn, ndaddr, daddr));        \
+       }                                                               \
+} while (0)
+#else
+# define DEBUG_OOFF(n)
+#endif
+
+/*
+ * Change the given block's address to ndaddr, finding its previous
+ * location using ufs_bmaparray().
+ *
+ * Account for this change in the segment table.
+ *
+ * Called with sp == NULL by the roll-forward code.
+ */
+void
+lfs_update_single(struct lfs *fs, struct segment *sp,
+    struct vnode *vp, daddr_t lbn, int32_t ndaddr, int size)
+{
+       SEGUSE *sup;
+       struct buf *bp;
+       struct indir a[NIADDR + 2], *ap;
+       struct inode *ip;
+       daddr_t daddr, ooff;
+       int num, error;
+       int bb, osize, obb;
+
+       ASSERT_SEGLOCK(fs);
+       KASSERT(sp == NULL || sp->vp == vp);
+       ip = VTOI(vp);
+
+       error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL, NULL);
+       if (error)
+               panic("lfs_updatemeta: ufs_bmaparray returned %d", error);
+
+       daddr = (daddr_t)((int32_t)daddr); /* XXX ondisk32 */
+       KASSERT(daddr <= LFS_MAX_DADDR);
+       if (daddr > 0)
+               daddr = dbtofsb(fs, daddr);
+
+       bb = numfrags(fs, size);
+       switch (num) {
+           case 0:
+                   ooff = ip->i_ffs1_db[lbn];
+                   DEBUG_OOFF(0);
+                   if (ooff == UNWRITTEN)
+                           ip->i_ffs1_blocks += bb;
+                   else {
+                           /* possible fragment truncation or extension */
+                           obb = btofsb(fs, ip->i_lfs_fragsize[lbn]);
+                           ip->i_ffs1_blocks += (bb - obb);
+                   }
+                   ip->i_ffs1_db[lbn] = ndaddr;
+                   break;
+           case 1:
+                   ooff = ip->i_ffs1_ib[a[0].in_off];
+                   DEBUG_OOFF(1);
+                   if (ooff == UNWRITTEN)
+                           ip->i_ffs1_blocks += bb;
+                   ip->i_ffs1_ib[a[0].in_off] = ndaddr;
+                   break;
+           default:
+                   ap = &a[num - 1];
+                   if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED,
+                       B_MODIFY, &bp))
+                           panic("lfs_updatemeta: bread bno %" PRId64,
+                                 ap->in_lbn);
+
+                   /* XXX ondisk32 */
+                   ooff = ((int32_t *)bp->b_data)[ap->in_off];
+                   DEBUG_OOFF(num);
+                   if (ooff == UNWRITTEN)
+                           ip->i_ffs1_blocks += bb;
+                   /* XXX ondisk32 */
+                   ((int32_t *)bp->b_data)[ap->in_off] = ndaddr;
+                   (void) VOP_BWRITE(bp->b_vp, bp);
+       }
+
+       KASSERT(ooff == 0 || ooff == UNWRITTEN || ooff == daddr);
+
+       /* Update hiblk when extending the file */
+       if (lbn > ip->i_lfs_hiblk)
+               ip->i_lfs_hiblk = lbn;
+
+       /*
+        * Though we'd rather it couldn't, this *can* happen right now
+        * if cleaning blocks and regular blocks coexist.
+        */
+       /* KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); */
+
+       /*
+        * Update segment usage information, based on old size
+        * and location.
+        */
+       if (daddr > 0) {
+               u_int32_t oldsn = dtosn(fs, daddr);
+#ifdef DIAGNOSTIC
+               int ndupino;
+
+               if (sp && sp->seg_number == oldsn) {
+                       ndupino = sp->ndupino;
+               } else {
+                       ndupino = 0;
+               }
+#endif
+               KASSERT(oldsn < fs->lfs_nseg);
+               if (lbn >= 0 && lbn < NDADDR)
+                       osize = ip->i_lfs_fragsize[lbn];
+               else
+                       osize = fs->lfs_bsize;
+               LFS_SEGENTRY(sup, fs, oldsn, bp);
+#ifdef DIAGNOSTIC
+               if (sup->su_nbytes + sizeof (struct ufs1_dinode) * ndupino
+                   < osize) {
+                       printf("lfs_updatemeta: negative bytes "
+                              "(segment %" PRIu32 " short by %" PRId64
+                              ")\n", dtosn(fs, daddr),
+                              (int64_t)osize -
+                              (sizeof (struct ufs1_dinode) * ndupino +
+                               sup->su_nbytes));
+                       printf("lfs_updatemeta: ino %llu, lbn %" PRId64
+                              ", addr = 0x%" PRIx64 "\n",
+                              (unsigned long long)ip->i_number, lbn, daddr);
+                       printf("lfs_updatemeta: ndupino=%d\n", ndupino);
+                       panic("lfs_updatemeta: negative bytes");
+                       sup->su_nbytes = osize -
+                           sizeof (struct ufs1_dinode) * ndupino;
+               }
+#endif
+               DLOG((DLOG_SU, "seg %" PRIu32 " -= %d for ino %d lbn %" PRId64
+                     " db 0x%" PRIx64 "\n",
+                     dtosn(fs, daddr), osize,
+                     ip->i_number, lbn, daddr));
+               sup->su_nbytes -= osize;
+               if (!(bp->b_flags & B_GATHERED)) {
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_flags |= LFS_IFDIRTY;
+                       mutex_exit(&lfs_lock);
+               }
+               LFS_WRITESEGENTRY(sup, fs, oldsn, bp);
+       }
+       /*
+        * Now that this block has a new address, and its old
+        * segment no longer owns it, we can forget about its
+        * old size.
+        */
+       if (lbn >= 0 && lbn < NDADDR)
+               ip->i_lfs_fragsize[lbn] = size;
+}
+
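+/*
+ * A worked example for the direct-block case in lfs_update_single()
+ * above, purely illustrative: assuming lfs_bsize == 8192 and a
+ * 1024-byte fragment size, extending a last block from 3072 to 8192
+ * bytes gives obb = 3 and bb = 8, so i_ffs1_blocks grows by 5
+ * fragments; an UNWRITTEN entry instead credits the full bb, since no
+ * old size was ever accounted for it.
+ */
+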
+/*
+ * Update the metadata that points to the blocks listed in the FINFO
+ * array.
+ */
+void
+lfs_updatemeta(struct segment *sp)
+{
+       struct buf *sbp;
+       struct lfs *fs;
+       struct vnode *vp;
+       daddr_t lbn;
+       int i, nblocks, num;
+       int bb;
+       int bytesleft, size;
+
+       ASSERT_SEGLOCK(sp->fs);
+       vp = sp->vp;
+       nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp;
+       KASSERT(nblocks >= 0);
+       KASSERT(vp != NULL);
+       if (nblocks == 0)
+               return;
+
+       /*
+        * This count may be high due to oversize blocks from lfs_gop_write.
+        * Correct for this. (XXX we should be able to keep track of these.)
+        */
+       fs = sp->fs;
+       for (i = 0; i < nblocks; i++) {
+               if (sp->start_bpp[i] == NULL) {
+                       DLOG((DLOG_SEG, "lfs_updatemeta: nblocks = %d, not %d\n", i, nblocks));
+                       nblocks = i;
+                       break;
+               }
+               num = howmany(sp->start_bpp[i]->b_bcount, fs->lfs_bsize);
+               KASSERT(sp->start_bpp[i]->b_lblkno >= 0 || num == 1);
+               nblocks -= num - 1;
+       }
+
+       KASSERT(vp->v_type == VREG ||
+          nblocks == &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp);
+       KASSERT(nblocks == sp->cbpp - sp->start_bpp);
+
+       /*
+        * Sort the blocks.
+        *
+        * We have to sort even if the blocks come from the
+        * cleaner, because there might be other pending blocks on the
+        * same inode...and if we don't sort, and there are fragments
+        * present, blocks may be written in the wrong place.
+        */
+       lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks, fs->lfs_bsize);
+
+       /*
+        * Record the length of the last block in case it's a fragment.
+        * If there are indirect blocks present, they sort last.  An
+        * indirect block will be lfs_bsize and its presence indicates
+        * that you cannot have fragments.
+        *
+        * XXX This last is a lie.  A cleaned fragment can coexist with
+        * XXX a later indirect block.  This will continue to be
+        * XXX true until lfs_markv is fixed to do everything with
+        * XXX fake blocks (including fake inodes and fake indirect blocks).
+        */
+       sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) &
+               fs->lfs_bmask) + 1;
+
+       /*
+        * Assign disk addresses, and update references to the logical
+        * block and the segment usage information.
+        */
+       for (i = nblocks; i--; ++sp->start_bpp) {
+               sbp = *sp->start_bpp;
+               lbn = *sp->start_lbp;
+               KASSERT(sbp->b_lblkno == lbn);
+
+               sbp->b_blkno = fsbtodb(fs, fs->lfs_offset);
+
+               /*
+                * If we write a frag in the wrong place, the cleaner won't
+                * be able to correctly identify its size later, and the
+                * segment will be uncleanable.  (Even worse, it will assume
+                * that the indirect block that actually ends the list
+                * is of a smaller size!)
+                */
+               if ((sbp->b_bcount & fs->lfs_bmask) && i != 0)
+                       panic("lfs_updatemeta: fragment is not last block");
+
+               /*
+                * For each subblock in this possibly oversized block,
+                * update its address on disk.
+                */
+               KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize);
+               KASSERT(vp == sbp->b_vp);
+               for (bytesleft = sbp->b_bcount; bytesleft > 0;
+                    bytesleft -= fs->lfs_bsize) {
+                       size = MIN(bytesleft, fs->lfs_bsize);
+                       bb = numfrags(fs, size);
+                       lbn = *sp->start_lbp++;
+                       lfs_update_single(fs, sp, sp->vp, lbn, fs->lfs_offset,
+                           size);
+                       fs->lfs_offset += bb;
+               }
+
+       }
+
+       /* This inode has been modified */
+       LFS_SET_UINO(VTOI(vp), IN_MODIFIED);
+}
+
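+/*
+ * The fi_lastlength computation in lfs_updatemeta() above reduces to
+ * ((b_bcount - 1) mod lfs_bsize) + 1.  For illustration, with
+ * lfs_bsize == 8192: a full 8192-byte block yields 8192, while a
+ * trailing 3072-byte fragment yields 3072, so the cleaner can later
+ * recover the fragment's true size from the FINFO.
+ */
+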
+/*
+ * Move lfs_offset to a segment earlier than sn.
+ */
+int
+lfs_rewind(struct lfs *fs, int newsn)
+{
+       int sn, osn, isdirty;
+       struct buf *bp;
+       SEGUSE *sup;
+
+       ASSERT_SEGLOCK(fs);
+
+       osn = dtosn(fs, fs->lfs_offset);
+       if (osn < newsn)
+               return 0;
+
+       /* lfs_avail eats the remaining space in this segment */
+       fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset - fs->lfs_curseg);
+
+       /* Find a low-numbered segment */
+       for (sn = 0; sn < fs->lfs_nseg; ++sn) {
+               LFS_SEGENTRY(sup, fs, sn, bp);
+               isdirty = sup->su_flags & SEGUSE_DIRTY;
+               brelse(bp, 0);
+
+               if (!isdirty)
+                       break;
+       }
+       if (sn == fs->lfs_nseg)
+               panic("lfs_rewind: no clean segments");
+       if (newsn >= 0 && sn >= newsn)
+               return ENOENT;
+       fs->lfs_nextseg = sn;
+       lfs_newseg(fs);
+       fs->lfs_offset = fs->lfs_curseg;
+
+       return 0;
+}
+
+/*
+ * Start a new partial segment.
+ *
+ * Return 1 if we advanced to a new segment;
+ * otherwise, return 0.
+ */
+int
+lfs_initseg(struct lfs *fs)
+{
+       struct segment *sp = fs->lfs_sp;
+       SEGSUM *ssp;
+       struct buf *sbp;        /* buffer for SEGSUM */
+       int repeat = 0;         /* return value */
+
+       ASSERT_SEGLOCK(fs);
+       /* Advance to the next segment. */
+       if (!LFS_PARTIAL_FITS(fs)) {
+               SEGUSE *sup;
+               struct buf *bp;
+
+               /* lfs_avail eats the remaining space */
+               fs->lfs_avail -= fs->lfs_fsbpseg - (fs->lfs_offset -
+                                                  fs->lfs_curseg);
+               /* Wake up any cleaning procs waiting on this file system. */
+               lfs_wakeup_cleaner(fs);
+               lfs_newseg(fs);
+               repeat = 1;
+               fs->lfs_offset = fs->lfs_curseg;
+
+               sp->seg_number = dtosn(fs, fs->lfs_curseg);
+               sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg);
+
+               /*
+                * If the segment contains a superblock, update the offset
+                * and summary address to skip over it.
+                */
+               LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
+               if (sup->su_flags & SEGUSE_SUPERBLOCK) {
+                       fs->lfs_offset += btofsb(fs, LFS_SBPAD);
+                       sp->seg_bytes_left -= LFS_SBPAD;
+               }
+               brelse(bp, 0);
+               /* Segment zero could also contain the labelpad */
+               if (fs->lfs_version > 1 && sp->seg_number == 0 &&
+                   fs->lfs_start < btofsb(fs, LFS_LABELPAD)) {
+                       fs->lfs_offset +=
+                           btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
+                       sp->seg_bytes_left -=
+                           LFS_LABELPAD - fsbtob(fs, fs->lfs_start);
+               }
+       } else {
+               sp->seg_number = dtosn(fs, fs->lfs_curseg);
+               sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg -
+                                     (fs->lfs_offset - fs->lfs_curseg));
+       }
+       fs->lfs_lastpseg = fs->lfs_offset;
+
+       /* Record first address of this partial segment */
+       if (sp->seg_flags & SEGM_CLEAN) {
+               fs->lfs_cleanint[fs->lfs_cleanind] = fs->lfs_offset;
+               if (++fs->lfs_cleanind >= LFS_MAX_CLEANIND) {
+                       /* "1" is the artificial inc in lfs_seglock */
+                       mutex_enter(&lfs_lock);
+                       while (fs->lfs_iocount > 1) {
+                               mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+                                   "lfs_initseg", 0, &lfs_lock);
+                       }
+                       mutex_exit(&lfs_lock);
+                       fs->lfs_cleanind = 0;
+               }
+       }
+
+       sp->fs = fs;
+       sp->ibp = NULL;
+       sp->idp = NULL;
+       sp->ninodes = 0;
+       sp->ndupino = 0;
+
+       sp->cbpp = sp->bpp;
+
+       /* Get a new buffer for SEGSUM */
+       sbp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
+           fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY);
+
+       /* ... and enter it into the buffer list. */
+       *sp->cbpp = sbp;
+       sp->cbpp++;
+       fs->lfs_offset += btofsb(fs, fs->lfs_sumsize);
+
+       sp->start_bpp = sp->cbpp;
+
+       /* Set point to SEGSUM, initialize it. */
+       ssp = sp->segsum = sbp->b_data;
+       memset(ssp, 0, fs->lfs_sumsize);
+       ssp->ss_next = fs->lfs_nextseg;
+       ssp->ss_nfinfo = ssp->ss_ninos = 0;
+       ssp->ss_magic = SS_MAGIC;
+
+       /* Set pointer to first FINFO, initialize it. */
+       sp->fip = (struct finfo *)((char *)sp->segsum + SEGSUM_SIZE(fs));
+       sp->fip->fi_nblocks = 0;
+       sp->start_lbp = &sp->fip->fi_blocks[0];
+       sp->fip->fi_lastlength = 0;
+
+       sp->seg_bytes_left -= fs->lfs_sumsize;
+       sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);
+
+       return (repeat);
+}
+
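+/*
+ * Resulting partial-segment layout, as set up by lfs_initseg() above
+ * (a sketch): the summary block sits first, at what becomes
+ * lfs_lastpseg, with the SEGSUM at its head, the FINFO table growing
+ * up from SEGSUM_SIZE(fs), and the inode-block address table growing
+ * down from the end; the gathered data and inode blocks then follow,
+ * consuming seg_bytes_left.
+ */
+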
+/*
+ * Remove SEGUSE_INVAL from all segments.
+ */
+void
+lfs_unset_inval_all(struct lfs *fs)
+{
+       SEGUSE *sup;
+       struct buf *bp;
+       int i;
+
+       for (i = 0; i < fs->lfs_nseg; i++) {
+               LFS_SEGENTRY(sup, fs, i, bp);
+               if (sup->su_flags & SEGUSE_INVAL) {
+                       sup->su_flags &= ~SEGUSE_INVAL;
+                       LFS_WRITESEGENTRY(sup, fs, i, bp);
+               } else
+                       brelse(bp, 0);
+       }
+}
+
+/*
+ * Return the next segment to write.
+ */
+void
+lfs_newseg(struct lfs *fs)
+{
+       CLEANERINFO *cip;
+       SEGUSE *sup;
+       struct buf *bp;
+       int curseg, isdirty, sn, skip_inval;
+
+       ASSERT_SEGLOCK(fs);
+
+       /* Honor LFCNWRAPSTOP */
+       mutex_enter(&lfs_lock);
+       while (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
+               if (fs->lfs_wrappass) {
+                       log(LOG_NOTICE, "%s: wrappass=%d\n",
+                               fs->lfs_fsmnt, fs->lfs_wrappass);
+                       fs->lfs_wrappass = 0;
+                       break;
+               }
+               fs->lfs_wrapstatus = LFS_WRAP_WAITING;
+               wakeup(&fs->lfs_nowrap);
+               log(LOG_NOTICE, "%s: waiting at log wrap\n", fs->lfs_fsmnt);
+               mtsleep(&fs->lfs_wrappass, PVFS, "newseg", 10 * hz,
+                       &lfs_lock);
+       }
+       fs->lfs_wrapstatus = LFS_WRAP_GOING;
+       mutex_exit(&lfs_lock);
+
+       LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
+       DLOG((DLOG_SU, "lfs_newseg: seg %d := 0 in newseg\n",
+             dtosn(fs, fs->lfs_nextseg)));
+       sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
+       sup->su_nbytes = 0;
+       sup->su_nsums = 0;
+       sup->su_ninos = 0;
+       LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp);
+
+       LFS_CLEANERINFO(cip, fs, bp);
+       --cip->clean;
+       ++cip->dirty;
+       fs->lfs_nclean = cip->clean;
+       LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+       fs->lfs_lastseg = fs->lfs_curseg;
+       fs->lfs_curseg = fs->lfs_nextseg;
+       skip_inval = 1;
+       for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) {
+               sn = (sn + 1) % fs->lfs_nseg;
+
+               if (sn == curseg) {
+                       if (skip_inval)
+                               skip_inval = 0;
+                       else
+                               panic("lfs_nextseg: no clean segments");
+               }
+               LFS_SEGENTRY(sup, fs, sn, bp);
+               isdirty = sup->su_flags & (SEGUSE_DIRTY | (skip_inval ? SEGUSE_INVAL : 0));
+               /* Check SEGUSE_EMPTY as we go along */
+               if (isdirty && sup->su_nbytes == 0 &&
+                   !(sup->su_flags & SEGUSE_EMPTY))
+                       LFS_WRITESEGENTRY(sup, fs, sn, bp);
+               else
+                       brelse(bp, 0);
+
+               if (!isdirty)
+                       break;
+       }
+       if (skip_inval == 0)
+               lfs_unset_inval_all(fs);
+
+       ++fs->lfs_nactive;
+       fs->lfs_nextseg = sntod(fs, sn);
+       if (lfs_dostats) {
+               ++lfs_stats.segsused;
+       }
+}
+
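+/*
+ * Note on the search loop in lfs_newseg() above: scanning starts
+ * lfs_interleave segments past the new current segment and wraps
+ * modulo lfs_nseg.  The first full sweep skips segments marked
+ * SEGUSE_INVAL; only if it wraps without finding a clean segment does
+ * a second sweep accept them, and lfs_unset_inval_all() then clears
+ * the INVAL bits once a segment has been chosen.
+ */
+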
+static struct buf *
+lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr,
+    int n)
+{
+       struct lfs_cluster *cl;
+       struct buf **bpp, *bp;
+
+       ASSERT_SEGLOCK(fs);
+       cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK);
+       bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK);
+       memset(cl, 0, sizeof(*cl));
+       cl->fs = fs;
+       cl->bpp = bpp;
+       cl->bufcount = 0;
+       cl->bufsize = 0;
+
+       /* If this segment is being written synchronously, note that */
+       if (fs->lfs_sp->seg_flags & SEGM_SYNC) {
+               cl->flags |= LFS_CL_SYNC;
+               cl->seg = fs->lfs_sp;
+               ++cl->seg->seg_iocount;
+       }
+
+       /* Get an empty buffer header, or maybe one with something on it */
+       bp = getiobuf(vp, true);
+       bp->b_dev = NODEV;
+       bp->b_blkno = bp->b_lblkno = addr;
+       bp->b_iodone = lfs_cluster_callback;
+       bp->b_private = cl;
+
+       return bp;
+}
+
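+/*
+ * Note on lfs_newclusterbuf() above: for a synchronous segment write
+ * the cluster is tagged LFS_CL_SYNC and bumps seg_iocount, so that the
+ * completion path (lfs_cluster_callback) can decrement the count and
+ * wake anyone waiting for the partial segment's I/O to drain.
+ */
+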
+int
+lfs_writeseg(struct lfs *fs, struct segment *sp)
+{
+       struct buf **bpp, *bp, *cbp, *newbp, *unbusybp;
+       SEGUSE *sup;
+       SEGSUM *ssp;
+       int i;
+       int do_again, nblocks, byteoffset;
+       size_t el_size;
+       struct lfs_cluster *cl;
+       u_short ninos;
+       struct vnode *devvp;
+       char *p = NULL;
+       struct vnode *vp;
+       int32_t *daddrp;        /* XXX ondisk32 */
+       int changed;
+       u_int32_t sum;
+#ifdef DEBUG
+       FINFO *fip;
+       int findex;
+#endif
+
+       ASSERT_SEGLOCK(fs);
+
+       ssp = (SEGSUM *)sp->segsum;
+
+       /*
+        * If there are no buffers other than the segment summary to write,
+        * don't do anything.  If we are the end of a dirop sequence, however,
+        * write the empty segment summary anyway, to help out the
+        * roll-forward agent.
+        */
+       if ((nblocks = sp->cbpp - sp->bpp) == 1) {
+               if ((ssp->ss_flags & (SS_DIROP | SS_CONT)) != SS_DIROP)
+                       return 0;
+       }
+
+       /* Note if partial segment is being written by the cleaner */
+       if (sp->seg_flags & SEGM_CLEAN)
+               ssp->ss_flags |= SS_CLEAN;
+
+       devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+
+       /* Update the segment usage information. */
+       LFS_SEGENTRY(sup, fs, sp->seg_number, bp);
+
+       /* Loop through all blocks, except the segment summary. */
+       for (bpp = sp->bpp; ++bpp < sp->cbpp; ) {
+               if ((*bpp)->b_vp != devvp) {
+                       sup->su_nbytes += (*bpp)->b_bcount;
+                       DLOG((DLOG_SU, "seg %" PRIu32 " += %ld for ino %d"
+                             " lbn %" PRId64 " db 0x%" PRIx64 "\n",
+                             sp->seg_number, (*bpp)->b_bcount,
+                             VTOI((*bpp)->b_vp)->i_number, (*bpp)->b_lblkno,
+                             (*bpp)->b_blkno));
+               }
+       }
+
+#ifdef DEBUG
+       /* Check for zero-length and zero-version FINFO entries. */
+       fip = (struct finfo *)((char *)ssp + SEGSUM_SIZE(fs));
+       for (findex = 0; findex < ssp->ss_nfinfo; findex++) {
+               KDASSERT(fip->fi_nblocks > 0);
+               KDASSERT(fip->fi_version > 0);
+               fip = (FINFO *)((char *)fip + FINFOSIZE +
+                       sizeof(int32_t) * fip->fi_nblocks);
+       }
+#endif /* DEBUG */
+
+       ninos = (ssp->ss_ninos + INOPB(fs) - 1) / INOPB(fs);
+       DLOG((DLOG_SU, "seg %d += %d for %d inodes\n",
+             sp->seg_number, ssp->ss_ninos * sizeof (struct ufs1_dinode),
+             ssp->ss_ninos));
+       sup->su_nbytes += ssp->ss_ninos * sizeof (struct ufs1_dinode);
+       /* sup->su_nbytes += fs->lfs_sumsize; */
+       if (fs->lfs_version == 1)
+               sup->su_olastmod = time_second;
+       else
+               sup->su_lastmod = time_second;
+       sup->su_ninos += ninos;
+       ++sup->su_nsums;
+       fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);
+
+       do_again = !(bp->b_flags & B_GATHERED);
+       LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */
+
+       /*
+        * Mark blocks BC_BUSY, to prevent them from being changed between
+        * the checksum computation and the actual write.
+        *
+        * If we are cleaning, check indirect blocks for UNWRITTEN
+        * entries, and if there are any, replace the buffer with a copy
+        * in which those entries are zeroed out.
+        */
+       mutex_enter(&bufcache_lock);
+       for (bpp = sp->bpp, i = nblocks - 1; i--;) {
+               ++bpp;
+               bp = *bpp;
+               if (bp->b_iodone != NULL) {      /* UBC or malloced buffer */
+                       bp->b_cflags |= BC_BUSY;
+                       continue;
+               }
+
+               while (bp->b_cflags & BC_BUSY) {
+                       DLOG((DLOG_SEG, "lfs_writeseg: avoiding potential"
+                             " data summary corruption for ino %d, lbn %"
+                             PRId64 "\n",
+                             VTOI(bp->b_vp)->i_number, bp->b_lblkno));
+                       bp->b_cflags |= BC_WANTED;
+                       cv_wait(&bp->b_busy, &bufcache_lock);
+               }
+               bp->b_cflags |= BC_BUSY;
+               mutex_exit(&bufcache_lock);
+               unbusybp = NULL;
+
+               /*
+                * Check and replace indirect block UNWRITTEN bogosity.
+                * XXX See comment in lfs_writefile.
+                */
+               if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp &&
+                  VTOI(bp->b_vp)->i_ffs1_blocks !=
+                  VTOI(bp->b_vp)->i_lfs_effnblks) {
+                       DLOG((DLOG_VNODE, "lfs_writeseg: cleansing ino %d (%d != %d)\n",
+                             VTOI(bp->b_vp)->i_number,
+                             VTOI(bp->b_vp)->i_lfs_effnblks,
+                             VTOI(bp->b_vp)->i_ffs1_blocks));
+                       /* Make a copy we'll make changes to */
+                       newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno,
+                                          bp->b_bcount, LFS_NB_IBLOCK);
+                       newbp->b_blkno = bp->b_blkno;
+                       memcpy(newbp->b_data, bp->b_data,
+                              newbp->b_bcount);
+
+                       changed = 0;
+                       /* XXX ondisk32 */
+                       for (daddrp = (int32_t *)(newbp->b_data);
+                            daddrp < (int32_t *)((char *)newbp->b_data +
+                                                 newbp->b_bcount); daddrp++) {
+                               if (*daddrp == UNWRITTEN) {
+                                       ++changed;
+                                       *daddrp = 0;
+                               }
+                       }
+                       /*
+                        * Get rid of the old buffer.  Don't mark it clean,
+                        * though, if it still has dirty data on it.
+                        */
+                       if (changed) {
+                               DLOG((DLOG_SEG, "lfs_writeseg: replacing UNWRITTEN(%d):"
+                                     " bp = %p newbp = %p\n", changed, bp,
+                                     newbp));
+                               *bpp = newbp;
+                               bp->b_flags &= ~B_GATHERED;
+                               bp->b_error = 0;
+                               if (bp->b_iodone != NULL) {
+                                       DLOG((DLOG_SEG, "lfs_writeseg: "
+                                             "indir bp should not be B_CALL\n"));
+                                       biodone(bp);
+                                       bp = NULL;
+                               } else {
+                                       /* Still on free list, leave it there */
+                                       unbusybp = bp;
+                                       /*
+                                        * We have to re-decrement lfs_avail
+                                        * since this block is going to come
+                                        * back around to us in the next
+                                        * segment.
+                                        */
+                                       fs->lfs_avail -=
+                                           btofsb(fs, bp->b_bcount);
+                               }
+                       } else {
+                               lfs_freebuf(fs, newbp);
+                       }
+               }
+               mutex_enter(&bufcache_lock);
+               if (unbusybp != NULL) {
+                       unbusybp->b_cflags &= ~BC_BUSY;
+                       if (unbusybp->b_cflags & BC_WANTED)
+                               cv_broadcast(&bp->b_busy);
+               }
+       }
+       mutex_exit(&bufcache_lock);
+
+       /*
+        * Compute checksum across data and then across summary; the first
+        * block (the summary block) is skipped.  Set the create time here
+        * so that it's guaranteed to be later than the inode mod times.
+        */
+       sum = 0;
+       if (fs->lfs_version == 1)
+               el_size = sizeof(u_long);
+       else
+               el_size = sizeof(u_int32_t);
+       for (bpp = sp->bpp, i = nblocks - 1; i--; ) {
+               ++bpp;
+               /* Loop through gop_write cluster blocks */
+               for (byteoffset = 0; byteoffset < (*bpp)->b_bcount;
+                    byteoffset += fs->lfs_bsize) {
+#ifdef LFS_USE_B_INVAL
+                       if (((*bpp)->b_cflags & BC_INVAL) != 0 &&
+                           (*bpp)->b_iodone != NULL) {
+                               if (copyin((void *)(*bpp)->b_saveaddr +
+                                          byteoffset, dp, el_size)) {
+                                       panic("lfs_writeseg: copyin failed [1]:"
+                                               " ino %d blk %" PRId64,
+                                               VTOI((*bpp)->b_vp)->i_number,
+                                               (*bpp)->b_lblkno);
+                               }
+                       } else
+#endif /* LFS_USE_B_INVAL */
+                       {
+                               sum = lfs_cksum_part((char *)
+                                   (*bpp)->b_data + byteoffset, el_size, sum);
+                       }
+               }
+       }
+       if (fs->lfs_version == 1)
+               ssp->ss_ocreate = time_second;
+       else {
+               ssp->ss_create = time_second;
+               ssp->ss_serial = ++fs->lfs_serial;
+               ssp->ss_ident  = fs->lfs_ident;
+       }
+       ssp->ss_datasum = lfs_cksum_fold(sum);
+       ssp->ss_sumsum = cksum(&ssp->ss_datasum,
+           fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
+
+       mutex_enter(&lfs_lock);
+       fs->lfs_bfree -= (btofsb(fs, ninos * fs->lfs_ibsize) +
+                         btofsb(fs, fs->lfs_sumsize));
+       fs->lfs_dmeta += (btofsb(fs, ninos * fs->lfs_ibsize) +
+                         btofsb(fs, fs->lfs_sumsize));
+       mutex_exit(&lfs_lock);
+
+       /*
+        * When we simply write the blocks we lose a rotation for every block
+        * written.  To avoid this problem, we cluster the buffers into a
+        * chunk and write the chunk.  MAXPHYS is the largest size an I/O
+        * device can handle, so use that for the size of the chunks.
+        *
+        * Blocks that are already clusters (from GOP_WRITE), however, we
+        * don't bother to copy into other clusters.
+        */
+
+#define CHUNKSIZE MAXPHYS
+
+       if (devvp == NULL)
+               panic("devvp is NULL");
+       for (bpp = sp->bpp, i = nblocks; i;) {
+               cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
+               cl = cbp->b_private;
+
+               cbp->b_flags |= B_ASYNC;
+               cbp->b_cflags |= BC_BUSY;
+               cbp->b_bcount = 0;
+
+#if defined(DEBUG) && defined(DIAGNOSTIC)
+               if (bpp - sp->bpp > (fs->lfs_sumsize - SEGSUM_SIZE(fs))
+                   / sizeof(int32_t)) {
+                       panic("lfs_writeseg: real bpp overwrite");
+               }
+               if (bpp - sp->bpp > segsize(fs) / fs->lfs_fsize) {
+                       panic("lfs_writeseg: theoretical bpp overwrite");
+               }
+#endif
+
+               /*
+                * Construct the cluster.
+                */
+               mutex_enter(&lfs_lock);
+               ++fs->lfs_iocount;
+               mutex_exit(&lfs_lock);
+               while (i && cbp->b_bcount < CHUNKSIZE) {
+                       bp = *bpp;
+
+                       if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount))
+                               break;
+                       if (cbp->b_bcount > 0 && !(cl->flags & LFS_CL_MALLOC))
+                               break;
+
+                       /* Clusters from GOP_WRITE are expedited */
+                       if (bp->b_bcount > fs->lfs_bsize) {
+                               if (cbp->b_bcount > 0)
+                                       /* Put in its own buffer */
+                                       break;
+                               else {
+                                       cbp->b_data = bp->b_data;
+                               }
+                       } else if (cbp->b_bcount == 0) {
+                               p = cbp->b_data = lfs_malloc(fs, CHUNKSIZE,
+                                                            LFS_NB_CLUSTER);
+                               cl->flags |= LFS_CL_MALLOC;
+                       }
+#ifdef DIAGNOSTIC
+                       if (dtosn(fs, dbtofsb(fs, bp->b_blkno +
+                                             btodb(bp->b_bcount - 1))) !=
+                           sp->seg_number) {
+                               printf("blk size %d daddr %" PRIx64
+                                   " not in seg %d\n",
+                                   bp->b_bcount, bp->b_blkno,
+                                   sp->seg_number);
+                               panic("segment overwrite");
+                       }
+#endif
+
+#ifdef LFS_USE_B_INVAL
+                       /*
+                        * Fake buffers from the cleaner are marked as B_INVAL.
+                        * We need to copy the data from user space rather than
+                        * from the buffer indicated.
+                        * XXX == what do I do on an error?
+                        */
+                       if ((bp->b_cflags & BC_INVAL) != 0 &&
+                           bp->b_iodone != NULL) {
+                               if (copyin(bp->b_saveaddr, p, bp->b_bcount))
+                                       panic("lfs_writeseg: "
+                                           "copyin failed [2]");
+                       } else
+#endif /* LFS_USE_B_INVAL */
+                       if (cl->flags & LFS_CL_MALLOC) {
+                               /* copy data into our cluster. */
+                               memcpy(p, bp->b_data, bp->b_bcount);
+                               p += bp->b_bcount;
+                       }
+
+                       cbp->b_bcount += bp->b_bcount;
+                       cl->bufsize += bp->b_bcount;
+
+                       bp->b_flags &= ~B_READ;
+                       bp->b_error = 0;
+                       cl->bpp[cl->bufcount++] = bp;
+
+                       vp = bp->b_vp;
+                       mutex_enter(&bufcache_lock);
+                       mutex_enter(vp->v_interlock);
+                       bp->b_oflags &= ~(BO_DELWRI | BO_DONE);
+                       reassignbuf(bp, vp);
+                       vp->v_numoutput++;
+                       mutex_exit(vp->v_interlock);
+                       mutex_exit(&bufcache_lock);
+
+                       bpp++;
+                       i--;
+               }
+               if (fs->lfs_sp->seg_flags & SEGM_SYNC)
+                       BIO_SETPRIO(cbp, BPRIO_TIMECRITICAL);
+               else
+                       BIO_SETPRIO(cbp, BPRIO_TIMELIMITED);
+               mutex_enter(devvp->v_interlock);
+               devvp->v_numoutput++;
+               mutex_exit(devvp->v_interlock);
+               VOP_STRATEGY(devvp, cbp);
+               curlwp->l_ru.ru_oublock++;
+       }
+
+       if (lfs_dostats) {
+               ++lfs_stats.psegwrites;
+               lfs_stats.blocktot += nblocks - 1;
+               if (fs->lfs_sp->seg_flags & SEGM_SYNC)
+                       ++lfs_stats.psyncwrites;
+               if (fs->lfs_sp->seg_flags & SEGM_CLEAN) {
+                       ++lfs_stats.pcleanwrites;
+                       lfs_stats.cleanblocks += nblocks - 1;
+               }
+       }
+
+       return (lfs_initseg(fs) || do_again);
+}
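+
+/*
+ * Illustrative sketch, not from the imported source: the data checksum
+ * above covers only the first el_size bytes of each filesystem block,
+ * accumulated with lfs_cksum_part() and folded once by lfs_cksum_fold().
+ * A minimal model of that accumulate/fold pattern could look like the
+ * following; the real routines live in lfs_cksum.c and may differ in
+ * detail.
+ */
+#if 0  /* sketch only */
+static uint32_t
+sketch_cksum_part(const void *buf, size_t len, uint32_t sum)
+{
+       const uint8_t *p = buf;
+
+       /* Accumulate bytes into the running 32-bit sum. */
+       while (len-- > 0)
+               sum += *p++;
+       return sum;
+}
+
+static uint32_t
+sketch_cksum_fold(uint32_t sum)
+{
+       /* Fold the high half into the low half for the final value. */
+       return (sum >> 16) + (sum & 0xffff);
+}
+#endif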
+
+void
+lfs_writesuper(struct lfs *fs, daddr_t daddr)
+{
+       struct buf *bp;
+       struct vnode *devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+       int s;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+#ifdef DIAGNOSTIC
+       KASSERT(fs->lfs_magic == LFS_MAGIC);
+#endif
+       /*
+        * If we can write one superblock while another is in
+        * progress, we risk not having a complete checkpoint if we crash.
+        * So, block here if a superblock write is in progress.
+        */
+       mutex_enter(&lfs_lock);
+       s = splbio();
+       while (fs->lfs_sbactive) {
+               mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs sb", 0,
+                       &lfs_lock);
+       }
+       fs->lfs_sbactive = daddr;
+       splx(s);
+       mutex_exit(&lfs_lock);
+
+       /* Set timestamp of this version of the superblock */
+       if (fs->lfs_version == 1)
+               fs->lfs_otstamp = time_second;
+       fs->lfs_tstamp = time_second;
+
+       /* Checksum the superblock and copy it into a buffer. */
+       fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs));
+       bp = lfs_newbuf(fs, devvp,
+           fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK);
+       memset((char *)bp->b_data + sizeof(struct dlfs), 0,
+           LFS_SBPAD - sizeof(struct dlfs));
+       *(struct dlfs *)bp->b_data = fs->lfs_dlfs;
+
+       bp->b_cflags |= BC_BUSY;
+       bp->b_flags = (bp->b_flags & ~B_READ) | B_ASYNC;
+       bp->b_oflags &= ~(BO_DONE | BO_DELWRI);
+       bp->b_error = 0;
+       bp->b_iodone = lfs_supercallback;
+
+       if (fs->lfs_sp != NULL && fs->lfs_sp->seg_flags & SEGM_SYNC)
+               BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+       else
+               BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
+       curlwp->l_ru.ru_oublock++;
+
+       mutex_enter(devvp->v_interlock);
+       devvp->v_numoutput++;
+       mutex_exit(devvp->v_interlock);
+
+       mutex_enter(&lfs_lock);
+       ++fs->lfs_iocount;
+       mutex_exit(&lfs_lock);
+       VOP_STRATEGY(devvp, bp);
+}
+
+/*
+ * Logical block number match routines used when traversing the dirty block
+ * chain.
+ */
+int
+lfs_match_fake(struct lfs *fs, struct buf *bp)
+{
+
+       ASSERT_SEGLOCK(fs);
+       return LFS_IS_MALLOC_BUF(bp);
+}
+
+#if 0
+int
+lfs_match_real(struct lfs *fs, struct buf *bp)
+{
+
+       ASSERT_SEGLOCK(fs);
+       return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp));
+}
+#endif
+
+int
+lfs_match_data(struct lfs *fs, struct buf *bp)
+{
+
+       ASSERT_SEGLOCK(fs);
+       return (bp->b_lblkno >= 0);
+}
+
+int
+lfs_match_indir(struct lfs *fs, struct buf *bp)
+{
+       daddr_t lbn;
+
+       ASSERT_SEGLOCK(fs);
+       lbn = bp->b_lblkno;
+       return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 0);
+}
+
+int
+lfs_match_dindir(struct lfs *fs, struct buf *bp)
+{
+       daddr_t lbn;
+
+       ASSERT_SEGLOCK(fs);
+       lbn = bp->b_lblkno;
+       return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 1);
+}
+
+int
+lfs_match_tindir(struct lfs *fs, struct buf *bp)
+{
+       daddr_t lbn;
+
+       ASSERT_SEGLOCK(fs);
+       lbn = bp->b_lblkno;
+       return (lbn < 0 && (-lbn - NDADDR) % NINDIR(fs) == 2);
+}
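+
+/*
+ * Illustrative note, not from the imported source: the three match
+ * routines above rely on indirect blocks carrying negative logical
+ * block numbers.  With hypothetical values NDADDR = 12 and
+ * NINDIR(fs) = 2048, lbn -12 gives (-lbn - NDADDR) % NINDIR == 0
+ * (single indirect), lbn -13 gives remainder 1 (double indirect), and
+ * lbn -14 gives remainder 2 (triple indirect), with the pattern
+ * repeating every NINDIR blocks further down the metadata chain.
+ */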
+
+static void
+lfs_free_aiodone(struct buf *bp)
+{
+       struct lfs *fs;
+
+       KERNEL_LOCK(1, curlwp);
+       fs = bp->b_private;
+       ASSERT_NO_SEGLOCK(fs);
+       lfs_freebuf(fs, bp);
+       KERNEL_UNLOCK_LAST(curlwp);
+}
+
+static void
+lfs_super_aiodone(struct buf *bp)
+{
+       struct lfs *fs;
+
+       KERNEL_LOCK(1, curlwp);
+       fs = bp->b_private;
+       ASSERT_NO_SEGLOCK(fs);
+       mutex_enter(&lfs_lock);
+       fs->lfs_sbactive = 0;
+       if (--fs->lfs_iocount <= 1)
+               wakeup(&fs->lfs_iocount);
+       wakeup(&fs->lfs_sbactive);
+       mutex_exit(&lfs_lock);
+       lfs_freebuf(fs, bp);
+       KERNEL_UNLOCK_LAST(curlwp);
+}
+
+static void
+lfs_cluster_aiodone(struct buf *bp)
+{
+       struct lfs_cluster *cl;
+       struct lfs *fs;
+       struct buf *tbp, *fbp;
+       struct vnode *vp, *devvp, *ovp;
+       struct inode *ip;
+       int error;
+
+       KERNEL_LOCK(1, curlwp);
+
+       error = bp->b_error;
+       cl = bp->b_private;
+       fs = cl->fs;
+       devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+       ASSERT_NO_SEGLOCK(fs);
+
+       /* Put the pages back, and release the buffer */
+       while (cl->bufcount--) {
+               tbp = cl->bpp[cl->bufcount];
+               KASSERT(tbp->b_cflags & BC_BUSY);
+               if (error) {
+                       tbp->b_error = error;
+               }
+
+               /*
+                * We're done with tbp.  If it has not been re-dirtied since
+                * the cluster was written, free it.  Otherwise, keep it on
+                * the locked list to be written again.
+                */
+               vp = tbp->b_vp;
+
+               tbp->b_flags &= ~B_GATHERED;
+
+               LFS_BCLEAN_LOG(fs, tbp);
+
+               mutex_enter(&bufcache_lock);
+               if (tbp->b_iodone == NULL) {
+                       KASSERT(tbp->b_flags & B_LOCKED);
+                       bremfree(tbp);
+                       if (vp) {
+                               mutex_enter(vp->v_interlock);
+                               reassignbuf(tbp, vp);
+                               mutex_exit(vp->v_interlock);
+                       }
+                       tbp->b_flags |= B_ASYNC; /* for biodone */
+               }
+
+               if (((tbp->b_flags | tbp->b_oflags) &
+                   (B_LOCKED | BO_DELWRI)) == B_LOCKED)
+                       LFS_UNLOCK_BUF(tbp);
+
+               if (tbp->b_oflags & BO_DONE) {
+                       DLOG((DLOG_SEG, "blk %d biodone already (flags %lx)\n",
+                               cl->bufcount, (long)tbp->b_flags));
+               }
+
+               if (tbp->b_iodone != NULL && !LFS_IS_MALLOC_BUF(tbp)) {
+                       /*
+                        * A buffer from the page daemon.
+                        * We use the same iodone as it does,
+                        * so we must manually disassociate its
+                        * buffers from the vp.
+                        */
+                       if ((ovp = tbp->b_vp) != NULL) {
+                               /* This is just silly */
+                               mutex_enter(ovp->v_interlock);
+                               brelvp(tbp);
+                               mutex_exit(ovp->v_interlock);
+                               tbp->b_vp = vp;
+                               tbp->b_objlock = vp->v_interlock;
+                       }
+                       /* Put it back the way it was */
+                       tbp->b_flags |= B_ASYNC;
+                       /* Master buffers have BC_AGE */
+                       if (tbp->b_private == tbp)
+                               tbp->b_cflags |= BC_AGE;
+               }
+               mutex_exit(&bufcache_lock);
+
+               biodone(tbp);
+
+               /*
+                * If this is the last block for this vnode, but
+                * there are other blocks on its dirty list,
+                * set IN_MODIFIED/IN_CLEANING depending on what
+                * sort of block.  Only do this for our mount point,
+                * not for, e.g., inode blocks that are attached to
+                * the devvp.
+                * XXX KS - Shouldn't we set *both* if both types
+                * of blocks are present (traverse the dirty list?)
+                */
+               mutex_enter(&lfs_lock);
+               mutex_enter(vp->v_interlock);
+               if (vp != devvp && vp->v_numoutput == 0 &&
+                   (fbp = LIST_FIRST(&vp->v_dirtyblkhd)) != NULL) {
+                       ip = VTOI(vp);
+                       DLOG((DLOG_SEG, "lfs_cluster_aiodone: mark ino %d\n",
+                              ip->i_number));
+                       if (LFS_IS_MALLOC_BUF(fbp))
+                               LFS_SET_UINO(ip, IN_CLEANING);
+                       else
+                               LFS_SET_UINO(ip, IN_MODIFIED);
+               }
+               cv_broadcast(&vp->v_cv);
+               mutex_exit(vp->v_interlock);
+               mutex_exit(&lfs_lock);
+       }
+
+       /* Fix up the cluster buffer, and release it */
+       if (cl->flags & LFS_CL_MALLOC)
+               lfs_free(fs, bp->b_data, LFS_NB_CLUSTER);
+       putiobuf(bp);
+
+       /* Note i/o done */
+       if (cl->flags & LFS_CL_SYNC) {
+               if (--cl->seg->seg_iocount == 0)
+                       wakeup(&cl->seg->seg_iocount);
+       }
+       mutex_enter(&lfs_lock);
+#ifdef DIAGNOSTIC
+       if (fs->lfs_iocount == 0)
+               panic("lfs_cluster_aiodone: zero iocount");
+#endif
+       if (--fs->lfs_iocount <= 1)
+               wakeup(&fs->lfs_iocount);
+       mutex_exit(&lfs_lock);
+
+       KERNEL_UNLOCK_LAST(curlwp);
+
+       pool_put(&fs->lfs_bpppool, cl->bpp);
+       cl->bpp = NULL;
+       pool_put(&fs->lfs_clpool, cl);
+}
+
+static void
+lfs_generic_callback(struct buf *bp, void (*aiodone)(struct buf *))
+{
+       /* reset b_iodone for when this is a single-buf i/o. */
+       bp->b_iodone = aiodone;
+
+       workqueue_enqueue(uvm.aiodone_queue, &bp->b_work, NULL);
+}
+
+static void
+lfs_cluster_callback(struct buf *bp)
+{
+
+       lfs_generic_callback(bp, lfs_cluster_aiodone);
+}
+
+void
+lfs_supercallback(struct buf *bp)
+{
+
+       lfs_generic_callback(bp, lfs_super_aiodone);
+}
+
+/*
+ * The only buffers that are going to hit these functions are the
+ * segment write blocks, or the segment summaries, or the superblocks.
+ *
+ * All of the above are created by lfs_newbuf, and so do not need to be
+ * released via brelse.
+ */
+void
+lfs_callback(struct buf *bp)
+{
+
+       lfs_generic_callback(bp, lfs_free_aiodone);
+}
+
+/*
+ * Shellsort (diminishing increment sort) from Data Structures and
+ * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
+ * see also Knuth Vol. 3, page 84.  The increments are selected from
+ * formula (8), page 95.  Roughly O(N^3/2).
+ */
+/*
+ * This is our own private copy of shellsort because we want to sort
+ * two parallel arrays (the array of buffer pointers and the array of
+ * logical block numbers) simultaneously.  Note that we cast the array
+ * of logical block numbers to an unsigned in this routine so that the
+ * negative block numbers (metadata blocks) sort AFTER the data blocks.
+ */
+
+void
+lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
+{
+       static int __rsshell_increments[] = { 4, 1, 0 };
+       int incr, *incrp, t1, t2;
+       struct buf *bp_temp;
+
+#ifdef DEBUG
+       incr = 0;
+       for (t1 = 0; t1 < nmemb; t1++) {
+               for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
+                       if (lb_array[incr++] != bp_array[t1]->b_lblkno + t2) {
+                               /* dump before panic */
+                               printf("lfs_shellsort: nmemb=%d, size=%d\n",
+                                   nmemb, size);
+                               incr = 0;
+                               for (t1 = 0; t1 < nmemb; t1++) {
+                                       const struct buf *bp = bp_array[t1];
+
+                                       printf("bp[%d]: lbn=%" PRIu64 ", size=%"
+                                           PRIu64 "\n", t1,
+                                           (uint64_t)bp->b_lblkno,
+                                           (uint64_t)bp->b_bcount);
+                                       printf("lbns:");
+                                       for (t2 = 0; t2 * size < bp->b_bcount;
+                                           t2++) {
+                                               printf(" %" PRId32,
+                                                   lb_array[incr++]);
+                                       }
+                                       printf("\n");
+                               }
+                               panic("lfs_shellsort: inconsistent input");
+                       }
+               }
+       }
+#endif
+
+       for (incrp = __rsshell_increments; (incr = *incrp++) != 0;)
+               for (t1 = incr; t1 < nmemb; ++t1)
+                       for (t2 = t1 - incr; t2 >= 0;)
+                               if ((u_int32_t)bp_array[t2]->b_lblkno >
+                                   (u_int32_t)bp_array[t2 + incr]->b_lblkno) {
+                                       bp_temp = bp_array[t2];
+                                       bp_array[t2] = bp_array[t2 + incr];
+                                       bp_array[t2 + incr] = bp_temp;
+                                       t2 -= incr;
+                               } else
+                                       break;
+
+       /* Reform the list of logical blocks */
+       incr = 0;
+       for (t1 = 0; t1 < nmemb; t1++) {
+               for (t2 = 0; t2 * size < bp_array[t1]->b_bcount; t2++) {
+                       lb_array[incr++] = bp_array[t1]->b_lblkno + t2;
+               }
+       }
+}
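+
+/*
+ * Worked example, not from the imported source: the u_int32_t casts
+ * above are what push metadata behind data.  For a data block at
+ * lbn 5 and an indirect block at lbn -12,
+ *
+ *     (u_int32_t)5   == 0x00000005
+ *     (u_int32_t)-12 == 0xfffffff4
+ *
+ * so the negative (metadata) lbn compares greater and sorts last.
+ */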
+
+/*
+ * Call vget with LK_NOWAIT.  If we are the one who holds VI_XLOCK,
+ * however, we must press on.  Just fake success in that case.
+ */
+int
+lfs_vref(struct vnode *vp)
+{
+       int error;
+       struct lfs *fs;
+
+       KASSERT(mutex_owned(vp->v_interlock));
+
+       fs = VTOI(vp)->i_lfs;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+
+       /*
+        * If we return 1 here during a flush, we risk vinvalbuf() not
+        * being able to flush all of the pages from this vnode, which
+        * will cause it to panic.  So, return 0 if a flush is in progress.
+        */
+       error = vget(vp, LK_NOWAIT);
+       if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
+               ++fs->lfs_flushvp_fakevref;
+               return 0;
+       }
+       return error;
+}
+
+/*
+ * This is vrele except that we do not want to VOP_INACTIVE this vnode. We
+ * inline vrele here to avoid the vn_lock and VOP_INACTIVE call at the end.
+ */
+void
+lfs_vunref(struct vnode *vp)
+{
+       struct lfs *fs;
+
+       fs = VTOI(vp)->i_lfs;
+       ASSERT_MAYBE_SEGLOCK(fs);
+
+       /*
+        * Analogous to lfs_vref, if the node is flushing, fake it.
+        */
+       if (IS_FLUSHING(fs, vp) && fs->lfs_flushvp_fakevref) {
+               --fs->lfs_flushvp_fakevref;
+               return;
+       }
+
+       /* does not call inactive */
+       mutex_enter(vp->v_interlock);
+       vrelel(vp, 0);
+}
+
+/*
+ * We use this when we have vnodes that were loaded in solely for cleaning.
+ * There is no reason to believe that these vnodes will be referenced again
+ * soon, since the cleaning process is unrelated to normal filesystem
+ * activity.  Putting cleaned vnodes at the tail of the list has the effect
+ * of flushing the vnode LRU.  So, put vnodes that were loaded only for
+ * cleaning at the head of the list, instead.
+ */
+void
+lfs_vunref_head(struct vnode *vp)
+{
+
+       ASSERT_SEGLOCK(VTOI(vp)->i_lfs);
+
+       /* does not call inactive, inserts non-held vnode at head of freelist */
+       mutex_enter(vp->v_interlock);
+       vrelel(vp, 0);
+}
+
+
+/*
+ * Set up an FINFO entry for a new file.  The fip pointer is assumed to 
+ * point at uninitialized space.
+ */
+void
+lfs_acquire_finfo(struct lfs *fs, ino_t ino, int vers)
+{
+       struct segment *sp = fs->lfs_sp;
+
+       KASSERT(vers > 0);
+
+       if (sp->seg_bytes_left < fs->lfs_bsize ||
+           sp->sum_bytes_left < sizeof(struct finfo))
+               (void) lfs_writeseg(fs, fs->lfs_sp);
+       
+       sp->sum_bytes_left -= FINFOSIZE;
+       ++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+       sp->fip->fi_nblocks = 0;
+       sp->fip->fi_ino = ino;
+       sp->fip->fi_version = vers;
+}
+
+/*
+ * Release the FINFO entry, either clearing out an unused entry or
+ * advancing us to the next available entry.
+ */
+void
+lfs_release_finfo(struct lfs *fs)
+{
+       struct segment *sp = fs->lfs_sp;
+
+       if (sp->fip->fi_nblocks != 0) {
+               sp->fip = (FINFO*)((char *)sp->fip + FINFOSIZE +
+                       sizeof(int32_t) * sp->fip->fi_nblocks);
+               sp->start_lbp = &sp->fip->fi_blocks[0];
+       } else {
+               sp->sum_bytes_left += FINFOSIZE;
+               --((SEGSUM *)(sp->segsum))->ss_nfinfo;
+       }
+}
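+
+/*
+ * Illustrative note, not from the imported source: the pointer
+ * arithmetic above mirrors the segment-summary layout, in which each
+ * FINFO header is followed immediately by fi_nblocks 32-bit block
+ * numbers:
+ *
+ *     next = (FINFO *)((char *)fip + FINFOSIZE +
+ *         sizeof(int32_t) * fip->fi_nblocks);
+ *
+ * With a hypothetical FINFOSIZE of 16 bytes and three blocks recorded,
+ * the next entry would begin 16 + 3*4 = 28 bytes further on.
+ */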
diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c
new file mode 100644 (file)
index 0000000..4da38aa
--- /dev/null
@@ -0,0 +1,661 @@
+/*     $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $    */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_subr.c  8.4 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/mount.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+#ifdef DEBUG
+const char *lfs_res_names[LFS_NB_COUNT] = {
+       "summary",
+       "superblock",
+       "file block",
+       "cluster",
+       "clean",
+       "blkiov",
+};
+#endif
+
+int lfs_res_qty[LFS_NB_COUNT] = {
+       LFS_N_SUMMARIES,
+       LFS_N_SBLOCKS,
+       LFS_N_IBLOCKS,
+       LFS_N_CLUSTERS,
+       LFS_N_CLEAN,
+       LFS_N_BLKIOV,
+};
+
+void
+lfs_setup_resblks(struct lfs *fs)
+{
+       int i, j;
+       int maxbpp;
+
+       ASSERT_NO_SEGLOCK(fs);
+       fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT,
+                                         M_WAITOK);
+       for (i = 0; i < LFS_N_TOTAL; i++) {
+               fs->lfs_resblk[i].inuse = 0;
+               fs->lfs_resblk[i].p = NULL;
+       }
+       for (i = 0; i < LFS_RESHASH_WIDTH; i++)
+               LIST_INIT(fs->lfs_reshash + i);
+
+       /*
+        * These types of allocations can be larger than a page,
+        * so we can't use the pool subsystem for them.
+        */
+       for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++)
+               fs->lfs_resblk[i].size = fs->lfs_sumsize;
+       for (j = 0; j < LFS_N_SBLOCKS; j++, i++)
+               fs->lfs_resblk[i].size = LFS_SBPAD;
+       for (j = 0; j < LFS_N_IBLOCKS; j++, i++)
+               fs->lfs_resblk[i].size = fs->lfs_bsize;
+       for (j = 0; j < LFS_N_CLUSTERS; j++, i++)
+               fs->lfs_resblk[i].size = MAXPHYS;
+       for (j = 0; j < LFS_N_CLEAN; j++, i++)
+               fs->lfs_resblk[i].size = MAXPHYS;
+       for (j = 0; j < LFS_N_BLKIOV; j++, i++)
+               fs->lfs_resblk[i].size = LFS_MARKV_MAXBLKCNT * sizeof(BLOCK_INFO);
+
+       for (i = 0; i < LFS_N_TOTAL; i++) {
+               fs->lfs_resblk[i].p = malloc(fs->lfs_resblk[i].size,
+                                            M_SEGMENT, M_WAITOK);
+       }
+
+       /*
+        * Initialize pools for small types (XXX is BPP small?)
+        */
+       pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, 0,
+               "lfsclpl", &pool_allocator_nointr, IPL_NONE);
+       pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, 0,
+               "lfssegpool", &pool_allocator_nointr, IPL_NONE);
+       maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2);
+       maxbpp = MIN(maxbpp, segsize(fs) / fs->lfs_fsize + 2);
+       pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, 0,
+               "lfsbpppl", &pool_allocator_nointr, IPL_NONE);
+}
+
+void
+lfs_free_resblks(struct lfs *fs)
+{
+       int i;
+
+       pool_destroy(&fs->lfs_bpppool);
+       pool_destroy(&fs->lfs_segpool);
+       pool_destroy(&fs->lfs_clpool);
+
+       mutex_enter(&lfs_lock);
+       for (i = 0; i < LFS_N_TOTAL; i++) {
+               while (fs->lfs_resblk[i].inuse)
+                       mtsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0,
+                               &lfs_lock);
+               if (fs->lfs_resblk[i].p != NULL)
+                       free(fs->lfs_resblk[i].p, M_SEGMENT);
+       }
+       free(fs->lfs_resblk, M_SEGMENT);
+       mutex_exit(&lfs_lock);
+}
+
+static unsigned int
+lfs_mhash(void *vp)
+{
+       return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH;
+}
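+
+/*
+ * Illustrative note, not from the imported source: the >> 2 discards
+ * low address bits that are constant for aligned allocations before
+ * reducing modulo the table width.  For a hypothetical pointer 0x1000
+ * and a hash width of 17, the bucket would be
+ * (0x1000 >> 2) % 17 == 1024 % 17 == 4.
+ */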
+
+/*
+ * Return memory of the given size for the given purpose, or use one of a
+ * number of spare last-resort buffers, if malloc returns NULL.
+ */
+void *
+lfs_malloc(struct lfs *fs, size_t size, int type)
+{
+       struct lfs_res_blk *re;
+       void *r;
+       int i, s, start;
+       unsigned int h;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       r = NULL;
+
+       /* If no mem allocated for this type, it just waits */
+       if (lfs_res_qty[type] == 0) {
+               r = malloc(size, M_SEGMENT, M_WAITOK);
+               return r;
+       }
+
+       /* Otherwise try a quick malloc, and if it works, great */
+       if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) {
+               return r;
+       }
+
+       /*
+        * If malloc returned NULL, we are forced to use one of our
+        * reserve blocks.  We have on hand at least one summary block,
+        * at least one cluster block, at least one superblock,
+        * and several indirect blocks.
+        */
+
+       mutex_enter(&lfs_lock);
+       /* skip over blocks of other types */
+       for (i = 0, start = 0; i < type; i++)
+               start += lfs_res_qty[i];
+       while (r == NULL) {
+               for (i = 0; i < lfs_res_qty[type]; i++) {
+                       if (fs->lfs_resblk[start + i].inuse == 0) {
+                               re = fs->lfs_resblk + start + i;
+                               re->inuse = 1;
+                               r = re->p;
+                               KASSERT(re->size >= size);
+                               h = lfs_mhash(r);
+                               s = splbio();
+                               LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res);
+                               splx(s);
+                               mutex_exit(&lfs_lock);
+                               return r;
+                       }
+               }
+               DLOG((DLOG_MALLOC, "sleeping on %s (%d)\n",
+                     lfs_res_names[type], lfs_res_qty[type]));
+               mtsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0,
+                       &lfs_lock);
+               DLOG((DLOG_MALLOC, "done sleeping on %s\n",
+                     lfs_res_names[type]));
+       }
+       /* NOTREACHED */
+       mutex_exit(&lfs_lock);
+       return r;
+}
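+
+/*
+ * Illustrative sketch, not from the imported source: callers are
+ * expected to pair lfs_malloc() with an lfs_free() of the same type,
+ * so that a reserve block borrowed under memory pressure is unhashed
+ * and sleepers are woken.  Hypothetical use:
+ */
+#if 0  /* sketch only */
+       void *sum = lfs_malloc(fs, fs->lfs_sumsize, LFS_NB_SUMMARY);
+       /* ... fill in and schedule the summary block ... */
+       lfs_free(fs, sum, LFS_NB_SUMMARY);
+#endif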
+
+void
+lfs_free(struct lfs *fs, void *p, int type)
+{
+       int s;
+       unsigned int h;
+       res_t *re;
+#ifdef DEBUG
+       int i;
+#endif
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       h = lfs_mhash(p);
+       mutex_enter(&lfs_lock);
+       s = splbio();
+       LIST_FOREACH(re, &fs->lfs_reshash[h], res) {
+               if (re->p == p) {
+                       KASSERT(re->inuse == 1);
+                       LIST_REMOVE(re, res);
+                       re->inuse = 0;
+                       wakeup(&fs->lfs_resblk);
+                       splx(s);
+                       mutex_exit(&lfs_lock);
+                       return;
+               }
+       }
+#ifdef DEBUG
+       for (i = 0; i < LFS_N_TOTAL; i++) {
+               if (fs->lfs_resblk[i].p == p)
+                       panic("lfs_free: inconsistent reserved block");
+       }
+#endif
+       splx(s);
+       mutex_exit(&lfs_lock);
+       
+       /*
+        * Not one of our reserve blocks, so it must have come from
+        * malloc(); free it that way.
+        */
+       free(p, M_SEGMENT);
+}
+
+/*
+ * lfs_seglock --
+ *     Single thread the segment writer.
+ */
+int
+lfs_seglock(struct lfs *fs, unsigned long flags)
+{
+       struct segment *sp;
+
+       mutex_enter(&lfs_lock);
+       if (fs->lfs_seglock) {
+               if (fs->lfs_lockpid == curproc->p_pid &&
+                   fs->lfs_locklwp == curlwp->l_lid) {
+                       ++fs->lfs_seglock;
+                       fs->lfs_sp->seg_flags |= flags;
+                       mutex_exit(&lfs_lock);
+                       return 0;
+               } else if (flags & SEGM_PAGEDAEMON) {
+                       mutex_exit(&lfs_lock);
+                       return EWOULDBLOCK;
+               } else {
+                       while (fs->lfs_seglock) {
+                               (void)mtsleep(&fs->lfs_seglock, PRIBIO + 1,
+                                       "lfs_seglock", 0, &lfs_lock);
+                       }
+               }
+       }
+
+       fs->lfs_seglock = 1;
+       fs->lfs_lockpid = curproc->p_pid;
+       fs->lfs_locklwp = curlwp->l_lid;
+       mutex_exit(&lfs_lock);
+       fs->lfs_cleanind = 0;
+
+#ifdef DEBUG
+       LFS_ENTER_LOG("seglock", __FILE__, __LINE__, 0, flags, curproc->p_pid);
+#endif
+       /* Drain fragment size changes out */
+       rw_enter(&fs->lfs_fraglock, RW_WRITER);
+
+       sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK);
+       sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK);
+       sp->seg_flags = flags;
+       sp->vp = NULL;
+       sp->seg_iocount = 0;
+       (void) lfs_initseg(fs);
+
+       /*
+        * Keep a cumulative count of the outstanding I/O operations.  If the
+        * disk drive catches up with us it could go to zero before we finish,
+        * so we artificially increment it by one until we've scheduled all of
+        * the writes we intend to do.
+        */
+       mutex_enter(&lfs_lock);
+       ++fs->lfs_iocount;
+       mutex_exit(&lfs_lock);
+       return 0;
+}
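+
+/*
+ * Illustrative sketch, not from the imported source: the segment lock
+ * is recursive for the holding LWP, so a nested acquisition only bumps
+ * lfs_seglock, and every lfs_seglock() must be balanced by an
+ * lfs_segunlock().  Hypothetical pattern:
+ */
+#if 0  /* sketch only */
+       lfs_seglock(fs, SEGM_PROT);
+       lfs_seglock(fs, 0);     /* same LWP: recursion, does not sleep */
+       lfs_segunlock(fs);      /* drops the nested hold */
+       lfs_segunlock(fs);      /* releases the lock and wakes waiters */
+#endif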
+
+static void lfs_unmark_dirop(struct lfs *);
+
+static void
+lfs_unmark_dirop(struct lfs *fs)
+{
+       struct inode *ip, *nip;
+       struct vnode *vp;
+       int doit;
+
+       ASSERT_NO_SEGLOCK(fs);
+       mutex_enter(&lfs_lock);
+       doit = !(fs->lfs_flags & LFS_UNDIROP);
+       if (doit)
+               fs->lfs_flags |= LFS_UNDIROP;
+       if (!doit) {
+               mutex_exit(&lfs_lock);
+               return;
+       }
+
+       for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
+               nip = TAILQ_NEXT(ip, i_lfs_dchain);
+               vp = ITOV(ip);
+               if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) {
+                       --lfs_dirvcount;
+                       --fs->lfs_dirvcount;
+                       vp->v_uflag &= ~VU_DIROP;
+                       TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+                       wakeup(&lfs_dirvcount);
+                       fs->lfs_unlockvp = vp;
+                       mutex_exit(&lfs_lock);
+                       vrele(vp);
+                       mutex_enter(&lfs_lock);
+                       fs->lfs_unlockvp = NULL;
+               }
+       }
+
+       fs->lfs_flags &= ~LFS_UNDIROP;
+       wakeup(&fs->lfs_flags);
+       mutex_exit(&lfs_lock);
+}
+
+static void
+lfs_auto_segclean(struct lfs *fs)
+{
+       int i, error, s, waited;
+
+       ASSERT_SEGLOCK(fs);
+       /*
+        * Now that we've swapped lfs_activesb, but while we still
+        * hold the segment lock, run through the segment list marking
+        * the empty ones clean.
+        * XXX - do we really need to do them all at once?
+        */
+       waited = 0;
+       for (i = 0; i < fs->lfs_nseg; i++) {
+               if ((fs->lfs_suflags[0][i] &
+                    (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
+                   (SEGUSE_DIRTY | SEGUSE_EMPTY) &&
+                   (fs->lfs_suflags[1][i] &
+                    (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) ==
+                   (SEGUSE_DIRTY | SEGUSE_EMPTY)) {
+
+                       /* Make sure the sb is written before we clean */
+                       mutex_enter(&lfs_lock);
+                       s = splbio();
+                       while (waited == 0 && fs->lfs_sbactive)
+                               mtsleep(&fs->lfs_sbactive, PRIBIO+1, "lfs asb",
+                                       0, &lfs_lock);
+                       splx(s);
+                       mutex_exit(&lfs_lock);
+                       waited = 1;
+
+                       if ((error = lfs_do_segclean(fs, i)) != 0) {
+                               DLOG((DLOG_CLEAN, "lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i));
+                       }
+               }
+               fs->lfs_suflags[1 - fs->lfs_activesb][i] =
+                       fs->lfs_suflags[fs->lfs_activesb][i];
+       }
+}
+
+/*
+ * lfs_segunlock --
+ *     Single thread the segment writer.
+ */
+void
+lfs_segunlock(struct lfs *fs)
+{
+       struct segment *sp;
+       unsigned long sync, ckp;
+       struct buf *bp;
+       int do_unmark_dirop = 0;
+
+       sp = fs->lfs_sp;
+
+       mutex_enter(&lfs_lock);
+       KASSERT(LFS_SEGLOCK_HELD(fs));
+       if (fs->lfs_seglock == 1) {
+               if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
+                   LFS_STARVED_FOR_SEGS(fs) == 0)
+                       do_unmark_dirop = 1;
+               mutex_exit(&lfs_lock);
+               sync = sp->seg_flags & SEGM_SYNC;
+               ckp = sp->seg_flags & SEGM_CKP;
+
+               /* We should have a segment summary, and nothing else */
+               KASSERT(sp->cbpp == sp->bpp + 1);
+
+               /* Free allocated segment summary */
+               fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
+               bp = *sp->bpp;
+               lfs_freebuf(fs, bp);
+
+               pool_put(&fs->lfs_bpppool, sp->bpp);
+               sp->bpp = NULL;
+
+               /*
+                * If we're not sync, we're done with sp, get rid of it.
+                * Otherwise, we keep a local copy around but free
+                * fs->lfs_sp so another process can use it (we have to
+                * wait but they don't have to wait for us).
+                */
+               if (!sync)
+                       pool_put(&fs->lfs_segpool, sp);
+               fs->lfs_sp = NULL;
+
+               /*
+                * If the I/O count is non-zero, sleep until it reaches zero.
+                * At the moment, the user's process hangs around so we can
+                * sleep.
+                */
+               mutex_enter(&lfs_lock);
+               if (--fs->lfs_iocount == 0) {
+                       LFS_DEBUG_COUNTLOCKED("lfs_segunlock");
+               }
+               if (fs->lfs_iocount <= 1)
+                       wakeup(&fs->lfs_iocount);
+               mutex_exit(&lfs_lock);
+               /*
+                * If we're not checkpointing, we don't have to block
+                * other processes to wait for a synchronous write
+                * to complete.
+                */
+               if (!ckp) {
+#ifdef DEBUG
+                       LFS_ENTER_LOG("segunlock_std", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+                       mutex_enter(&lfs_lock);
+                       --fs->lfs_seglock;
+                       fs->lfs_lockpid = 0;
+                       fs->lfs_locklwp = 0;
+                       mutex_exit(&lfs_lock);
+                       wakeup(&fs->lfs_seglock);
+               }
+               /*
+                * We let checkpoints happen asynchronously.  That means
+                * that during recovery, we have to roll forward between
+                * the two segments described by the first and second
+                * superblocks to make sure that the checkpoint described
+                * by a superblock completed.
+                */
+               mutex_enter(&lfs_lock);
+               while (ckp && sync && fs->lfs_iocount) {
+                       (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+                                     "lfs_iocount", 0, &lfs_lock);
+                       DLOG((DLOG_SEG, "sleeping on iocount %p == %d\n", fs, fs->lfs_iocount));
+               }
+               while (sync && sp->seg_iocount) {
+                       (void)mtsleep(&sp->seg_iocount, PRIBIO + 1,
+                                    "seg_iocount", 0, &lfs_lock);
+                       DLOG((DLOG_SEG, "sleeping on iocount %p == %d\n", sp, sp->seg_iocount));
+               }
+               mutex_exit(&lfs_lock);
+               if (sync)
+                       pool_put(&fs->lfs_segpool, sp);
+
+               if (ckp) {
+                       fs->lfs_nactive = 0;
+                       /* If we *know* everything's on disk, write both sbs */
+                       /* XXX should wait for this one  */
+                       if (sync)
+                               lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]);
+                       lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]);
+                       if (!(fs->lfs_ivnode->v_mount->mnt_iflag & IMNT_UNMOUNT)) {
+                               lfs_auto_segclean(fs);
+                               /* If sync, we can clean the remainder too */
+                               if (sync)
+                                       lfs_auto_segclean(fs);
+                       }
+                       fs->lfs_activesb = 1 - fs->lfs_activesb;
+#ifdef DEBUG
+                       LFS_ENTER_LOG("segunlock_ckp", __FILE__, __LINE__, 0, 0, curproc->p_pid);
+#endif
+                       mutex_enter(&lfs_lock);
+                       --fs->lfs_seglock;
+                       fs->lfs_lockpid = 0;
+                       fs->lfs_locklwp = 0;
+                       mutex_exit(&lfs_lock);
+                       wakeup(&fs->lfs_seglock);
+               }
+               /* Reenable fragment size changes */
+               rw_exit(&fs->lfs_fraglock);
+               if (do_unmark_dirop)
+                       lfs_unmark_dirop(fs);
+       } else if (fs->lfs_seglock == 0) {
+               mutex_exit(&lfs_lock);
+               panic ("Seglock not held");
+       } else {
+               --fs->lfs_seglock;
+               mutex_exit(&lfs_lock);
+       }
+}
+
+/*
+ * Drain dirops and start writer.
+ *
+ * No simple_locks are held when we enter and none are held when we return.
+ */
+int
+lfs_writer_enter(struct lfs *fs, const char *wmesg)
+{
+       int error = 0;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       mutex_enter(&lfs_lock);
+
+       /* disallow dirops during flush */
+       fs->lfs_writer++;
+
+       while (fs->lfs_dirops > 0) {
+               ++fs->lfs_diropwait;
+               error = mtsleep(&fs->lfs_writer, PRIBIO+1, wmesg, 0,
+                               &lfs_lock);
+               --fs->lfs_diropwait;
+       }
+
+       if (error)
+               fs->lfs_writer--;
+
+       mutex_exit(&lfs_lock);
+
+       return error;
+}
+
+void
+lfs_writer_leave(struct lfs *fs)
+{
+       bool dowakeup;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       mutex_enter(&lfs_lock);
+       dowakeup = !(--fs->lfs_writer);
+       mutex_exit(&lfs_lock);
+       if (dowakeup)
+               wakeup(&fs->lfs_dirops);
+}
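+
+/*
+ * Illustrative sketch, not from the imported source: a flush path
+ * would typically bracket its writes with these two calls, draining
+ * directory operations first and waking any waiting dirops afterwards:
+ */
+#if 0  /* sketch only */
+       if (lfs_writer_enter(fs, "lfsflush") == 0) {
+               /* ... write segments while new dirops are held off ... */
+               lfs_writer_leave(fs);
+       }
+#endif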
+
+/*
+ * Unlock, wait for the cleaner, then relock to where we were before.
+ * To be used only at a fairly high level, to address a paucity of free
+ * segments propagated back from lfs_gop_write().
+ */
+void
+lfs_segunlock_relock(struct lfs *fs)
+{
+       int n = fs->lfs_seglock;
+       u_int16_t seg_flags;
+       CLEANERINFO *cip;
+       struct buf *bp;
+
+       if (n == 0)
+               return;
+
+       /* Write anything we've already gathered to disk */
+       lfs_writeseg(fs, fs->lfs_sp);
+
+       /* Tell cleaner */
+       LFS_CLEANERINFO(cip, fs, bp);
+       cip->flags |= LFS_CLEANER_MUST_CLEAN;
+       LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+       /* Save segment flags for later */
+       seg_flags = fs->lfs_sp->seg_flags;
+
+       fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */
+       while (fs->lfs_seglock)
+               lfs_segunlock(fs);
+
+       /* Wait for the cleaner */
+       lfs_wakeup_cleaner(fs);
+       mutex_enter(&lfs_lock);
+       while (LFS_STARVED_FOR_SEGS(fs))
+               mtsleep(&fs->lfs_avail, PRIBIO, "relock", 0,
+                       &lfs_lock);
+       mutex_exit(&lfs_lock);
+
+       /* Put the segment lock back the way it was. */
+       while (n--)
+               lfs_seglock(fs, seg_flags);
+
+       /* Cleaner can relax now */
+       LFS_CLEANERINFO(cip, fs, bp);
+       cip->flags &= ~LFS_CLEANER_MUST_CLEAN;
+       LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+
+       return;
+}
+
+/*
+ * Wake up the cleaner, provided that nowrap is not set.
+ */
+void
+lfs_wakeup_cleaner(struct lfs *fs)
+{
+       if (fs->lfs_nowrap > 0)
+               return;
+
+       wakeup(&fs->lfs_nextseg);
+       wakeup(&lfs_allclean_wakeup);
+}
diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c
new file mode 100644 (file)
index 0000000..442b81d
--- /dev/null
@@ -0,0 +1,1224 @@
+/*     $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2008
+ *    The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1991, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_syscalls.c      8.10 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");
+
+#ifndef LFS
+# define LFS           /* for prototypes in syscallargs.h */
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+#include <sys/syscallargs.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
+int lfs_fasthashget(dev_t, ino_t, struct vnode **);
+
+pid_t lfs_cleaner_pid = 0;
+
+/*
+ * sys_lfs_markv:
+ *
+ * This will mark inodes and blocks dirty, so they are written into the log.
+ * It will block until all the blocks have been written.  The segment create
+ * time passed in the block_info and inode_info structures is used to decide
+ * if the data is valid for each block (in case some process dirtied a block
+ * or inode that is being cleaned between the determination that a block is
+ * live and the lfs_markv call).
+ *
+ *  0 on success
+ * -1/errno is returned on error.
+ */
+#ifdef USE_64BIT_SYSCALLS
+int
+sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(struct block_info *) blkiov;
+               syscallarg(int) blkcnt;
+       } */
+       BLOCK_INFO *blkiov;
+       int blkcnt, error;
+       fsid_t fsid;
+       struct lfs *fs;
+       struct mount *mntp;
+
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+
+       if ((mntp = vfs_getvfs(&fsid)) == NULL)
+               return (ENOENT);
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       blkcnt = SCARG(uap, blkcnt);
+       if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+               return (EINVAL);
+
+       KERNEL_LOCK(1, NULL);
+       blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+       if ((error = copyin(SCARG(uap, blkiov), blkiov,
+                           blkcnt * sizeof(BLOCK_INFO))) != 0)
+               goto out;
+
+       if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
+               copyout(blkiov, SCARG(uap, blkiov),
+                       blkcnt * sizeof(BLOCK_INFO));
+    out:
+       lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+       KERNEL_UNLOCK_ONE(NULL);
+       return error;
+}
+#else
+int
+sys_lfs_markv(struct lwp *l, const struct sys_lfs_markv_args *uap, register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(struct block_info *) blkiov;
+               syscallarg(int) blkcnt;
+       } */
+       BLOCK_INFO *blkiov;
+       BLOCK_INFO_15 *blkiov15;
+       int i, blkcnt, error;
+       fsid_t fsid;
+       struct lfs *fs;
+       struct mount *mntp;
+
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+
+       if ((mntp = vfs_getvfs(&fsid)) == NULL) 
+               return (ENOENT);
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       blkcnt = SCARG(uap, blkcnt);
+       if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+               return (EINVAL);
+
+       KERNEL_LOCK(1, NULL);
+       blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+       blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
+       if ((error = copyin(SCARG(uap, blkiov), blkiov15,
+                           blkcnt * sizeof(BLOCK_INFO_15))) != 0)
+               goto out;
+
+       for (i = 0; i < blkcnt; i++) {
+               blkiov[i].bi_inode     = blkiov15[i].bi_inode;
+               blkiov[i].bi_lbn       = blkiov15[i].bi_lbn;
+               blkiov[i].bi_daddr     = blkiov15[i].bi_daddr;
+               blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
+               blkiov[i].bi_version   = blkiov15[i].bi_version;
+               blkiov[i].bi_bp        = blkiov15[i].bi_bp;
+               blkiov[i].bi_size      = blkiov15[i].bi_size;
+       }
+
+       if ((error = lfs_markv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
+               for (i = 0; i < blkcnt; i++) {
+                       blkiov15[i].bi_inode     = blkiov[i].bi_inode;
+                       blkiov15[i].bi_lbn       = blkiov[i].bi_lbn;
+                       blkiov15[i].bi_daddr     = blkiov[i].bi_daddr;
+                       blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
+                       blkiov15[i].bi_version   = blkiov[i].bi_version;
+                       blkiov15[i].bi_bp        = blkiov[i].bi_bp;
+                       blkiov15[i].bi_size      = blkiov[i].bi_size;
+               }
+               copyout(blkiov15, SCARG(uap, blkiov),
+                       blkcnt * sizeof(BLOCK_INFO_15));
+       }
+    out:
+       lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+       lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
+       KERNEL_UNLOCK_ONE(NULL);
+       return error;
+}
+#endif
+
+#define        LFS_MARKV_MAX_BLOCKS    (LFS_MAX_BUFS)
+
+int
+lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
+    int blkcnt)
+{
+       BLOCK_INFO *blkp;
+       IFILE *ifp;
+       struct buf *bp;
+       struct inode *ip = NULL;
+       struct lfs *fs;
+       struct mount *mntp;
+       struct vnode *vp = NULL;
+       ino_t lastino;
+       daddr_t b_daddr, v_daddr;
+       int cnt, error;
+       int do_again = 0;
+       int numrefed = 0;
+       ino_t maxino;
+       size_t obsize;
+
+       /* number of blocks/inodes that we have already bwrite'ed */
+       int nblkwritten, ninowritten;
+
+       if ((mntp = vfs_getvfs(fsidp)) == NULL)
+               return (ENOENT);
+
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       if (fs->lfs_ronly)
+               return EROFS;
+
+       maxino = (fragstoblks(fs, VTOI(fs->lfs_ivnode)->i_ffs1_blocks) -
+                     fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb;
+
+       cnt = blkcnt;
+
+       if ((error = vfs_busy(mntp, NULL)) != 0)
+               return (error);
+
+       /*
+        * We take the seglock so that, even if we have to sleep here,
+        * our blocks cannot become invalid in the meantime.
+        *
+        * It is also important to note here that unless we specify SEGM_CKP,
+        * any Ifile blocks that we might be asked to clean will never get
+        * to the disk.
+        */
+       lfs_seglock(fs, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
+
+       /* Mark blocks/inodes dirty.  */
+       error = 0;
+
+       /* these were inside the initialization for the for loop */
+       v_daddr = LFS_UNUSED_DADDR;
+       lastino = LFS_UNUSED_INUM;
+       nblkwritten = ninowritten = 0;
+       for (blkp = blkiov; cnt--; ++blkp)
+       {
+               /* Bounds-check incoming data, avoid panic for failed VGET */
+               if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
+                       error = EINVAL;
+                       goto err3;
+               }
+               /*
+                * Get the IFILE entry (only once) and see if the file still
+                * exists.
+                */
+               if (lastino != blkp->bi_inode) {
+                       /*
+                        * Finish the old file, if there was one.  The presence
+                        * of a usable vnode in vp is signaled by a valid v_daddr.
+                        */
+                       if (v_daddr != LFS_UNUSED_DADDR) {
+                               lfs_vunref(vp);
+                               numrefed--;
+                       }
+
+                       /*
+                        * Start a new file
+                        */
+                       lastino = blkp->bi_inode;
+                       if (blkp->bi_inode == LFS_IFILE_INUM)
+                               v_daddr = fs->lfs_idaddr;
+                       else {
+                               LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+                               /* XXX fix for force write */
+                               v_daddr = ifp->if_daddr;
+                               brelse(bp, 0);
+                       }
+                       if (v_daddr == LFS_UNUSED_DADDR)
+                               continue;
+
+                       /* Get the vnode/inode. */
+                       error = lfs_fastvget(mntp, blkp->bi_inode, v_daddr,
+                                          &vp,
+                                          (blkp->bi_lbn == LFS_UNUSED_LBN
+                                           ? blkp->bi_bp
+                                           : NULL));
+
+                       if (!error) {
+                               numrefed++;
+                       }
+                       if (error) {
+                               DLOG((DLOG_CLEAN, "lfs_markv: lfs_fastvget"
+                                     " failed with %d (ino %d, segment %d)\n",
+                                     error, blkp->bi_inode,
+                                     dtosn(fs, blkp->bi_daddr)));
+                               /*
+                                * If we got EAGAIN, that means that the
+                                * Inode was locked.  This is
+                                * recoverable: just clean the rest of
+                                * this segment, and let the cleaner try
+                                * again with another.  (When the
+                                * cleaner runs again, this segment will
+                                * sort high on the list, since it is
+                                * now almost entirely empty.) But, we
+                                * still set v_daddr = LFS_UNUSED_DADDR
+                                * so as not to test this over and over
+                                * again.
+                                */
+                               if (error == EAGAIN) {
+                                       error = 0;
+                                       do_again++;
+                               }
+#ifdef DIAGNOSTIC
+                               else if (error != ENOENT)
+                                       panic("lfs_markv VFS_VGET FAILED");
+#endif
+                               /* lastino = LFS_UNUSED_INUM; */
+                               v_daddr = LFS_UNUSED_DADDR;
+                               vp = NULL;
+                               ip = NULL;
+                               continue;
+                       }
+                       ip = VTOI(vp);
+                       ninowritten++;
+               } else if (v_daddr == LFS_UNUSED_DADDR) {
+                       /*
+                        * This can only happen if the vnode is dead (or
+                        * in any case we can't get it...e.g., it is
+                        * locked).  Keep going.
+                        */
+                       continue;
+               }
+
+               /* Past this point we are guaranteed that vp, ip are valid. */
+
+               /* Can't clean VU_DIROP directories in case of truncation */
+               /* XXX - maybe we should mark removed dirs specially? */
+               if (vp->v_type == VDIR && (vp->v_uflag & VU_DIROP)) {
+                       do_again++;
+                       continue;
+               }
+
+               /* If this BLOCK_INFO didn't contain a block, keep going. */
+               if (blkp->bi_lbn == LFS_UNUSED_LBN) {
+                       /* XXX need to make sure that the inode gets written in this case */
+                       /* XXX but only write the inode if it's the right one */
+                       if (blkp->bi_inode != LFS_IFILE_INUM) {
+                               LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+                               if (ifp->if_daddr == blkp->bi_daddr) {
+                                       mutex_enter(&lfs_lock);
+                                       LFS_SET_UINO(ip, IN_CLEANING);
+                                       mutex_exit(&lfs_lock);
+                               }
+                               brelse(bp, 0);
+                       }
+                       continue;
+               }
+
+               b_daddr = 0;
+               if (VOP_BMAP(vp, blkp->bi_lbn, NULL, &b_daddr, NULL) ||
+                   dbtofsb(fs, b_daddr) != blkp->bi_daddr)
+               {
+                       if (dtosn(fs, dbtofsb(fs, b_daddr)) ==
+                           dtosn(fs, blkp->bi_daddr))
+                       {
+                               DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
+                                     (long long)blkp->bi_daddr, (long long)dbtofsb(fs, b_daddr)));
+                       }
+                       do_again++;
+                       continue;
+               }
+
+               /*
+                * Check block sizes.  The blocks being cleaned come from
+                * disk, so they should have the same size as their on-disk
+                * counterparts.
+                */
+               if (blkp->bi_lbn >= 0)
+                       obsize = blksize(fs, ip, blkp->bi_lbn);
+               else
+                       obsize = fs->lfs_bsize;
+               /* Check for fragment size change */
+               if (blkp->bi_lbn >= 0 && blkp->bi_lbn < NDADDR) {
+                       obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
+               }
+               if (obsize != blkp->bi_size) {
+                       DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
+                             " size (%ld != %d), try again\n",
+                             blkp->bi_inode, (long long)blkp->bi_lbn,
+                             (long) obsize, blkp->bi_size));
+                       do_again++;
+                       continue;
+               }
+
+               /*
+                * If we get to here, then we are keeping the block.  If
+                * it is an indirect block, we want to actually put it
+                * in the buffer cache so that it can be updated in the
+                * finish_meta section.  If it's not, we need to
+                * allocate a fake buffer so that writeseg can perform
+                * the copyin and write the buffer.
+                */
+               if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
+                       /* Data Block */
+                       bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
+                                        blkp->bi_size, blkp->bi_bp);
+                       /* Pretend we used bread() to get it */
+                       bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
+               } else {
+                       /* Indirect block or ifile */
+                       if (blkp->bi_size != fs->lfs_bsize &&
+                           ip->i_number != LFS_IFILE_INUM)
+                               panic("lfs_markv: partial indirect block?"
+                                   " size=%d\n", blkp->bi_size);
+                       bp = getblk(vp, blkp->bi_lbn, blkp->bi_size, 0, 0);
+                       if (!(bp->b_oflags & (BO_DONE|BO_DELWRI))) {
+                               /*
+                                * The block in question was not found
+                                * in the cache; i.e., the block that
+                                * getblk() returned is empty.  So, we
+                                * can (and should) copy in the
+                                * contents, because we've already
+                                * determined that this was the right
+                                * version of this block on disk.
+                                *
+                                * And, it can't have changed underneath
+                                * us, because we have the segment lock.
+                                */
+                               error = copyin(blkp->bi_bp, bp->b_data, blkp->bi_size);
+                               if (error)
+                                       goto err2;
+                       }
+               }
+               if ((error = lfs_bwrite_ext(bp, BW_CLEAN)) != 0)
+                       goto err2;
+
+               nblkwritten++;
+               /*
+                * XXX should account indirect blocks and ifile pages as well
+                */
+               if (nblkwritten + lblkno(fs, ninowritten * sizeof (struct ufs1_dinode))
+                   > LFS_MARKV_MAX_BLOCKS) {
+                       DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos\n",
+                             nblkwritten, ninowritten));
+                       lfs_segwrite(mntp, SEGM_CLEAN);
+                       nblkwritten = ninowritten = 0;
+               }
+       }
+
+       /*
+        * Finish the old file, if there was one
+        */
+       if (v_daddr != LFS_UNUSED_DADDR) {
+               lfs_vunref(vp);
+               numrefed--;
+       }
+
+#ifdef DIAGNOSTIC
+       if (numrefed != 0)
+               panic("lfs_markv: numrefed=%d", numrefed);
+#endif
+       DLOG((DLOG_CLEAN, "lfs_markv: writing %d blks %d inos (check point)\n",
+             nblkwritten, ninowritten));
+
+       /*
+        * The last write has to be SEGM_SYNC, because of calling semantics.
+        * It also has to be SEGM_CKP, because otherwise we could write
+        * over the newly cleaned data contained in a checkpoint, and then
+        * we'd be unhappy at recovery time.
+        */
+       lfs_segwrite(mntp, SEGM_CLEAN | SEGM_CKP | SEGM_SYNC);
+
+       lfs_segunlock(fs);
+
+       vfs_unbusy(mntp, false, NULL);
+       if (error)
+               return (error);
+       else if (do_again)
+               return EAGAIN;
+
+       return 0;
+
+err2:
+       DLOG((DLOG_CLEAN, "lfs_markv err2\n"));
+
+       /*
+        * XXX we're here because copyin() failed.
+        * XXX it means that we can't trust the cleanerd.  too bad.
+        * XXX how can we recover from this?
+        */
+
+err3:
+       /*
+        * XXX should do segwrite here anyway?
+        */
+
+       if (v_daddr != LFS_UNUSED_DADDR) {
+               lfs_vunref(vp);
+               --numrefed;
+       }
+
+       lfs_segunlock(fs);
+       vfs_unbusy(mntp, false, NULL);
+#ifdef DIAGNOSTIC
+       if (numrefed != 0)
+               panic("lfs_markv: numrefed=%d", numrefed);
+#endif
+
+       return (error);
+}
+
+/*
+ * sys_lfs_bmapv:
+ *
+ * This will fill in the current disk address for each block in an array.
+ *
+ *  0 on success
+ * -1/errno is returned on error.
+ */
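+/*
+ * How this pairs with lfs_markv, sketched (pseudo-code; "segnum" and
+ * the liveness test are illustrative):
+ *
+ *	lfs_bmapv(&fsid, bi, n);
+ *	for (i = nlive = 0; i < n; i++)
+ *		if (bi[i].bi_daddr still lies within segment segnum)
+ *			bi[nlive++] = bi[i];
+ *	lfs_markv(&fsid, bi, nlive);
+ *
+ * Blocks whose current address has moved out of the segment are dead
+ * and need not be rewritten.
+ */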
+#ifdef USE_64BIT_SYSCALLS
+int
+sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(struct block_info *) blkiov;
+               syscallarg(int) blkcnt;
+       } */
+       BLOCK_INFO *blkiov;
+       int blkcnt, error;
+       fsid_t fsid;
+       struct lfs *fs;
+       struct mount *mntp;
+
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+
+       if ((mntp = vfs_getvfs(&fsid)) == NULL) 
+               return (ENOENT);
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       blkcnt = SCARG(uap, blkcnt);
+       if ((u_int) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
+               return (EINVAL);
+       KERNEL_LOCK(1, NULL);
+       blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+       if ((error = copyin(SCARG(uap, blkiov), blkiov,
+                           blkcnt * sizeof(BLOCK_INFO))) != 0)
+               goto out;
+
+       if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0)
+               copyout(blkiov, SCARG(uap, blkiov),
+                       blkcnt * sizeof(BLOCK_INFO));
+    out:
+       lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+       KERNEL_UNLOCK_ONE(NULL);
+       return error;
+}
+#else
+int
+sys_lfs_bmapv(struct lwp *l, const struct sys_lfs_bmapv_args *uap, register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(struct block_info *) blkiov;
+               syscallarg(int) blkcnt;
+       } */
+       BLOCK_INFO *blkiov;
+       BLOCK_INFO_15 *blkiov15;
+       int i, blkcnt, error;
+       fsid_t fsid;
+       struct lfs *fs;
+       struct mount *mntp;
+
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+
+       if ((mntp = vfs_getvfs(&fsid)) == NULL) 
+               return (ENOENT);
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       blkcnt = SCARG(uap, blkcnt);
+       if ((size_t) blkcnt > SIZE_T_MAX / sizeof(BLOCK_INFO))
+               return (EINVAL);
+       KERNEL_LOCK(1, NULL);
+       blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+       blkiov15 = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO_15), LFS_NB_BLKIOV);
+       if ((error = copyin(SCARG(uap, blkiov), blkiov15,
+                           blkcnt * sizeof(BLOCK_INFO_15))) != 0)
+               goto out;
+
+       for (i = 0; i < blkcnt; i++) {
+               blkiov[i].bi_inode     = blkiov15[i].bi_inode;
+               blkiov[i].bi_lbn       = blkiov15[i].bi_lbn;
+               blkiov[i].bi_daddr     = blkiov15[i].bi_daddr;
+               blkiov[i].bi_segcreate = blkiov15[i].bi_segcreate;
+               blkiov[i].bi_version   = blkiov15[i].bi_version;
+               blkiov[i].bi_bp        = blkiov15[i].bi_bp;
+               blkiov[i].bi_size      = blkiov15[i].bi_size;
+       }
+
+       if ((error = lfs_bmapv(l->l_proc, &fsid, blkiov, blkcnt)) == 0) {
+               for (i = 0; i < blkcnt; i++) {
+                       blkiov15[i].bi_inode     = blkiov[i].bi_inode;
+                       blkiov15[i].bi_lbn       = blkiov[i].bi_lbn;
+                       blkiov15[i].bi_daddr     = blkiov[i].bi_daddr;
+                       blkiov15[i].bi_segcreate = blkiov[i].bi_segcreate;
+                       blkiov15[i].bi_version   = blkiov[i].bi_version;
+                       blkiov15[i].bi_bp        = blkiov[i].bi_bp;
+                       blkiov15[i].bi_size      = blkiov[i].bi_size;
+               }
+               copyout(blkiov15, SCARG(uap, blkiov),
+                       blkcnt * sizeof(BLOCK_INFO_15));
+       }
+    out:
+       lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+       lfs_free(fs, blkiov15, LFS_NB_BLKIOV);
+       KERNEL_UNLOCK_ONE(NULL);
+       return error;
+}
+#endif
+
+int
+lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
+{
+       BLOCK_INFO *blkp;
+       IFILE *ifp;
+       struct buf *bp;
+       struct inode *ip = NULL;
+       struct lfs *fs;
+       struct mount *mntp;
+       struct ufsmount *ump;
+       struct vnode *vp;
+       ino_t lastino;
+       daddr_t v_daddr;
+       int cnt, error;
+       int numrefed = 0;
+
+       lfs_cleaner_pid = p->p_pid;
+
+       if ((mntp = vfs_getvfs(fsidp)) == NULL)
+               return (ENOENT);
+
+       ump = VFSTOUFS(mntp);
+       if ((error = vfs_busy(mntp, NULL)) != 0)
+               return (error);
+
+       cnt = blkcnt;
+
+       fs = VFSTOUFS(mntp)->um_lfs;
+
+       error = 0;
+
+       /* these were inside the initialization for the for loop */
+       v_daddr = LFS_UNUSED_DADDR;
+       lastino = LFS_UNUSED_INUM;
+       for (blkp = blkiov; cnt--; ++blkp)
+       {
+               /*
+                * Get the IFILE entry (only once) and see if the file still
+                * exists.
+                */
+               if (lastino != blkp->bi_inode) {
+                       /*
+                        * Finish the old file, if there was one.  The presence
+                        * of a usable vnode in vp is signaled by a valid
+                        * v_daddr.
+                        */
+                       if (v_daddr != LFS_UNUSED_DADDR) {
+                               lfs_vunref(vp);
+                               numrefed--;
+                       }
+
+                       /*
+                        * Start a new file
+                        */
+                       lastino = blkp->bi_inode;
+                       if (blkp->bi_inode == LFS_IFILE_INUM)
+                               v_daddr = fs->lfs_idaddr;
+                       else {
+                               LFS_IENTRY(ifp, fs, blkp->bi_inode, bp);
+                               v_daddr = ifp->if_daddr;
+                               brelse(bp, 0);
+                       }
+                       if (v_daddr == LFS_UNUSED_DADDR) {
+                               blkp->bi_daddr = LFS_UNUSED_DADDR;
+                               continue;
+                       }
+                       /*
+                        * A regular call to VFS_VGET could deadlock
+                        * here.  Instead, we try an unlocked access.
+                        */
+                       mutex_enter(&ufs_ihash_lock);
+                       vp = ufs_ihashlookup(ump->um_dev, blkp->bi_inode);
+                       if (vp != NULL && !(vp->v_iflag & VI_XLOCK)) {
+                               ip = VTOI(vp);
+                               mutex_enter(vp->v_interlock);
+                               mutex_exit(&ufs_ihash_lock);
+                               if (lfs_vref(vp)) {
+                                       v_daddr = LFS_UNUSED_DADDR;
+                                       continue;
+                               }
+                               numrefed++;
+                       } else {
+                               mutex_exit(&ufs_ihash_lock);
+                               /*
+                                * Don't VFS_VGET if we're being unmounted,
+                                * since we hold vfs_busy().
+                                */
+                               if (mntp->mnt_iflag & IMNT_UNMOUNT) {
+                                       v_daddr = LFS_UNUSED_DADDR;
+                                       continue;
+                               }
+                               error = VFS_VGET(mntp, blkp->bi_inode, &vp);
+                               if (error) {
+                                       DLOG((DLOG_CLEAN, "lfs_bmapv: vget ino"
+                                             " %d failed with %d\n",
+                                             blkp->bi_inode, error));
+                                       v_daddr = LFS_UNUSED_DADDR;
+                                       continue;
+                               } else {
+                                       KASSERT(VOP_ISLOCKED(vp));
+                                       VOP_UNLOCK(vp);
+                                       numrefed++;
+                               }
+                       }
+                       ip = VTOI(vp);
+               } else if (v_daddr == LFS_UNUSED_DADDR) {
+                       /*
+                        * This can only happen if the vnode is dead.
+                        * Keep going.  Note that we DO NOT set the
+                        * bi_daddr to anything -- if we failed to get
+                        * the vnode, for example, we want to assume
+                        * conservatively that all of its blocks *are*
+                        * located in the segment in question.
+                        * lfs_markv will throw them out if we are
+                        * wrong.
+                        */
+                       /* blkp->bi_daddr = LFS_UNUSED_DADDR; */
+                       continue;
+               }
+
+               /* Past this point we are guaranteed that vp, ip are valid. */
+
+               if (blkp->bi_lbn == LFS_UNUSED_LBN) {
+                       /*
+                        * We just want the inode address, which is
+                        * conveniently in v_daddr.
+                        */
+                       blkp->bi_daddr = v_daddr;
+               } else {
+                       daddr_t bi_daddr;
+
+                       /* XXX ondisk32 */
+                       error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
+                                        &bi_daddr, NULL);
+                       if (error)
+                       {
+                               blkp->bi_daddr = LFS_UNUSED_DADDR;
+                               continue;
+                       }
+                       blkp->bi_daddr = dbtofsb(fs, bi_daddr);
+                       /* Fill in the block size, too */
+                       if (blkp->bi_lbn >= 0)
+                               blkp->bi_size = blksize(fs, ip, blkp->bi_lbn);
+                       else
+                               blkp->bi_size = fs->lfs_bsize;
+               }
+       }
+
+       /*
+        * Finish the old file, if there was one.  The presence
+        * of a usable vnode in vp is signaled by a valid v_daddr.
+        */
+       if (v_daddr != LFS_UNUSED_DADDR) {
+               lfs_vunref(vp);
+               numrefed--;
+       }
+
+#ifdef DIAGNOSTIC
+       if (numrefed != 0)
+               panic("lfs_bmapv: numrefed=%d", numrefed);
+#endif
+
+       vfs_unbusy(mntp, false, NULL);
+
+       return 0;
+}
+
+/*
+ * sys_lfs_segclean:
+ *
+ * Mark the segment clean.
+ *
+ *  0 on success
+ * -1/errno is returned on error.
+ */
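+/*
+ * Usage sketch (hypothetical): once lfs_markv() has rewritten all the
+ * live data of segment "segnum" and the segment usage table shows no
+ * live bytes, the cleaner finishes with
+ *
+ *	lfs_segclean(&fsid, segnum);
+ *
+ * which fails with EBUSY if the segment is active or still holds live
+ * bytes (see lfs_do_segclean below).
+ */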
+int
+sys_lfs_segclean(struct lwp *l, const struct sys_lfs_segclean_args *uap, register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(u_long) segment;
+       } */
+       struct lfs *fs;
+       struct mount *mntp;
+       fsid_t fsid;
+       int error;
+       unsigned long segnum;
+
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+       if ((mntp = vfs_getvfs(&fsid)) == NULL)
+               return (ENOENT);
+
+       fs = VFSTOUFS(mntp)->um_lfs;
+       segnum = SCARG(uap, segment);
+
+       if ((error = vfs_busy(mntp, NULL)) != 0)
+               return (error);
+
+       KERNEL_LOCK(1, NULL);
+       lfs_seglock(fs, SEGM_PROT);
+       error = lfs_do_segclean(fs, segnum);
+       lfs_segunlock(fs);
+       KERNEL_UNLOCK_ONE(NULL);
+       vfs_unbusy(mntp, false, NULL);
+       return error;
+}
+
+/*
+ * Actually mark the segment clean.
+ * Must be called with the segment lock held.
+ */
+int
+lfs_do_segclean(struct lfs *fs, unsigned long segnum)
+{
+       extern int lfs_dostats;
+       struct buf *bp;
+       CLEANERINFO *cip;
+       SEGUSE *sup;
+
+       if (dtosn(fs, fs->lfs_curseg) == segnum) {
+               return (EBUSY);
+       }
+
+       LFS_SEGENTRY(sup, fs, segnum, bp);
+       if (sup->su_nbytes) {
+               DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+                     " %d live bytes\n", segnum, sup->su_nbytes));
+               brelse(bp, 0);
+               return (EBUSY);
+       }
+       if (sup->su_flags & SEGUSE_ACTIVE) {
+               DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+                     " segment is active\n", segnum));
+               brelse(bp, 0);
+               return (EBUSY);
+       }
+       if (!(sup->su_flags & SEGUSE_DIRTY)) {
+               DLOG((DLOG_CLEAN, "lfs_segclean: not cleaning segment %lu:"
+                     " segment is already clean\n", segnum));
+               brelse(bp, 0);
+               return (EALREADY);
+       }
+
+       fs->lfs_avail += segtod(fs, 1);
+       if (sup->su_flags & SEGUSE_SUPERBLOCK)
+               fs->lfs_avail -= btofsb(fs, LFS_SBPAD);
+       if (fs->lfs_version > 1 && segnum == 0 &&
+           fs->lfs_start < btofsb(fs, LFS_LABELPAD))
+               fs->lfs_avail -= btofsb(fs, LFS_LABELPAD) - fs->lfs_start;
+       mutex_enter(&lfs_lock);
+       fs->lfs_bfree += sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
+               btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
+       fs->lfs_dmeta -= sup->su_nsums * btofsb(fs, fs->lfs_sumsize) +
+               btofsb(fs, sup->su_ninos * fs->lfs_ibsize);
+       if (fs->lfs_dmeta < 0)
+               fs->lfs_dmeta = 0;
+       mutex_exit(&lfs_lock);
+       sup->su_flags &= ~SEGUSE_DIRTY;
+       LFS_WRITESEGENTRY(sup, fs, segnum, bp);
+
+       LFS_CLEANERINFO(cip, fs, bp);
+       ++cip->clean;
+       --cip->dirty;
+       fs->lfs_nclean = cip->clean;
+       cip->bfree = fs->lfs_bfree;
+       mutex_enter(&lfs_lock);
+       cip->avail = fs->lfs_avail - fs->lfs_ravail - fs->lfs_favail;
+       wakeup(&fs->lfs_avail);
+       mutex_exit(&lfs_lock);
+       (void) LFS_BWRITE_LOG(bp);
+
+       if (lfs_dostats)
+               ++lfs_stats.segs_reclaimed;
+
+       return (0);
+}
+
+/*
+ * This will block until a segment in file system fsid is written.  A timeout
+ * may be specified, after which the cleaner is awakened automatically.
+ * An fsid of -1 means any file system, and a timeout of 0 means forever.
+ */
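+/*
+ * Example, as a sketch: wait at most one second for any LFS to write a
+ * segment (a NULL fsidp selects "any file system", per the test below):
+ *
+ *	struct timeval tv = { 1, 0 };
+ *	(void)lfs_segwait(NULL, &tv);
+ */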
+int
+lfs_segwait(fsid_t *fsidp, struct timeval *tv)
+{
+       struct mount *mntp;
+       void *addr;
+       u_long timeout;
+       int error;
+
+       KERNEL_LOCK(1, NULL);
+       if (fsidp == NULL || (mntp = vfs_getvfs(fsidp)) == NULL)
+               addr = &lfs_allclean_wakeup;
+       else
+               addr = &VFSTOUFS(mntp)->um_lfs->lfs_nextseg;
+       /*
+        * XXX THIS COULD SLEEP FOREVER IF TIMEOUT IS {0,0}!
+        * XXX IS THAT WHAT IS INTENDED?
+        */
+       timeout = tvtohz(tv);
+       error = tsleep(addr, PCATCH | PVFS, "segment", timeout);
+       KERNEL_UNLOCK_ONE(NULL);
+       return (error == ERESTART ? EINTR : 0);
+}
+
+/*
+ * sys_lfs_segwait:
+ *
+ * System call wrapper around lfs_segwait().
+ *
+ *  0 on success
+ *  1 on timeout
+ * -1/errno is returned on error.
+ */
+int
+sys___lfs_segwait50(struct lwp *l, const struct sys___lfs_segwait50_args *uap,
+    register_t *retval)
+{
+       /* {
+               syscallarg(fsid_t *) fsidp;
+               syscallarg(struct timeval *) tv;
+       } */
+       struct timeval atv;
+       fsid_t fsid;
+       int error;
+
+       /* XXX need we be su to segwait? */
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0)
+               return (error);
+       if ((error = copyin(SCARG(uap, fsidp), &fsid, sizeof(fsid_t))) != 0)
+               return (error);
+
+       if (SCARG(uap, tv)) {
+               error = copyin(SCARG(uap, tv), &atv, sizeof(struct timeval));
+               if (error)
+                       return (error);
+               if (itimerfix(&atv))
+                       return (EINVAL);
+       } else /* NULL or invalid */
+               atv.tv_sec = atv.tv_usec = 0;
+       return lfs_segwait(&fsid, &atv);
+}
+
+/*
+ * VFS_VGET call specialized for the cleaner.  The cleaner already knows the
+ * daddr from the ifile, so don't look it up again.  If the cleaner is
+ * processing IINFO structures, it may have the ondisk inode already, so
+ * don't go retrieving it again.
+ *
+ * We lfs_vref the vnode here, and it is the caller's responsibility to
+ * lfs_vunref it when finished.
+ */
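+/*
+ * Expected calling pattern, sketched:
+ *
+ *	struct vnode *vp;
+ *	if (lfs_fastvget(mp, ino, daddr, &vp, NULL) == 0) {
+ *		... vp is referenced but unlocked ...
+ *		lfs_vunref(vp);
+ *	}
+ */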
+
+int
+lfs_fasthashget(dev_t dev, ino_t ino, struct vnode **vpp)
+{
+       struct vnode *vp;
+
+       mutex_enter(&ufs_ihash_lock);
+       if ((vp = ufs_ihashlookup(dev, ino)) != NULL) {
+               mutex_enter(vp->v_interlock);
+               mutex_exit(&ufs_ihash_lock);
+               if (vp->v_iflag & VI_XLOCK) {
+                       DLOG((DLOG_CLEAN, "lfs_fastvget: ino %d VI_XLOCK\n",
+                             ino));
+                       lfs_stats.clean_vnlocked++;
+                       mutex_exit(vp->v_interlock);
+                       return EAGAIN;
+               }
+               if (lfs_vref(vp)) {
+                       DLOG((DLOG_CLEAN, "lfs_fastvget: lfs_vref failed"
+                             " for ino %d\n", ino));
+                       lfs_stats.clean_inlocked++;
+                       return EAGAIN;
+               }
+       } else {
+               mutex_exit(&ufs_ihash_lock);
+       }
+       *vpp = vp;
+
+       return (0);
+}
+
+int
+lfs_fastvget(struct mount *mp, ino_t ino, daddr_t daddr, struct vnode **vpp,
+            struct ufs1_dinode *dinp)
+{
+       struct inode *ip;
+       struct ufs1_dinode *dip;
+       struct vnode *vp;
+       struct ufsmount *ump;
+       dev_t dev;
+       int error, retries;
+       struct buf *bp;
+       struct lfs *fs;
+
+       ump = VFSTOUFS(mp);
+       dev = ump->um_dev;
+       fs = ump->um_lfs;
+
+       /*
+        * Wait until the filesystem is fully mounted before allowing vget
+        * to complete.  This prevents possible problems with roll-forward.
+        */
+       mutex_enter(&lfs_lock);
+       while (fs->lfs_flags & LFS_NOTYET) {
+               mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0,
+                       &lfs_lock);
+       }
+       mutex_exit(&lfs_lock);
+
+       /*
+        * This is playing fast and loose.  Someone may have the inode
+        * locked, in which case they are going to be distinctly unhappy
+        * if we trash something.
+        */
+
+       error = lfs_fasthashget(dev, ino, vpp);
+       if (error != 0 || *vpp != NULL)
+               return (error);
+
+       /*
+        * getnewvnode(9) will call vfs_busy, which will block if the
+        * filesystem is being unmounted; but umount(9) is waiting for
+        * us because we're already holding the fs busy.
+        * XXXMP
+        */
+       if (mp->mnt_iflag & IMNT_UNMOUNT) {
+               *vpp = NULL;
+               return EDEADLK;
+       }
+       error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
+       if (error) {
+               *vpp = NULL;
+               return (error);
+       }
+
+       mutex_enter(&ufs_hashlock);
+       error = lfs_fasthashget(dev, ino, vpp);
+       if (error != 0 || *vpp != NULL) {
+               mutex_exit(&ufs_hashlock);
+               ungetnewvnode(vp);
+               return (error);
+       }
+
+       /* Allocate new vnode/inode. */
+       lfs_vcreate(mp, ino, vp);
+
+       /*
+        * Put it onto its hash chain and lock it so that other requests for
+        * this inode will block if they arrive while we are sleeping waiting
+        * for old data structures to be purged or for the contents of the
+        * disk portion of this inode to be read.
+        */
+       ip = VTOI(vp);
+       ufs_ihashins(ip);
+       mutex_exit(&ufs_hashlock);
+
+       /*
+        * XXX
+        * This may not need to be here, logically it should go down with
+        * the i_devvp initialization.
+        * Ask Kirk.
+        */
+       ip->i_lfs = fs;
+
+       /* Read in the disk contents for the inode, copy into the inode. */
+       if (dinp) {
+               error = copyin(dinp, ip->i_din.ffs1_din, sizeof (struct ufs1_dinode));
+               if (error) {
+                       DLOG((DLOG_CLEAN, "lfs_fastvget: dinode copyin failed"
+                             " for ino %d\n", ino));
+                       ufs_ihashrem(ip);
+
+                       /* Unlock and discard unneeded inode. */
+                       VOP_UNLOCK(vp);
+                       lfs_vunref(vp);
+                       *vpp = NULL;
+                       return (error);
+               }
+               if (ip->i_number != ino)
+                       panic("lfs_fastvget: I was fed the wrong inode!");
+       } else {
+               retries = 0;
+           again:
+               error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
+                             NOCRED, 0, &bp);
+               if (error) {
+                       DLOG((DLOG_CLEAN, "lfs_fastvget: bread failed (%d)\n",
+                             error));
+                       /*
+                        * The inode does not contain anything useful, so it
+                        * would be misleading to leave it on its hash chain.
+                        * Iput() will return it to the free list.
+                        */
+                       ufs_ihashrem(ip);
+
+                       /* Unlock and discard unneeded inode. */
+                       VOP_UNLOCK(vp);
+                       lfs_vunref(vp);
+                       brelse(bp, 0);
+                       *vpp = NULL;
+                       return (error);
+               }
+               dip = lfs_ifind(ump->um_lfs, ino, bp);
+               if (dip == NULL) {
+                       /* Assume write has not completed yet; try again */
+                       brelse(bp, BC_INVAL);
+                       ++retries;
+                       if (retries > LFS_IFIND_RETRIES)
+                               panic("lfs_fastvget: dinode not found");
+                       DLOG((DLOG_CLEAN, "lfs_fastvget: dinode not found,"
+                             " retrying...\n"));
+                       goto again;
+               }
+               *ip->i_din.ffs1_din = *dip;
+               brelse(bp, 0);
+       }
+       lfs_vinit(mp, &vp);
+
+       *vpp = vp;
+
+       KASSERT(VOP_ISLOCKED(vp));
+       VOP_UNLOCK(vp);
+
+       return (0);
+}
+
+/*
+ * Make up a "fake" cleaner buffer, copy the data from userland into it.
+ */
+struct buf *
+lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
+{
+       struct buf *bp;
+       int error;
+
+       KASSERT(VTOI(vp)->i_number != LFS_IFILE_INUM);
+
+       bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
+       error = copyin(uaddr, bp->b_data, size);
+       if (error) {
+               lfs_freebuf(fs, bp);
+               return NULL;
+       }
+       KDASSERT(bp->b_iodone == lfs_callback);
+
+#if 0
+       mutex_enter(&lfs_lock);
+       ++fs->lfs_iocount;
+       mutex_exit(&lfs_lock);
+#endif
+       bp->b_bufsize = size;
+       bp->b_bcount = size;
+       return (bp);
+}
diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c
new file mode 100644 (file)
index 0000000..7769e94
--- /dev/null
+++ b/sys/ufs/lfs/lfs_vfsops.c
@@ -0,0 +1,2138 @@
+/*     $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007
+ *     The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1989, 1991, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_vfsops.c        8.20 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_lfs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kthread.h>
+#include <sys/buf.h>
+#include <sys/device.h>
+#include <sys/mbuf.h>
+#include <sys/file.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/pool.h>
+#include <sys/socket.h>
+#include <sys/syslog.h>
+#include <uvm/uvm_extern.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/kauth.h>
+#include <sys/module.h>
+#include <sys/syscallvar.h>
+#include <sys/syscall.h>
+#include <sys/syscallargs.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+#include <uvm/uvm_pdaemon.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+
+MODULE(MODULE_CLASS_VFS, lfs, "ffs");
+
+static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
+static bool lfs_issequential_hole(const struct ufsmount *,
+    daddr_t, daddr_t);
+
+static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *);
+
+static struct sysctllog *lfs_sysctl_log;
+
+extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
+extern const struct vnodeopv_desc lfs_specop_opv_desc;
+extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
+
+pid_t lfs_writer_daemon = 0;
+int lfs_do_flush = 0;
+#ifdef LFS_KERNEL_RFW
+int lfs_do_rfw = 0;
+#endif
+
+const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
+       &lfs_vnodeop_opv_desc,
+       &lfs_specop_opv_desc,
+       &lfs_fifoop_opv_desc,
+       NULL,
+};
+
+struct vfsops lfs_vfsops = {
+       MOUNT_LFS,
+       sizeof (struct ufs_args),
+       lfs_mount,
+       ufs_start,
+       lfs_unmount,
+       ufs_root,
+       ufs_quotactl,
+       lfs_statvfs,
+       lfs_sync,
+       lfs_vget,
+       lfs_fhtovp,
+       lfs_vptofh,
+       lfs_init,
+       lfs_reinit,
+       lfs_done,
+       lfs_mountroot,
+       (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+       vfs_stdextattrctl,
+       (void *)eopnotsupp,     /* vfs_suspendctl */
+       genfs_renamelock_enter,
+       genfs_renamelock_exit,
+       (void *)eopnotsupp,
+       lfs_vnodeopv_descs,
+       0,
+       { NULL, NULL },
+};
+
+const struct genfs_ops lfs_genfsops = {
+       .gop_size = lfs_gop_size,
+       .gop_alloc = ufs_gop_alloc,
+       .gop_write = lfs_gop_write,
+       .gop_markupdate = ufs_gop_markupdate,
+};
+
+static const struct ufs_ops lfs_ufsops = {
+       .uo_itimes = NULL,
+       .uo_update = lfs_update,
+       .uo_truncate = lfs_truncate,
+       .uo_valloc = lfs_valloc,
+       .uo_vfree = lfs_vfree,
+       .uo_balloc = lfs_balloc,
+       .uo_unmark_vnode = lfs_unmark_vnode,
+};
+
+struct shortlong {
+       const char *sname;
+       const char *lname;
+};
+
+static int
+sysctl_lfs_dostats(SYSCTLFN_ARGS)
+{
+       extern struct lfs_stats lfs_stats;
+       extern int lfs_dostats;
+       int error;
+
+       error = sysctl_lookup(SYSCTLFN_CALL(rnode));
+       if (error || newp == NULL)
+               return (error);
+
+       if (lfs_dostats == 0)
+               memset(&lfs_stats, 0, sizeof(lfs_stats));
+
+       return (0);
+}
+
+static void
+lfs_sysctl_setup(struct sysctllog **clog)
+{
+       int i;
+       extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead,
+                  lfs_fs_pagetrip, lfs_ignore_lazy_sync;
+#ifdef DEBUG
+       extern int lfs_debug_log_subsys[DLOG_MAX];
+       struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */
+               { "rollforward", "Debug roll-forward code" },
+               { "alloc",      "Debug inode allocation and free list" },
+               { "avail",      "Debug space-available-now accounting" },
+               { "flush",      "Debug flush triggers" },
+               { "lockedlist", "Debug locked list accounting" },
+               { "vnode_verbose", "Verbose per-vnode-written debugging" },
+               { "vnode",      "Debug vnode use during segment write" },
+               { "segment",    "Debug segment writing" },
+               { "seguse",     "Debug segment used-bytes accounting" },
+               { "cleaner",    "Debug cleaning routines" },
+               { "mount",      "Debug mount/unmount routines" },
+               { "pagecache",  "Debug UBC interactions" },
+               { "dirop",      "Debug directory-operation accounting" },
+               { "malloc",     "Debug private malloc accounting" },
+       };
+#endif /* DEBUG */
+       struct shortlong stat_names[] = { /* Must match lfs.h! */
+               { "segsused",       "Number of new segments allocated" },
+               { "psegwrites",     "Number of partial-segment writes" },
+               { "psyncwrites",    "Number of synchronous partial-segment"
+                                   " writes" },
+               { "pcleanwrites",   "Number of partial-segment writes by the"
+                                   " cleaner" },
+               { "blocktot",       "Number of blocks written" },
+               { "cleanblocks",    "Number of blocks written by the cleaner" },
+               { "ncheckpoints",   "Number of checkpoints made" },
+               { "nwrites",        "Number of whole writes" },
+               { "nsync_writes",   "Number of synchronous writes" },
+               { "wait_exceeded",  "Number of times writer waited for"
+                                   " cleaner" },
+               { "write_exceeded", "Number of times writer invoked flush" },
+               { "flush_invoked",  "Number of times flush was invoked" },
+               { "vflush_invoked", "Number of time vflush was called" },
+               { "clean_inlocked", "Number of vnodes skipped for VI_XLOCK" },
+               { "clean_vnlocked", "Number of vnodes skipped for vget failure" },
+               { "segs_reclaimed", "Number of segments reclaimed" },
+       };
+
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "vfs", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_VFS, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "lfs",
+                      SYSCTL_DESCR("Log-structured file system"),
+                      NULL, 0, NULL, 0,
+                      CTL_VFS, 5, CTL_EOL);
+       /*
+        * XXX the "5" above could be dynamic, thereby eliminating one
+        * more instance of the "number to vfs" mapping problem, but
+        * "5" is the order as taken from sys/mount.h
+        */
+
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "flushindir", NULL,
+                      NULL, 0, &lfs_writeindir, 0,
+                      CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "clean_vnhead", NULL,
+                      NULL, 0, &lfs_clean_vnhead, 0,
+                      CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "dostats",
+                      SYSCTL_DESCR("Maintain statistics on LFS operations"),
+                      sysctl_lfs_dostats, 0, &lfs_dostats, 0,
+                      CTL_VFS, 5, LFS_DOSTATS, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "pagetrip",
+                      SYSCTL_DESCR("How many dirty pages in fs triggers"
+                                   " a flush"),
+                      NULL, 0, &lfs_fs_pagetrip, 0,
+                      CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL);
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "ignore_lazy_sync",
+                      SYSCTL_DESCR("Lazy Sync is ignored entirely"),
+                      NULL, 0, &lfs_ignore_lazy_sync, 0,
+                      CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL);
+#ifdef LFS_KERNEL_RFW
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "rfw",
+                      SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
+                      NULL, 0, &lfs_do_rfw, 0,
+                      CTL_VFS, 5, LFS_DO_RFW, CTL_EOL);
+#endif
+
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "stats",
+                      SYSCTL_DESCR("Debugging options"),
+                      NULL, 0, NULL, 0,
+                      CTL_VFS, 5, LFS_STATS, CTL_EOL);
+       for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) {
+               sysctl_createv(clog, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+                              CTLTYPE_INT, stat_names[i].sname,
+                              SYSCTL_DESCR(stat_names[i].lname),
+                              NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]),
+                              0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL);
+       }
+
+#ifdef DEBUG
+       sysctl_createv(clog, 0, NULL, NULL,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "debug",
+                      SYSCTL_DESCR("Debugging options"),
+                      NULL, 0, NULL, 0,
+                      CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL);
+       for (i = 0; i < DLOG_MAX; i++) {
+               sysctl_createv(clog, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                              CTLTYPE_INT, dlog_names[i].sname,
+                              SYSCTL_DESCR(dlog_names[i].lname),
+                              NULL, 0, &(lfs_debug_log_subsys[i]), 0,
+                              CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL);
+       }
+#endif
+}
+
+/* old cleaner syscall interface.  see VOP_FCNTL() */
+static const struct syscall_package lfs_syscalls[] = {
+       { SYS_lfs_bmapv,        0, (sy_call_t *)sys_lfs_bmapv           },
+       { SYS_lfs_markv,        0, (sy_call_t *)sys_lfs_markv           },
+       { SYS_lfs_segclean,     0, (sy_call_t *)sys_lfs_segclean        },
+       { SYS___lfs_segwait50,  0, (sy_call_t *)sys___lfs_segwait50     },
+       { 0, 0, NULL },
+};
+
+static int
+lfs_modcmd(modcmd_t cmd, void *arg)
+{
+       int error;
+
+       switch (cmd) {
+       case MODULE_CMD_INIT:
+               error = syscall_establish(NULL, lfs_syscalls);
+               if (error)
+                       return error;
+               error = vfs_attach(&lfs_vfsops);
+               if (error != 0) {
+                       syscall_disestablish(NULL, lfs_syscalls);
+                       break;
+               }
+               lfs_sysctl_setup(&lfs_sysctl_log);
+               break;
+       case MODULE_CMD_FINI:
+               error = vfs_detach(&lfs_vfsops);
+               if (error != 0)
+                       break;
+               syscall_disestablish(NULL, lfs_syscalls);
+               sysctl_teardown(&lfs_sysctl_log);
+               break;
+       default:
+               error = ENOTTY;
+               break;
+       }
+
+       return (error);
+}
+
+/*
+ * XXX Same structure as FFS inodes?  Should we share a common pool?
+ */
+struct pool lfs_inode_pool;
+struct pool lfs_dinode_pool;
+struct pool lfs_inoext_pool;
+struct pool lfs_lbnentry_pool;
+
+/*
+ * The writer daemon.  UVM keeps track of how many dirty pages we are holding
+ * in lfs_subsys_pages; the daemon flushes the filesystem when this value
+ * crosses the (user-defined) threshold LFS_MAX_PAGES.
+ */
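+/*
+ * Note that the mtsleep() below uses a hz/10 timeout, so the daemon
+ * effectively polls as well: even without an explicit wakeup on
+ * &lfs_writer_daemon it re-evaluates the flush conditions roughly ten
+ * times per second.
+ */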
+static void
+lfs_writerd(void *arg)
+{
+       struct mount *mp, *nmp;
+       struct lfs *fs;
+       int fsflags;
+       int loopcount;
+
+       lfs_writer_daemon = curproc->p_pid;
+
+       mutex_enter(&lfs_lock);
+       for (;;) {
+               mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10,
+                   &lfs_lock);
+
+               /*
+                * Look through the list of LFSs to see if any of them
+                * have requested pageouts.
+                */
+               mutex_enter(&mountlist_lock);
+               for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
+                    mp = nmp) {
+                       if (vfs_busy(mp, &nmp)) {
+                               continue;
+                       }
+                       if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
+                           sizeof(mp->mnt_stat.f_fstypename)) == 0) {
+                               fs = VFSTOUFS(mp)->um_lfs;
+                               mutex_enter(&lfs_lock);
+                               fsflags = 0;
+                               if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+                                    lfs_dirvcount > LFS_MAX_DIROP) &&
+                                   fs->lfs_dirops == 0)
+                                       fsflags |= SEGM_CKP;
+                               if (fs->lfs_pdflush) {
+                                       DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
+                                       fs->lfs_pdflush = 0;
+                                       lfs_flush_fs(fs, fsflags);
+                                       mutex_exit(&lfs_lock);
+                               } else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
+                                       DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
+                                       mutex_exit(&lfs_lock);
+                                       lfs_writer_enter(fs, "wrdirop");
+                                       lfs_flush_pchain(fs);
+                                       lfs_writer_leave(fs);
+                               } else
+                                       mutex_exit(&lfs_lock);
+                       }
+                       vfs_unbusy(mp, false, &nmp);
+               }
+               mutex_exit(&mountlist_lock);
+
+               /*
+                * If global state wants a flush, flush everything.
+                */
+               mutex_enter(&lfs_lock);
+               loopcount = 0;
+               if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
+                       locked_queue_bytes > LFS_MAX_BYTES ||
+                       lfs_subsys_pages > LFS_MAX_PAGES) {
+
+                       if (lfs_do_flush) {
+                               DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n"));
+                       }
+                       if (locked_queue_count > LFS_MAX_BUFS) {
+                               DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n",
+                                     locked_queue_count, LFS_MAX_BUFS));
+                       }
+                       if (locked_queue_bytes > LFS_MAX_BYTES) {
+                               DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n",
+                                     locked_queue_bytes, LFS_MAX_BYTES));
+                       }
+                       if (lfs_subsys_pages > LFS_MAX_PAGES) {
+                               DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n",
+                                     lfs_subsys_pages, LFS_MAX_PAGES));
+                       }
+
+                       lfs_flush(NULL, SEGM_WRITERD, 0);
+                       lfs_do_flush = 0;
+               }
+       }
+       /* NOTREACHED */
+}
+
+/*
+ * Initialize the filesystem, most work done by ufs_init.
+ */
+void
+lfs_init(void)
+{
+
+       malloc_type_attach(M_SEGMENT);
+       pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
+           "lfsinopl", &pool_allocator_nointr, IPL_NONE);
+       pool_init(&lfs_dinode_pool, sizeof(struct ufs1_dinode), 0, 0, 0,
+           "lfsdinopl", &pool_allocator_nointr, IPL_NONE);
+       pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
+           "lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
+       pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
+           "lfslbnpool", &pool_allocator_nointr, IPL_NONE);
+       ufs_init();
+
+#ifdef DEBUG
+       memset(lfs_log, 0, sizeof(lfs_log));
+#endif
+       mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
+       cv_init(&locked_queue_cv, "lfsbuf");
+       cv_init(&lfs_writing_cv, "lfsflush");
+}
+
+void
+lfs_reinit(void)
+{
+       ufs_reinit();
+}
+
+void
+lfs_done(void)
+{
+       ufs_done();
+       mutex_destroy(&lfs_lock);
+       cv_destroy(&locked_queue_cv);
+       cv_destroy(&lfs_writing_cv);
+       pool_destroy(&lfs_inode_pool);
+       pool_destroy(&lfs_dinode_pool);
+       pool_destroy(&lfs_inoext_pool);
+       pool_destroy(&lfs_lbnentry_pool);
+       malloc_type_detach(M_SEGMENT);
+}
+
+/*
+ * Called by main() when lfs is going to be mounted as root.
+ */
+int
+lfs_mountroot(void)
+{
+       extern struct vnode *rootvp;
+       struct lfs *fs = NULL;                          /* LFS */
+       struct mount *mp;
+       struct lwp *l = curlwp;
+       struct ufsmount *ump;
+       int error;
+
+       if (device_class(root_device) != DV_DISK)
+               return (ENODEV);
+
+       if (rootdev == NODEV)
+               return (ENODEV);
+       if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
+               vrele(rootvp);
+               return (error);
+       }
+       if ((error = lfs_mountfs(rootvp, mp, l))) {
+               vfs_unbusy(mp, false, NULL);
+               vfs_destroy(mp);
+               return (error);
+       }
+       mutex_enter(&mountlist_lock);
+       CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+       mutex_exit(&mountlist_lock);
+       ump = VFSTOUFS(mp);
+       fs = ump->um_lfs;
+       memset(fs->lfs_fsmnt, 0, sizeof(fs->lfs_fsmnt));
+       (void)copystr(mp->mnt_stat.f_mntonname, fs->lfs_fsmnt, MNAMELEN - 1, 0);
+       (void)lfs_statvfs(mp, &mp->mnt_stat);
+       vfs_unbusy(mp, false, NULL);
+       setrootfstime((time_t)(VFSTOUFS(mp)->um_lfs->lfs_tstamp));
+       return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+int
+lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+       struct lwp *l = curlwp;
+       struct vnode *devvp;
+       struct ufs_args *args = data;
+       struct ufsmount *ump = NULL;
+       struct lfs *fs = NULL;                          /* LFS */
+       int error = 0, update;
+       mode_t accessmode;
+
+       if (*data_len < sizeof *args)
+               return EINVAL;
+
+       if (mp->mnt_flag & MNT_GETARGS) {
+               ump = VFSTOUFS(mp);
+               if (ump == NULL)
+                       return EIO;
+               args->fspec = NULL;
+               *data_len = sizeof *args;
+               return 0;
+       }
+
+       update = mp->mnt_flag & MNT_UPDATE;
+
+       /* Check arguments */
+       if (args->fspec != NULL) {
+               /*
+                * Look up the name and verify that it's sane.
+                */
+               error = namei_simple_user(args->fspec,
+                                       NSM_FOLLOW_NOEMULROOT, &devvp);
+               if (error != 0)
+                       return (error);
+
+               if (!update) {
+                       /*
+                        * Be sure this is a valid block device
+                        */
+                       if (devvp->v_type != VBLK)
+                               error = ENOTBLK;
+                       else if (bdevsw_lookup(devvp->v_rdev) == NULL)
+                               error = ENXIO;
+               } else {
+                       /*
+                        * Be sure we're still naming the same device
+                        * used for our initial mount
+                        */
+                       ump = VFSTOUFS(mp);
+                       if (devvp != ump->um_devvp) {
+                               if (devvp->v_rdev != ump->um_devvp->v_rdev)
+                                       error = EINVAL;
+                               else {
+                                       vrele(devvp);
+                                       devvp = ump->um_devvp;
+                                       vref(devvp);
+                               }
+                       }
+               }
+       } else {
+               if (!update) {
+                       /* New mounts must have a filename for the device */
+                       return (EINVAL);
+               } else {
+                       /* Use the extant mount */
+                       ump = VFSTOUFS(mp);
+                       devvp = ump->um_devvp;
+                       vref(devvp);
+               }
+       }
+
+       /*
+        * If mount by non-root, then verify that user has necessary
+        * permissions on the device.
+        */
+       if (error == 0) {
+               accessmode = VREAD;
+               if (update ?
+                   (mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
+                   (mp->mnt_flag & MNT_RDONLY) == 0)
+                       accessmode |= VWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = genfs_can_mount(devvp, accessmode, l->l_cred);
+               VOP_UNLOCK(devvp);
+       }
+
+       if (error) {
+               vrele(devvp);
+               return (error);
+       }
+
+       if (!update) {
+               int flags;
+
+               if (mp->mnt_flag & MNT_RDONLY)
+                       flags = FREAD;
+               else
+                       flags = FREAD|FWRITE;
+               vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+               error = VOP_OPEN(devvp, flags, FSCRED);
+               VOP_UNLOCK(devvp);
+               if (error)
+                       goto fail;
+               error = lfs_mountfs(devvp, mp, l);              /* LFS */
+               if (error) {
+                       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+                       (void)VOP_CLOSE(devvp, flags, NOCRED);
+                       VOP_UNLOCK(devvp);
+                       goto fail;
+               }
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_lfs;
+       } else {
+               /*
+                * Update the mount.
+                */
+
+               /*
+                * The initial mount got a reference on this
+                * device, so drop the one obtained via
+                * namei(), above.
+                */
+               vrele(devvp);
+
+               ump = VFSTOUFS(mp);
+               fs = ump->um_lfs;
+               if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
+                       /*
+                        * Changing from read-only to read/write.
+                        * Note in the superblocks that we're writing.
+                        */
+                       fs->lfs_ronly = 0;
+                       if (fs->lfs_pflags & LFS_PF_CLEAN) {
+                               fs->lfs_pflags &= ~LFS_PF_CLEAN;
+                               lfs_writesuper(fs, fs->lfs_sboffs[0]);
+                               lfs_writesuper(fs, fs->lfs_sboffs[1]);
+                       }
+               }
+               if (args->fspec == NULL)
+                       return EINVAL;
+       }
+
+       error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+           UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+       if (error == 0)
+               (void)strncpy(fs->lfs_fsmnt, mp->mnt_stat.f_mntonname,
+                             sizeof(fs->lfs_fsmnt));
+       return error;
+
+fail:
+       vrele(devvp);
+       return (error);
+}
+
+/*
+ * Common code for mount and mountroot
+ * LFS specific
+ */
+int
+lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
+{
+       struct dlfs *tdfs, *dfs, *adfs;
+       struct lfs *fs;
+       struct ufsmount *ump;
+       struct vnode *vp;
+       struct buf *bp, *abp;
+       dev_t dev;
+       int error, i, ronly, fsbsize;
+       kauth_cred_t cred;
+       CLEANERINFO *cip;
+       SEGUSE *sup;
+       daddr_t sb_addr;
+
+       cred = l ? l->l_cred : NOCRED;
+
+       /*
+        * Flush out any old buffers remaining from a previous use.
+        */
+       vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
+       VOP_UNLOCK(devvp);
+       if (error)
+               return (error);
+
+       ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+       /* Don't free random space on error. */
+       bp = NULL;
+       abp = NULL;
+       ump = NULL;
+
+       sb_addr = LFS_LABELPAD / DEV_BSIZE;
+       while (1) {
+               /* Read in the superblock. */
+               error = bread(devvp, sb_addr, LFS_SBPAD, cred, 0, &bp);
+               if (error)
+                       goto out;
+               dfs = (struct dlfs *)bp->b_data;
+
+               /* Check the basics. */
+               if (dfs->dlfs_magic != LFS_MAGIC || dfs->dlfs_bsize > MAXBSIZE ||
+                   dfs->dlfs_version > LFS_VERSION ||
+                   dfs->dlfs_bsize < sizeof(struct dlfs)) {
+                       DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n"));
+                       error = EINVAL;         /* XXX needs translation */
+                       goto out;
+               }
+               if (dfs->dlfs_inodefmt > LFS_MAXINODEFMT) {
+                       DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n",
+                              dfs->dlfs_inodefmt));
+                       error = EINVAL;
+                       goto out;
+               }
+
+               if (dfs->dlfs_version == 1)
+                       fsbsize = DEV_BSIZE;
+               else {
+                       fsbsize = 1 << dfs->dlfs_ffshift;
+                       /*
+                        * Could be, if the frag size is large enough, that we
+                        * don't have the "real" primary superblock.  If that's
+                        * the case, get the real one, and try again.
+                        */
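+                       /*
+                        * Worked example (illustrative numbers, editorial
+                        * note): with dlfs_ffshift = 13 and DEV_BSHIFT = 9,
+                        * the shift below is 4, so the expected address is
+                        * dlfs_sboffs[0] << 4 in DEV_BSIZE sectors; if that
+                        * differs from the LFS_LABELPAD / DEV_BSIZE guess
+                        * we started with, we re-read from the computed
+                        * address and loop.
+                        */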
+                       if (sb_addr != (dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))) {
+                               DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr"
+                                     " 0x%llx is not right, trying 0x%llx\n",
+                                     (long long)sb_addr,
+                                     (long long)(dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT))));
+                               sb_addr = dfs->dlfs_sboffs[0] << (dfs->dlfs_ffshift - DEV_BSHIFT);
+                               brelse(bp, 0);
+                               continue;
+                       }
+               }
+               break;
+       }
+
+       /*
+        * Check the second superblock to see which is newer; then mount
+        * using the older of the two.  This is necessary to ensure that
+        * the filesystem is valid if it was not unmounted cleanly.
+        */
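+       /*
+        * Example (illustrative serial numbers, editorial note): on a v2
+        * filesystem, which compares dlfs_serial, if the primary
+        * superblock carries serial 42 and the alternate carries 41, the
+        * write of serial 42 may have been interrupted by a crash, so we
+        * mount from the older, known-complete 41.
+        */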
+
+       if (dfs->dlfs_sboffs[1] &&
+           dfs->dlfs_sboffs[1] - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize)
+       {
+               error = bread(devvp, dfs->dlfs_sboffs[1] * (fsbsize / DEV_BSIZE),
+                       LFS_SBPAD, cred, 0, &abp);
+               if (error)
+                       goto out;
+               adfs = (struct dlfs *)abp->b_data;
+
+               if (dfs->dlfs_version == 1) {
+                       /* 1s resolution comparison */
+                       if (adfs->dlfs_tstamp < dfs->dlfs_tstamp)
+                               tdfs = adfs;
+                       else
+                               tdfs = dfs;
+               } else {
+                       /* monotonic infinite-resolution comparison */
+                       if (adfs->dlfs_serial < dfs->dlfs_serial)
+                               tdfs = adfs;
+                       else
+                               tdfs = dfs;
+               }
+
+               /* Check the basics. */
+               if (tdfs->dlfs_magic != LFS_MAGIC ||
+                   tdfs->dlfs_bsize > MAXBSIZE ||
+                   tdfs->dlfs_version > LFS_VERSION ||
+                   tdfs->dlfs_bsize < sizeof(struct dlfs)) {
+                       DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
+                             " sanity failed\n"));
+                       error = EINVAL;         /* XXX needs translation */
+                       goto out;
+               }
+       } else {
+               DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock"
+                     " daddr=0x%x\n", dfs->dlfs_sboffs[1]));
+               error = EINVAL;
+               goto out;
+       }
+
+       /* Allocate the mount structure, copy the superblock into it. */
+       fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK | M_ZERO);
+       memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs));
+
+       /* Compatibility */
+       if (fs->lfs_version < 2) {
+               fs->lfs_sumsize = LFS_V1_SUMMARY_SIZE;
+               fs->lfs_ibsize = fs->lfs_bsize;
+               fs->lfs_start = fs->lfs_sboffs[0];
+               fs->lfs_tstamp = fs->lfs_otstamp;
+               fs->lfs_fsbtodb = 0;
+       }
+       if (fs->lfs_resvseg == 0)
+               fs->lfs_resvseg = MIN(fs->lfs_minfreeseg - 1,
+                       MAX(MIN_RESV_SEGS, fs->lfs_minfreeseg / 2 + 1));
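+       /*
+        * Worked example (illustrative value, editorial note): with
+        * lfs_minfreeseg = 10 this computes MIN(9, MAX(MIN_RESV_SEGS, 6)),
+        * so the reservation never exceeds minfreeseg - 1 no matter how
+        * large MIN_RESV_SEGS is.
+        */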
+
+       /*
+        * If we aren't going to be able to write meaningfully to this
+        * filesystem, and we were not mounted read-only, bomb out now.
+        */
+       if (fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
+               DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
+                     " we need BUFPAGES >= %lld\n",
+                     (long long)((bufmem_hiwater / bufmem_lowater) *
+                                 LFS_INVERSE_MAX_BYTES(
+                                         fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
+               free(fs, M_UFSMNT);
+               error = EFBIG; /* XXX needs translation */
+               goto out;
+       }
+
+       /* Before rolling forward, lock so vget will sleep for other procs */
+       if (l != NULL) {
+               fs->lfs_flags = LFS_NOTYET;
+               fs->lfs_rfpid = l->l_proc->p_pid;
+       }
+
+       ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
+       ump->um_lfs = fs;
+       ump->um_ops = &lfs_ufsops;
+       ump->um_fstype = UFS1;
+       if (sizeof(struct lfs) < LFS_SBPAD) {                   /* XXX why? */
+               brelse(bp, BC_INVAL);
+               brelse(abp, BC_INVAL);
+       } else {
+               brelse(bp, 0);
+               brelse(abp, 0);
+       }
+       bp = NULL;
+       abp = NULL;
+
+       /* Set up the I/O information */
+       fs->lfs_devbsize = DEV_BSIZE;
+       fs->lfs_iocount = 0;
+       fs->lfs_diropwait = 0;
+       fs->lfs_activesb = 0;
+       fs->lfs_uinodes = 0;
+       fs->lfs_ravail = 0;
+       fs->lfs_favail = 0;
+       fs->lfs_sbactive = 0;
+
+       /* Set up the ifile and lock aflags */
+       fs->lfs_doifile = 0;
+       fs->lfs_writer = 0;
+       fs->lfs_dirops = 0;
+       fs->lfs_nadirop = 0;
+       fs->lfs_seglock = 0;
+       fs->lfs_pdflush = 0;
+       fs->lfs_sleepers = 0;
+       fs->lfs_pages = 0;
+       rw_init(&fs->lfs_fraglock);
+       rw_init(&fs->lfs_iflock);
+       cv_init(&fs->lfs_stopcv, "lfsstop");
+
+       /* Set the file system readonly/modify bits. */
+       fs->lfs_ronly = ronly;
+       if (ronly == 0)
+               fs->lfs_fmod = 1;
+
+       /* Initialize the mount structure. */
+       dev = devvp->v_rdev;
+       mp->mnt_data = ump;
+       mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
+       mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
+       mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
+       mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
+       mp->mnt_stat.f_iosize = fs->lfs_bsize;
+       mp->mnt_flag |= MNT_LOCAL;
+       mp->mnt_fs_bshift = fs->lfs_bshift;
+       ump->um_flags = 0;
+       ump->um_mountp = mp;
+       ump->um_dev = dev;
+       ump->um_devvp = devvp;
+       ump->um_bptrtodb = fs->lfs_ffshift - DEV_BSHIFT;
+       ump->um_seqinc = fs->lfs_frag;
+       ump->um_nindir = fs->lfs_nindir;
+       ump->um_lognindir = ffs(fs->lfs_nindir) - 1;
+       for (i = 0; i < MAXQUOTAS; i++)
+               ump->um_quotas[i] = NULLVP;
+       ump->um_maxsymlinklen = fs->lfs_maxsymlinklen;
+       ump->um_dirblksiz = DIRBLKSIZ;
+       ump->um_maxfilesize = fs->lfs_maxfilesize;
+       if (ump->um_maxsymlinklen > 0)
+               mp->mnt_iflag |= IMNT_DTYPE;
+       devvp->v_specmountpoint = mp;
+
+       /* Set up reserved memory for pageout */
+       lfs_setup_resblks(fs);
+       /* Set up vdirop tailq */
+       TAILQ_INIT(&fs->lfs_dchainhd);
+       /* and paging tailq */
+       TAILQ_INIT(&fs->lfs_pchainhd);
+       /* and delayed segment accounting for truncation list */
+       LIST_INIT(&fs->lfs_segdhd);
+
+       /*
+        * We use the ifile vnode for almost every operation.  Instead of
+        * retrieving it from the hash table each time, we retrieve it here,
+        * artificially increment the reference count and keep a pointer
+        * to it in the incore copy of the superblock.
+        */
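+       /*
+        * Editorial note: the extra reference is the vref(vp) below; it
+        * keeps the Ifile vnode alive for the life of the mount and is
+        * finally released at unmount time, when lfs_unmount() calls
+        * vgone() on the Ifile vnode.
+        */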
+       if ((error = VFS_VGET(mp, LFS_IFILE_INUM, &vp)) != 0) {
+               DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
+               goto out;
+       }
+       fs->lfs_ivnode = vp;
+       vref(vp);
+
+       /* Set up inode bitmap and order free list */
+       lfs_order_freelist(fs);
+
+       /* Set up segment usage flags for the autocleaner. */
+       fs->lfs_nactive = 0;
+       fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *),
+                                               M_SEGMENT, M_WAITOK);
+       fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+                                                M_SEGMENT, M_WAITOK);
+       fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+                                                M_SEGMENT, M_WAITOK);
+       memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t));
+       for (i = 0; i < fs->lfs_nseg; i++) {
+               int changed;
+
+               LFS_SEGENTRY(sup, fs, i, bp);
+               changed = 0;
+               if (!ronly) {
+                       if (sup->su_nbytes == 0 &&
+                           !(sup->su_flags & SEGUSE_EMPTY)) {
+                               sup->su_flags |= SEGUSE_EMPTY;
+                               ++changed;
+                       } else if (sup->su_nbytes != 0 &&
+                                  (sup->su_flags & SEGUSE_EMPTY)) {
+                               sup->su_flags &= ~SEGUSE_EMPTY;
+                               ++changed;
+                       }
+                       if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
+                               sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
+                               ++changed;
+                       }
+               }
+               fs->lfs_suflags[0][i] = sup->su_flags;
+               if (changed)
+                       LFS_WRITESEGENTRY(sup, fs, i, bp);
+               else
+                       brelse(bp, 0);
+       }
+
+#ifdef LFS_KERNEL_RFW
+       lfs_roll_forward(fs, mp, l);
+#endif
+
+       /* If writing, sb is not clean; record in case of immediate crash */
+       if (!fs->lfs_ronly) {
+               fs->lfs_pflags &= ~LFS_PF_CLEAN;
+               lfs_writesuper(fs, fs->lfs_sboffs[0]);
+               lfs_writesuper(fs, fs->lfs_sboffs[1]);
+       }
+
+       /* Allow vget now that roll-forward is complete */
+       fs->lfs_flags &= ~(LFS_NOTYET);
+       wakeup(&fs->lfs_flags);
+
+       /*
+        * Initialize the ifile cleaner info with information from
+        * the superblock.
+        */
+       LFS_CLEANERINFO(cip, fs, bp);
+       cip->clean = fs->lfs_nclean;
+       cip->dirty = fs->lfs_nseg - fs->lfs_nclean;
+       cip->avail = fs->lfs_avail;
+       cip->bfree = fs->lfs_bfree;
+       (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+       /*
+        * Mark the current segment as ACTIVE, since we're going to
+        * be writing to it.
+        */
+       LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp);
+       sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
+       fs->lfs_nactive++;
+       LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp);  /* Ifile */
+
+       /* Now that roll-forward is done, unlock the Ifile */
+       vput(vp);
+
+       /* Start the pagedaemon-anticipating daemon */
+       if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL,
+           lfs_writerd, NULL, NULL, "lfs_writer") != 0)
+               panic("fork lfs_writer");
+       /*
+        * XXX: Get extra reference to LFS vfsops.  This prevents unload,
+        * but also prevents kernel panic due to text being unloaded
+        * from below lfs_writerd.  When lfs_writerd can exit, remove
+        * this!!!
+        */
+       vfs_getopsbyname(MOUNT_LFS);
+
+       printf("WARNING: the log-structured file system is experimental\n"
+           "WARNING: it may cause system crashes and/or corrupt data\n");
+
+       return (0);
+
+out:
+       if (bp)
+               brelse(bp, 0);
+       if (abp)
+               brelse(abp, 0);
+       if (ump) {
+               free(ump->um_lfs, M_UFSMNT);
+               free(ump, M_UFSMNT);
+               mp->mnt_data = NULL;
+       }
+
+       return (error);
+}
+
+/*
+ * unmount system call
+ */
+int
+lfs_unmount(struct mount *mp, int mntflags)
+{
+       struct lwp *l = curlwp;
+       struct ufsmount *ump;
+       struct lfs *fs;
+       int error, flags, ronly;
+       vnode_t *vp;
+
+       flags = 0;
+       if (mntflags & MNT_FORCE)
+               flags |= FORCECLOSE;
+
+       ump = VFSTOUFS(mp);
+       fs = ump->um_lfs;
+
+       /* Two checkpoints */
+       lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+       lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
+
+       /* wake up the cleaner so it can die */
+       lfs_wakeup_cleaner(fs);
+       mutex_enter(&lfs_lock);
+       while (fs->lfs_sleepers)
+               mtsleep(&fs->lfs_sleepers, PRIBIO + 1, "lfs_sleepers", 0,
+                       &lfs_lock);
+       mutex_exit(&lfs_lock);
+
+#ifdef QUOTA
+       if ((error = quota1_umount(mp, flags)) != 0)
+               return (error);
+#endif
+       if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
+               return (error);
+       if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
+               return (error);
+       vp = fs->lfs_ivnode;
+       mutex_enter(vp->v_interlock);
+       if (LIST_FIRST(&vp->v_dirtyblkhd))
+               panic("lfs_unmount: still dirty blocks on ifile vnode");
+       mutex_exit(vp->v_interlock);
+
+       /* Explicitly write the superblock, to update serial and pflags */
+       fs->lfs_pflags |= LFS_PF_CLEAN;
+       lfs_writesuper(fs, fs->lfs_sboffs[0]);
+       lfs_writesuper(fs, fs->lfs_sboffs[1]);
+       mutex_enter(&lfs_lock);
+       while (fs->lfs_iocount)
+               mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
+                       &lfs_lock);
+       mutex_exit(&lfs_lock);
+
+       /* Finish with the Ifile, now that we're done with it */
+       vgone(fs->lfs_ivnode);
+
+       ronly = !fs->lfs_ronly;
+       if (ump->um_devvp->v_type != VBAD)
+               ump->um_devvp->v_specmountpoint = NULL;
+       vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
+       error = VOP_CLOSE(ump->um_devvp,
+           ronly ? FREAD : FREAD|FWRITE, NOCRED);
+       vput(ump->um_devvp);
+
+       /* Complain about page leakage */
+       if (fs->lfs_pages > 0)
+               printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
+                       fs->lfs_pages, lfs_subsys_pages);
+
+       /* Free per-mount data structures */
+       free(fs->lfs_ino_bitmap, M_SEGMENT);
+       free(fs->lfs_suflags[0], M_SEGMENT);
+       free(fs->lfs_suflags[1], M_SEGMENT);
+       free(fs->lfs_suflags, M_SEGMENT);
+       lfs_free_resblks(fs);
+       cv_destroy(&fs->lfs_stopcv);
+       rw_destroy(&fs->lfs_fraglock);
+       rw_destroy(&fs->lfs_iflock);
+       free(fs, M_UFSMNT);
+       free(ump, M_UFSMNT);
+
+       mp->mnt_data = NULL;
+       mp->mnt_flag &= ~MNT_LOCAL;
+       return (error);
+}
+
+/*
+ * Get file system statistics.
+ *
+ * NB: We don't lock to access the superblock here, because it's not
+ * really that important if we get it wrong.
+ */
+int
+lfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+       struct lfs *fs;
+       struct ufsmount *ump;
+
+       ump = VFSTOUFS(mp);
+       fs = ump->um_lfs;
+       if (fs->lfs_magic != LFS_MAGIC)
+               panic("lfs_statvfs: magic");
+
+       sbp->f_bsize = fs->lfs_bsize;
+       sbp->f_frsize = fs->lfs_fsize;
+       sbp->f_iosize = fs->lfs_bsize;
+       sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;
+
+       sbp->f_bfree = LFS_EST_BFREE(fs);
+       KASSERT(sbp->f_bfree <= fs->lfs_dsize);
+#if 0
+       if (sbp->f_bfree < 0)
+               sbp->f_bfree = 0;
+#endif
+
+       sbp->f_bresvd = LFS_EST_RSVD(fs);
+       if (sbp->f_bfree > sbp->f_bresvd)
+               sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
+       else
+               sbp->f_bavail = 0;
+
+       sbp->f_files = fs->lfs_bfree / btofsb(fs, fs->lfs_ibsize) * INOPB(fs);
+       sbp->f_ffree = sbp->f_files - fs->lfs_nfiles;
+       sbp->f_favail = sbp->f_ffree;
+       sbp->f_fresvd = 0;
+       copy_statvfs_info(sbp, mp);
+       return (0);
+}
+
+/*
+ * Go through the disk queues to initiate sandbagged IO;
+ * go through the inodes to write those that have been modified;
+ * initiate the writing of the super block if it has been modified.
+ *
+ * Note: we are always called with the filesystem marked `MPBUSY'.
+ */
+int
+lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
+{
+       int error;
+       struct lfs *fs;
+
+       fs = VFSTOUFS(mp)->um_lfs;
+       if (fs->lfs_ronly)
+               return 0;
+
+       /* Snapshots should not hose the syncer */
+       /*
+        * XXX Sync can block here anyway, since we don't have a very
+        * XXX good idea of how much data is pending.  If it's more
+        * XXX than a segment and lfs_nextseg is close to the end of
+        * XXX the log, we'll likely block.
+        */
+       mutex_enter(&lfs_lock);
+       if (fs->lfs_nowrap && fs->lfs_nextseg < fs->lfs_curseg) {
+               mutex_exit(&lfs_lock);
+               return 0;
+       }
+       mutex_exit(&lfs_lock);
+
+       lfs_writer_enter(fs, "lfs_dirops");
+
+       /* All syncs must be checkpoints until roll-forward is implemented. */
+       DLOG((DLOG_FLUSH, "lfs_sync at 0x%x\n", fs->lfs_offset));
+       error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
+       lfs_writer_leave(fs);
+#ifdef QUOTA
+       qsync(mp);
+#endif
+       return (error);
+}
+
+/*
+ * Look up an LFS dinode number to find its incore vnode.  If not already
+ * in core, read it in from the specified device.  Return the inode locked.
+ * Detection and handling of mount points must be done by the calling routine.
+ */
+int
+lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
+{
+       struct lfs *fs;
+       struct ufs1_dinode *dip;
+       struct inode *ip;
+       struct buf *bp;
+       struct ifile *ifp;
+       struct vnode *vp;
+       struct ufsmount *ump;
+       daddr_t daddr;
+       dev_t dev;
+       int error, retries;
+       struct timespec ts;
+
+       memset(&ts, 0, sizeof ts);      /* XXX gcc */
+
+       ump = VFSTOUFS(mp);
+       dev = ump->um_dev;
+       fs = ump->um_lfs;
+
+       /*
+        * If the filesystem is not completely mounted yet, suspend
+        * any access requests (wait for roll-forward to complete).
+        */
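+       /*
+        * Editorial note: the one exception is the process performing
+        * roll-forward itself, recorded in lfs_rfpid at mount time; it
+        * must be able to vget inodes while LFS_NOTYET is still set.
+        */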
+       mutex_enter(&lfs_lock);
+       while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
+               mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
+                       &lfs_lock);
+       mutex_exit(&lfs_lock);
+
+retry:
+       if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+               return (0);
+
+       error = getnewvnode(VT_LFS, mp, lfs_vnodeop_p, NULL, &vp);
+       if (error) {
+               *vpp = NULL;
+               return (error);
+       }
+
+       mutex_enter(&ufs_hashlock);
+       if (ufs_ihashget(dev, ino, 0) != NULL) {
+               mutex_exit(&ufs_hashlock);
+               ungetnewvnode(vp);
+               goto retry;
+       }
+
+       /* Translate the inode number to a disk address. */
+       if (ino == LFS_IFILE_INUM)
+               daddr = fs->lfs_idaddr;
+       else {
+               /* XXX bounds-check this too */
+               LFS_IENTRY(ifp, fs, ino, bp);
+               daddr = ifp->if_daddr;
+               if (fs->lfs_version > 1) {
+                       ts.tv_sec = ifp->if_atime_sec;
+                       ts.tv_nsec = ifp->if_atime_nsec;
+               }
+
+               brelse(bp, 0);
+               if (daddr == LFS_UNUSED_DADDR) {
+                       *vpp = NULLVP;
+                       mutex_exit(&ufs_hashlock);
+                       ungetnewvnode(vp);
+                       return (ENOENT);
+               }
+       }
+
+       /* Allocate/init new vnode/inode. */
+       lfs_vcreate(mp, ino, vp);
+
+       /*
+        * Put it onto its hash chain and lock it so that other requests for
+        * this inode will block if they arrive while we are sleeping waiting
+        * for old data structures to be purged or for the contents of the
+        * disk portion of this inode to be read.
+        */
+       ip = VTOI(vp);
+       ufs_ihashins(ip);
+       mutex_exit(&ufs_hashlock);
+
+       /*
+        * XXX
+        * This may not need to be here, logically it should go down with
+        * the i_devvp initialization.
+        * Ask Kirk.
+        */
+       ip->i_lfs = ump->um_lfs;
+
+       /* Read in the disk contents for the inode, copy into the inode. */
+       retries = 0;
+    again:
+       error = bread(ump->um_devvp, fsbtodb(fs, daddr),
+               (fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize),
+               NOCRED, 0, &bp);
+       if (error) {
+               /*
+                * The inode does not contain anything useful, so it would
+                * be misleading to leave it on its hash chain. With mode
+                * still zero, it will be unlinked and returned to the free
+                * list by vput().
+                */
+               vput(vp);
+               brelse(bp, 0);
+               *vpp = NULL;
+               return (error);
+       }
+
+       dip = lfs_ifind(fs, ino, bp);
+       if (dip == NULL) {
+               /* Assume write has not completed yet; try again */
+               brelse(bp, BC_INVAL);
+               ++retries;
+               if (retries > LFS_IFIND_RETRIES) {
+#ifdef DEBUG
+                       /* If the seglock is held, look at the bpp to see
+                          what is there anyway */
+                       mutex_enter(&lfs_lock);
+                       if (fs->lfs_seglock > 0) {
+                               struct buf **bpp;
+                               struct ufs1_dinode *dp;
+                               int i;
+
+                               for (bpp = fs->lfs_sp->bpp;
+                                    bpp != fs->lfs_sp->cbpp; ++bpp) {
+                                       if ((*bpp)->b_vp == fs->lfs_ivnode &&
+                                           bpp != fs->lfs_sp->bpp) {
+                                               /* Inode block */
+                                               printf("lfs_vget: block 0x%" PRIx64 ": ",
+                                                      (*bpp)->b_blkno);
+                                               dp = (struct ufs1_dinode *)(*bpp)->b_data;
+                                               for (i = 0; i < INOPB(fs); i++)
+                                                       if (dp[i].di_u.inumber)
+                                                               printf("%d ", dp[i].di_u.inumber);
+                                               printf("\n");
+                                       }
+                               }
+                       }
+                       mutex_exit(&lfs_lock);
+#endif /* DEBUG */
+                       panic("lfs_vget: dinode not found");
+               }
+               mutex_enter(&lfs_lock);
+               if (fs->lfs_iocount) {
+                       DLOG((DLOG_VNODE, "lfs_vget: dinode %llu not found, "
+                             "retrying...\n", (unsigned long long)ino));
+                       (void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
+                                     "lfs ifind", 1, &lfs_lock);
+               } else
+                       retries = LFS_IFIND_RETRIES;
+               mutex_exit(&lfs_lock);
+               goto again;
+       }
+       *ip->i_din.ffs1_din = *dip;
+       brelse(bp, 0);
+
+       if (fs->lfs_version > 1) {
+               ip->i_ffs1_atime = ts.tv_sec;
+               ip->i_ffs1_atimensec = ts.tv_nsec;
+       }
+
+       lfs_vinit(mp, &vp);
+
+       *vpp = vp;
+
+       KASSERT(VOP_ISLOCKED(vp));
+
+       return (0);
+}
+
+/*
+ * File handle to vnode
+ */
+int
+lfs_fhtovp(struct mount *mp, struct fid *fhp, struct vnode **vpp)
+{
+       struct lfid lfh;
+       struct buf *bp;
+       IFILE *ifp;
+       int32_t daddr;
+       struct lfs *fs;
+       vnode_t *vp;
+
+       if (fhp->fid_len != sizeof(struct lfid))
+               return EINVAL;
+
+       memcpy(&lfh, fhp, sizeof(lfh));
+       if (lfh.lfid_ino < LFS_IFILE_INUM)
+               return ESTALE;
+
+       fs = VFSTOUFS(mp)->um_lfs;
+       if (lfh.lfid_ident != fs->lfs_ident)
+               return ESTALE;
+
+       if (lfh.lfid_ino >
+           ((VTOI(fs->lfs_ivnode)->i_ffs1_size >> fs->lfs_bshift) -
+            fs->lfs_cleansz - fs->lfs_segtabsz) * fs->lfs_ifpb)
+               return ESTALE;
+
+       mutex_enter(&ufs_ihash_lock);
+       vp = ufs_ihashlookup(VFSTOUFS(mp)->um_dev, lfh.lfid_ino);
+       mutex_exit(&ufs_ihash_lock);
+       if (vp == NULL) {
+               LFS_IENTRY(ifp, fs, lfh.lfid_ino, bp);
+               daddr = ifp->if_daddr;
+               brelse(bp, 0);
+               if (daddr == LFS_UNUSED_DADDR)
+                       return ESTALE;
+       }
+
+       return (ufs_fhtovp(mp, &lfh.lfid_ufid, vpp));
+}
+
+/*
+ * Vnode pointer to File handle
+ */
+/* ARGSUSED */
+int
+lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
+{
+       struct inode *ip;
+       struct lfid lfh;
+
+       if (*fh_size < sizeof(struct lfid)) {
+               *fh_size = sizeof(struct lfid);
+               return E2BIG;
+       }
+       *fh_size = sizeof(struct lfid);
+       ip = VTOI(vp);
+       memset(&lfh, 0, sizeof(lfh));
+       lfh.lfid_len = sizeof(struct lfid);
+       lfh.lfid_ino = ip->i_number;
+       lfh.lfid_gen = ip->i_gen;
+       lfh.lfid_ident = ip->i_lfs->lfs_ident;
+       memcpy(fhp, &lfh, sizeof(lfh));
+       return (0);
+}
+
+/*
+ * ufs_bmaparray callback function for writing.
+ *
+ * Since blocks will be written to the new segment anyway,
+ * we don't care about their current daddrs.
+ */
+static bool
+lfs_issequential_hole(const struct ufsmount *ump,
+    daddr_t daddr0, daddr_t daddr1)
+{
+       daddr0 = (daddr_t)((int32_t)daddr0); /* XXX ondisk32 */
+       daddr1 = (daddr_t)((int32_t)daddr1); /* XXX ondisk32 */
+
+       KASSERT(daddr0 == UNWRITTEN ||
+           (0 <= daddr0 && daddr0 <= LFS_MAX_DADDR));
+       KASSERT(daddr1 == UNWRITTEN ||
+           (0 <= daddr1 && daddr1 <= LFS_MAX_DADDR));
+
+       /* NOTE: all we want to know here is 'hole or not'. */
+       /* NOTE: UNASSIGNED is converted to 0 by ufs_bmaparray. */
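+       /*
+        * Decision table (editorial restatement of the tests below):
+        *
+        *   daddr0        daddr1        -> sequential?
+        *   resident      resident         yes (UNWRITTEN counts as resident)
+        *   hole (0)      hole (0)         yes
+        *   mixed                          no
+        */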
+
+       /*
+        * treat UNWRITTENs and all resident blocks as 'contiguous'
+        */
+       if (daddr0 != 0 && daddr1 != 0)
+               return true;
+
+       /*
+        * both are in hole?
+        */
+       if (daddr0 == 0 && daddr1 == 0)
+               return true; /* all holes are 'contiguous' for us. */
+
+       return false;
+}
+
+/*
+ * lfs_gop_write functions exactly like genfs_gop_write, except that
+ * (1) it requires the seglock to be held by its caller, and sp->fip
+ *     to be properly initialized (it will return without re-initializing
+ *     sp->fip, and without calling lfs_writeseg).
+ * (2) it uses the remaining space in the segment, rather than VOP_BMAP,
+ *     to determine how large a block it can write at once (though it does
+ *     still use VOP_BMAP to find holes in the file);
+ * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
+ *     (leaving lfs_writeseg to deal with the cluster blocks, so we might
+ *     now have clusters of clusters, ick.)
+ */
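+/*
+ * Editorial sketch of the expected calling context, as implied by (1)
+ * above (an assumption, not the only possible call path): a caller such
+ * as lfs_putpages() takes the segment lock and sets up sp->fip via
+ * lfs_acquire_finfo() before the pager reaches this routine.
+ */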
+static int
+lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
+    int flags)
+{
+       int i, error, run, haveeof = 0;
+       int fs_bshift;
+       vaddr_t kva;
+       off_t eof, offset, startoffset = 0;
+       size_t bytes, iobytes, skipbytes;
+       bool async = (flags & PGO_SYNCIO) == 0;
+       daddr_t lbn, blkno;
+       struct vm_page *pg;
+       struct buf *mbp, *bp;
+       struct vnode *devvp = VTOI(vp)->i_devvp;
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+       struct segment *sp = fs->lfs_sp;
+       UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
+
+       ASSERT_SEGLOCK(fs);
+
+       /* The Ifile lives in the buffer cache */
+       KASSERT(vp != fs->lfs_ivnode);
+
+       /*
+        * We don't want to fill the disk before the cleaner has a chance
+        * to make room for us.  If we're in danger of doing that, fail
+        * with EAGAIN.  The caller will have to notice this, unlock
+        * so the cleaner can run, relock and try again.
+        *
+        * We must write everything, however, if our vnode is being
+        * reclaimed.
+        */
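+       /*
+        * Editorial note: both this check and the filter checks below
+        * bail out through the tryagain label at the bottom, which
+        * unbusies the pages and returns EAGAIN to the caller.
+        */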
+       if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp)
+               goto tryagain;
+
+       /*
+        * Sometimes things slip past the filters in lfs_putpages,
+        * and the pagedaemon tries to write pages---problem is
+        * that the pagedaemon never acquires the segment lock.
+        *
+        * Alternatively, pages that were clean when we called
+        * genfs_putpages may have become dirty in the meantime.  In this
+        * case the segment header is not properly set up for blocks
+        * to be added to it.
+        *
+        * Unbusy and unclean the pages, and put them on the ACTIVE
+        * queue under the hypothesis that they couldn't have got here
+        * unless they were modified *quite* recently.
+        *
+        * XXXUBC that last statement is an oversimplification of course.
+        */
+       if (!LFS_SEGLOCK_HELD(fs) ||
+           (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) ||
+           (pgs[0]->offset & fs->lfs_bmask) != 0) {
+               goto tryagain;
+       }
+
+       UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
+           vp, pgs, npages, flags);
+
+       GOP_SIZE(vp, vp->v_size, &eof, 0);
+       haveeof = 1;
+
+       if (vp->v_type == VREG)
+               fs_bshift = vp->v_mount->mnt_fs_bshift;
+       else
+               fs_bshift = DEV_BSHIFT;
+       error = 0;
+       pg = pgs[0];
+       startoffset = pg->offset;
+       KASSERT(eof >= 0);
+
+       if (startoffset >= eof) {
+               goto tryagain;
+       } else
+               bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
+       skipbytes = 0;
+
+       KASSERT(bytes != 0);
+
+       /* Swap PG_DELWRI for PG_PAGEOUT */
+       for (i = 0; i < npages; i++) {
+               if (pgs[i]->flags & PG_DELWRI) {
+                       KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
+                       pgs[i]->flags &= ~PG_DELWRI;
+                       pgs[i]->flags |= PG_PAGEOUT;
+                       uvm_pageout_start(1);
+                       mutex_enter(&uvm_pageqlock);
+                       uvm_pageunwire(pgs[i]);
+                       mutex_exit(&uvm_pageqlock);
+               }
+       }
+
+       /*
+        * Check to make sure we're starting on a block boundary.
+        * We'll check later to make sure we always write entire
+        * blocks (or fragments).
+        */
+       if (startoffset & fs->lfs_bmask)
+               printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n",
+                      startoffset, fs->lfs_bmask,
+                      startoffset & fs->lfs_bmask);
+       KASSERT((startoffset & fs->lfs_bmask) == 0);
+       if (bytes & fs->lfs_ffmask) {
+               printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
+               panic("lfs_gop_write: non-integer blocks");
+       }
+
+       /*
+        * We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
+        * If we would, write what we have and try again.  If we don't
+        * have anything to write, we'll have to sleep.
+        */
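+       /*
+        * Editorial restatement: the mapin below blocks only when no
+        * FINFO has been accumulated yet (nothing to write); otherwise
+        * it is non-blocking, and on failure we write out what the
+        * segment already holds to release pager_map, then retry with
+        * UVMPAGER_MAPIN_WAITOK.
+        */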
+       if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
+                                     (((SEGSUM *)(sp->segsum))->ss_nfinfo < 1 ?
+                                      UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
+               DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
+#if 0
+                     " with nfinfo=%d at offset 0x%x\n",
+                     (int)((SEGSUM *)(sp->segsum))->ss_nfinfo,
+                     (unsigned)fs->lfs_offset));
+#endif
+               lfs_updatemeta(sp);
+               lfs_release_finfo(fs);
+               (void) lfs_writeseg(fs, sp);
+
+               lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+
+               /*
+                * Having given up all of the pager_map we were holding,
+                * we can now wait for aiodoned to reclaim it for us
+                * without fear of deadlock.
+                */
+               kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
+                                    UVMPAGER_MAPIN_WAITOK);
+       }
+
+       mbp = getiobuf(NULL, true);
+       UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
+           vp, mbp, vp->v_numoutput, bytes);
+       mbp->b_bufsize = npages << PAGE_SHIFT;
+       mbp->b_data = (void *)kva;
+       mbp->b_resid = mbp->b_bcount = bytes;
+       mbp->b_cflags = BC_BUSY|BC_AGE;
+       mbp->b_iodone = uvm_aio_biodone;
+
+       bp = NULL;
+       for (offset = startoffset;
+           bytes > 0;
+           offset += iobytes, bytes -= iobytes) {
+               lbn = offset >> fs_bshift;
+               error = ufs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
+                   lfs_issequential_hole);
+               if (error) {
+                       UVMHIST_LOG(ubchist, "ufs_bmaparray() -> %d",
+                           error,0,0,0);
+                       skipbytes += bytes;
+                       bytes = 0;
+                       break;
+               }
+
+               iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
+                   bytes);
+               if (blkno == (daddr_t)-1) {
+                       skipbytes += iobytes;
+                       continue;
+               }
+
+               /*
+                * Discover how much we can really pack into this buffer.
+                */
+               /* If no room in the current segment, finish it up */
+               if (sp->sum_bytes_left < sizeof(int32_t) ||
+                   sp->seg_bytes_left < (1 << fs->lfs_bshift)) {
+                       int vers;
+
+                       lfs_updatemeta(sp);
+                       vers = sp->fip->fi_version;
+                       lfs_release_finfo(fs);
+                       (void) lfs_writeseg(fs, sp);
+
+                       lfs_acquire_finfo(fs, ip->i_number, vers);
+               }
+               /* Check both for space in segment and space in segsum */
+               iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
+                                       << fs_bshift);
+               iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
+                                      << fs_bshift);
+               KASSERT(iobytes > 0);
+
+               /* if it's really one i/o, don't make a second buf */
+               if (offset == startoffset && iobytes == bytes) {
+                       bp = mbp;
+                       /* 
+                        * All the LFS output is done by the segwriter.  It
+                        * will increment numoutput by one for all the bufs it
+                        * receives.  However, this buffer needs one extra to
+                        * account for aiodone.
+                        */
+                       mutex_enter(vp->v_interlock);
+                       vp->v_numoutput++;
+                       mutex_exit(vp->v_interlock);
+               } else {
+                       bp = getiobuf(NULL, true);
+                       UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
+                           vp, bp, vp->v_numoutput, 0);
+                       nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
+                       /*
+                        * LFS doesn't like async I/O here; it dies with
+                        * an assert in lfs_bwrite().  Is that assert
+                        * valid?  I retained the non-async behaviour when
+                        * I converted this to use nestiobuf. --pooka
+                        */
+                       bp->b_flags &= ~B_ASYNC;
+               }
+
+               /* XXX This is silly ... is this necessary? */
+               mutex_enter(&bufcache_lock);
+               mutex_enter(vp->v_interlock);
+               bgetvp(vp, bp);
+               mutex_exit(vp->v_interlock);
+               mutex_exit(&bufcache_lock);
+
+               bp->b_lblkno = lblkno(fs, offset);
+               bp->b_private = mbp;
+               if (devvp->v_type == VBLK) {
+                       bp->b_dev = devvp->v_rdev;
+               }
+               VOP_BWRITE(bp->b_vp, bp);
+               while (lfs_gatherblock(sp, bp, NULL))
+                       continue;
+       }
+
+       nestiobuf_done(mbp, skipbytes, error);
+       if (skipbytes) {
+               UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
+       }
+       UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
+
+       if (!async) {
+               /* Start a segment write. */
+               UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
+               mutex_enter(&lfs_lock);
+               lfs_flush(fs, 0, 1);
+               mutex_exit(&lfs_lock);
+       }
+       return (0);
+
+    tryagain:
+       /*
+        * We can't write the pages, for whatever reason.
+        * Clean up after ourselves, and make the caller try again.
+        */
+       mutex_enter(vp->v_interlock);
+
+       /* Tell why we're here, if we know */
+       if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
+               DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n"));
+       } else if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
+               DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n"));
+       } else if (haveeof && startoffset >= eof) {
+               DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
+                     " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
+                     pgs[0]->offset, eof, npages));
+       } else if (LFS_STARVED_FOR_SEGS(fs)) {
+               DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
+       } else {
+               DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
+       }
+
+       mutex_enter(&uvm_pageqlock);
+       for (i = 0; i < npages; i++) {
+               pg = pgs[i];
+
+               if (pg->flags & PG_PAGEOUT)
+                       uvm_pageout_done(1);
+               if (pg->flags & PG_DELWRI) {
+                       uvm_pageunwire(pg);
+               }
+               uvm_pageactivate(pg);
+               pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
+               DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
+                       vp, pg->offset));
+               DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
+               DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
+               DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
+               DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
+               DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
+                     pg->wire_count));
+               DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
+                     pg->loan_count));
+       }
+       /* uvm_page_unbusy takes care of PG_BUSY, PG_WANTED */
+       uvm_page_unbusy(pgs, npages);
+       mutex_exit(&uvm_pageqlock);
+       mutex_exit(vp->v_interlock);
+       return EAGAIN;
+}
+
+/*
+ * finish vnode/inode initialization.
+ * used by lfs_vget and lfs_fastvget.
+ */
+void
+lfs_vinit(struct mount *mp, struct vnode **vpp)
+{
+       struct vnode *vp = *vpp;
+       struct inode *ip = VTOI(vp);
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct lfs *fs = ump->um_lfs;
+       int i;
+
+       ip->i_mode = ip->i_ffs1_mode;
+       ip->i_nlink = ip->i_ffs1_nlink;
+       ip->i_lfs_osize = ip->i_size = ip->i_ffs1_size;
+       ip->i_flags = ip->i_ffs1_flags;
+       ip->i_gen = ip->i_ffs1_gen;
+       ip->i_uid = ip->i_ffs1_uid;
+       ip->i_gid = ip->i_ffs1_gid;
+
+       ip->i_lfs_effnblks = ip->i_ffs1_blocks;
+       ip->i_lfs_odnlink = ip->i_ffs1_nlink;
+
+       /*
+        * Initialize the vnode from the inode, check for aliases.  In all
+        * cases re-init ip, the underlying vnode/inode may have changed.
+        */
+       ufs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
+       ip = VTOI(vp);
+
+       memset(ip->i_lfs_fragsize, 0, NDADDR * sizeof(*ip->i_lfs_fragsize));
+       if (vp->v_type != VLNK || ip->i_size >= ip->i_ump->um_maxsymlinklen) {
+#ifdef DEBUG
+               for (i = (ip->i_size + fs->lfs_bsize - 1) >> fs->lfs_bshift;
+                   i < NDADDR; i++) {
+                       if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
+                           i == 0)
+                               continue;
+                       if (ip->i_ffs1_db[i] != 0) {
+inconsistent:
+                               lfs_dump_dinode(ip->i_din.ffs1_din);
+                               panic("inconsistent inode");
+                       }
+               }
+               for ( ; i < NDADDR + NIADDR; i++) {
+                       if (ip->i_ffs1_ib[i - NDADDR] != 0) {
+                               goto inconsistent;
+                       }
+               }
+#endif /* DEBUG */
+               for (i = 0; i < NDADDR; i++)
+                       if (ip->i_ffs1_db[i] != 0)
+                               ip->i_lfs_fragsize[i] = blksize(fs, ip, i);
+       }
+
+#ifdef DIAGNOSTIC
+       if (vp->v_type == VNON) {
+# ifdef DEBUG
+               lfs_dump_dinode(ip->i_din.ffs1_din);
+# endif
+               panic("lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
+                     (unsigned long long)ip->i_number,
+                     (ip->i_mode & IFMT) >> 12);
+       }
+#endif /* DIAGNOSTIC */
+
+       /*
+        * Finish inode initialization now that aliasing has been resolved.
+        */
+
+       ip->i_devvp = ump->um_devvp;
+       vref(ip->i_devvp);
+       genfs_node_init(vp, &lfs_genfsops);
+       uvm_vnp_setsize(vp, ip->i_size);
+
+       /* Initialize hiblk from file size */
+       ip->i_lfs_hiblk = lblkno(ip->i_lfs, ip->i_size + ip->i_lfs->lfs_bsize - 1) - 1;
+
+       *vpp = vp;
+}
+
+/*
+ * Resize the filesystem to contain the specified number of segments.
+ */
+int
+lfs_resize_fs(struct lfs *fs, int newnsegs)
+{
+       SEGUSE *sup;
+       struct buf *bp, *obp;
+       daddr_t olast, nlast, ilast, noff, start, end;
+       struct vnode *ivp;
+       struct inode *ip;
+       int error, badnews, inc, oldnsegs;
+       int sbbytes, csbbytes, gain, cgain;
+       int i;
+
+       /* Only support v2 and up */
+       if (fs->lfs_version < 2)
+               return EOPNOTSUPP;
+
+       /* If we're doing nothing, do it fast */
+       oldnsegs = fs->lfs_nseg;
+       if (newnsegs == oldnsegs)
+               return 0;
+
+       /* We always have to have two superblocks */
+       if (newnsegs <= dtosn(fs, fs->lfs_sboffs[1]))
+               return EFBIG;
+
+       ivp = fs->lfs_ivnode;
+       ip = VTOI(ivp);
+       error = 0;
+
+       /* Take the segment lock so no one else calls lfs_newseg() */
+       lfs_seglock(fs, SEGM_PROT);
+
+       /*
+        * Make sure the segments we're going to be losing, if any,
+        * are in fact empty.  We hold the seglock, so their status
+        * cannot change underneath us.  Count the superblocks we lose,
+        * while we're at it.
+        */
+       sbbytes = csbbytes = 0;
+       cgain = 0;
+       for (i = newnsegs; i < oldnsegs; i++) {
+               LFS_SEGENTRY(sup, fs, i, bp);
+               badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL);
+               if (sup->su_flags & SEGUSE_SUPERBLOCK)
+                       sbbytes += LFS_SBPAD;
+               if (!(sup->su_flags & SEGUSE_DIRTY)) {
+                       ++cgain;
+                       if (sup->su_flags & SEGUSE_SUPERBLOCK)
+                               csbbytes += LFS_SBPAD;
+               }
+               brelse(bp, 0);
+               if (badnews) {
+                       error = EBUSY;
+                       goto out;
+               }
+       }
+
+       /* Note old and new segment table endpoints, and old ifile size */
+       olast = fs->lfs_cleansz + fs->lfs_segtabsz;
+       nlast = howmany(newnsegs, fs->lfs_sepb) + fs->lfs_cleansz;
+       ilast = ivp->v_size >> fs->lfs_bshift;
+       noff = nlast - olast;
+
+       /*
+        * Make sure no one can use the Ifile while we change it around.
+        * Even after taking the iflock we need to make sure no one is
+        * still holding Ifile buffers, so we read each one to drain them.
+        * (XXX this could be done better.)
+        */
+       rw_enter(&fs->lfs_iflock, RW_WRITER);
+       vn_lock(ivp, LK_EXCLUSIVE | LK_RETRY);
+       for (i = 0; i < ilast; i++) {
+               bread(ivp, i, fs->lfs_bsize, NOCRED, 0, &bp);
+               brelse(bp, 0);
+       }
+
+       /* Allocate new Ifile blocks */
+       for (i = ilast; i < ilast + noff; i++) {
+               if (lfs_balloc(ivp, i * fs->lfs_bsize, fs->lfs_bsize, NOCRED, 0,
+                              &bp) != 0)
+                       panic("balloc extending ifile");
+               memset(bp->b_data, 0, fs->lfs_bsize);
+               VOP_BWRITE(bp->b_vp, bp);
+       }
+
+       /* Register new ifile size */
+       ip->i_size += noff * fs->lfs_bsize; 
+       ip->i_ffs1_size = ip->i_size;
+       uvm_vnp_setsize(ivp, ip->i_size);
+
+       /* Copy the inode table to its new position */
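+       /*
+        * As with memmove(), the copy direction depends on which way
+        * the regions overlap: walk upward when shrinking (noff < 0)
+        * and downward when growing, so each source block is read
+        * before it can be overwritten.
+        */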
+       if (noff != 0) {
+               if (noff < 0) {
+                       start = nlast;
+                       end = ilast + noff;
+                       inc = 1;
+               } else {
+                       start = ilast + noff - 1;
+                       end = nlast - 1;
+                       inc = -1;
+               }
+               for (i = start; i != end; i += inc) {
+                       if (bread(ivp, i, fs->lfs_bsize, NOCRED,
+                           B_MODIFY, &bp) != 0)
+                               panic("resize: bread dst blk failed");
+                       if (bread(ivp, i - noff, fs->lfs_bsize,
+                           NOCRED, 0, &obp))
+                               panic("resize: bread src blk failed");
+                       memcpy(bp->b_data, obp->b_data, fs->lfs_bsize);
+                       VOP_BWRITE(bp->b_vp, bp);
+                       brelse(obp, 0);
+               }
+       }
+
+       /* If we are expanding, write the new empty SEGUSE entries */
+       if (newnsegs > oldnsegs) {
+               for (i = oldnsegs; i < newnsegs; i++) {
+                       if ((error = bread(ivp, i / fs->lfs_sepb +
+                                          fs->lfs_cleansz, fs->lfs_bsize,
+                                          NOCRED, B_MODIFY, &bp)) != 0)
+                               panic("lfs: ifile read: %d", error);
+                       while ((i + 1) % fs->lfs_sepb && i < newnsegs) {
+                               sup = &((SEGUSE *)bp->b_data)[i % fs->lfs_sepb];
+                               memset(sup, 0, sizeof(*sup));
+                               i++;
+                       }
+                       VOP_BWRITE(bp->b_vp, bp);
+               }
+       }
+
+       /* Zero out unused superblock offsets */
+       for (i = 2; i < LFS_MAXNUMSB; i++)
+               if (dtosn(fs, fs->lfs_sboffs[i]) >= newnsegs)
+                       fs->lfs_sboffs[i] = 0x0;
+
+       /*
+        * Correct superblock entries that depend on fs size.
+        * The computations of these are as follows:
+        *
+        * size  = segtod(fs, nseg)
+        * dsize = segtod(fs, nseg - minfreeseg) - btofsb(fs, #super * LFS_SBPAD)
+        * bfree = dsize - btofsb(fs, bsize * nseg / 2) - blocks_actually_used
+        * avail = segtod(fs, nclean) - btofsb(fs, #clean_super * LFS_SBPAD)
+        *         + (segtod(fs, 1) - (offset - curseg))
+        *         - segtod(fs, minfreeseg - (minfreeseg / 2))
+        *
+        * XXX - we should probably adjust minfreeseg as well.
+        */
+       gain = (newnsegs - oldnsegs);
+       fs->lfs_nseg = newnsegs;
+       fs->lfs_segtabsz = nlast - fs->lfs_cleansz;
+       fs->lfs_size += gain * btofsb(fs, fs->lfs_ssize);
+       fs->lfs_dsize += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes);
+       fs->lfs_bfree += gain * btofsb(fs, fs->lfs_ssize) - btofsb(fs, sbbytes)
+                      - gain * btofsb(fs, fs->lfs_bsize / 2);
+       if (gain > 0) {
+               fs->lfs_nclean += gain;
+               fs->lfs_avail += gain * btofsb(fs, fs->lfs_ssize);
+       } else {
+               fs->lfs_nclean -= cgain;
+               fs->lfs_avail -= cgain * btofsb(fs, fs->lfs_ssize) -
+                                btofsb(fs, csbbytes);
+       }
+
+       /* Resize segment flag cache */
+       fs->lfs_suflags[0] = (u_int32_t *)realloc(fs->lfs_suflags[0],
+                                                 fs->lfs_nseg * sizeof(u_int32_t),
+                                                 M_SEGMENT, M_WAITOK);
+       fs->lfs_suflags[1] = (u_int32_t *)realloc(fs->lfs_suflags[1],
+                                                 fs->lfs_nseg * sizeof(u_int32_t),
+                                                 M_SEGMENT, M_WAITOK);
+       for (i = oldnsegs; i < newnsegs; i++)
+               fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0;
+
+       /* Truncate Ifile if necessary */
+       if (noff < 0)
+               lfs_truncate(ivp, ivp->v_size + (noff << fs->lfs_bshift), 0,
+                   NOCRED);
+
+       /* Update cleaner info so the cleaner can die */
+       bread(ivp, 0, fs->lfs_bsize, NOCRED, B_MODIFY, &bp);
+       ((CLEANERINFO *)bp->b_data)->clean = fs->lfs_nclean;
+       ((CLEANERINFO *)bp->b_data)->dirty = fs->lfs_nseg - fs->lfs_nclean;
+       VOP_BWRITE(bp->b_vp, bp);
+
+       /* Let Ifile accesses proceed */
+       VOP_UNLOCK(ivp);
+       rw_exit(&fs->lfs_iflock);
+
+    out:
+       lfs_segunlock(fs);
+       return error;
+}
diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c
new file mode 100644 (file)
index 0000000..f30a5d2
--- /dev/null
@@ -0,0 +1,2478 @@
+/*     $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $      */
+
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Konrad E. Schroder <perseant@hhhh.org>.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * Copyright (c) 1986, 1989, 1991, 1993, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)lfs_vnops.c 8.13 (Berkeley) 6/10/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_compat_netbsd.h"
+#include "opt_uvm_page_trkown.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/pool.h>
+#include <sys/signalvar.h>
+#include <sys/kauth.h>
+#include <sys/syslog.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pmap.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+
+#include <ufs/lfs/lfs.h>
+#include <ufs/lfs/lfs_extern.h>
+
+extern pid_t lfs_writer_daemon;
+int lfs_ignore_lazy_sync = 1;
+
+/* Global vfs data structures for lfs. */
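+/*
+ * LFS supplies its own implementation only for the operations that
+ * must be bracketed as directory ops or that touch segment accounting
+ * (create, remove, rename, fsync, strategy, bwrite, ...); the rest
+ * fall through to the generic ufs_ and genfs_ routines.
+ */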
+int (**lfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, ufs_lookup },               /* lookup */
+       { &vop_create_desc, lfs_create },               /* create */
+       { &vop_whiteout_desc, ufs_whiteout },           /* whiteout */
+       { &vop_mknod_desc, lfs_mknod },                 /* mknod */
+       { &vop_open_desc, ufs_open },                   /* open */
+       { &vop_close_desc, lfs_close },                 /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, lfs_getattr },             /* getattr */
+       { &vop_setattr_desc, lfs_setattr },             /* setattr */
+       { &vop_read_desc, lfs_read },                   /* read */
+       { &vop_write_desc, lfs_write },                 /* write */
+       { &vop_ioctl_desc, ufs_ioctl },                 /* ioctl */
+       { &vop_fcntl_desc, lfs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, ufs_poll },                   /* poll */
+       { &vop_kqfilter_desc, genfs_kqfilter },         /* kqfilter */
+       { &vop_revoke_desc, ufs_revoke },               /* revoke */
+       { &vop_mmap_desc, lfs_mmap },                   /* mmap */
+       { &vop_fsync_desc, lfs_fsync },                 /* fsync */
+       { &vop_seek_desc, ufs_seek },                   /* seek */
+       { &vop_remove_desc, lfs_remove },               /* remove */
+       { &vop_link_desc, lfs_link },                   /* link */
+       { &vop_rename_desc, lfs_rename },               /* rename */
+       { &vop_mkdir_desc, lfs_mkdir },                 /* mkdir */
+       { &vop_rmdir_desc, lfs_rmdir },                 /* rmdir */
+       { &vop_symlink_desc, lfs_symlink },             /* symlink */
+       { &vop_readdir_desc, ufs_readdir },             /* readdir */
+       { &vop_readlink_desc, ufs_readlink },           /* readlink */
+       { &vop_abortop_desc, ufs_abortop },             /* abortop */
+       { &vop_inactive_desc, lfs_inactive },           /* inactive */
+       { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, ufs_bmap },                   /* bmap */
+       { &vop_strategy_desc, lfs_strategy },           /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, ufs_pathconf },           /* pathconf */
+       { &vop_advlock_desc, ufs_advlock },             /* advlock */
+       { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
+       { &vop_getpages_desc, lfs_getpages },           /* getpages */
+       { &vop_putpages_desc, lfs_putpages },           /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_vnodeop_opv_desc =
+       { &lfs_vnodeop_p, lfs_vnodeop_entries };
+
+int (**lfs_specop_p)(void *);
+const struct vnodeopv_entry_desc lfs_specop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, spec_lookup },              /* lookup */
+       { &vop_create_desc, spec_create },              /* create */
+       { &vop_mknod_desc, spec_mknod },                /* mknod */
+       { &vop_open_desc, spec_open },                  /* open */
+       { &vop_close_desc, lfsspec_close },             /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, lfs_getattr },             /* getattr */
+       { &vop_setattr_desc, lfs_setattr },             /* setattr */
+       { &vop_read_desc, ufsspec_read },               /* read */
+       { &vop_write_desc, ufsspec_write },             /* write */
+       { &vop_ioctl_desc, spec_ioctl },                /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, spec_poll },                  /* poll */
+       { &vop_kqfilter_desc, spec_kqfilter },          /* kqfilter */
+       { &vop_revoke_desc, spec_revoke },              /* revoke */
+       { &vop_mmap_desc, spec_mmap },                  /* mmap */
+       { &vop_fsync_desc, spec_fsync },                /* fsync */
+       { &vop_seek_desc, spec_seek },                  /* seek */
+       { &vop_remove_desc, spec_remove },              /* remove */
+       { &vop_link_desc, spec_link },                  /* link */
+       { &vop_rename_desc, spec_rename },              /* rename */
+       { &vop_mkdir_desc, spec_mkdir },                /* mkdir */
+       { &vop_rmdir_desc, spec_rmdir },                /* rmdir */
+       { &vop_symlink_desc, spec_symlink },            /* symlink */
+       { &vop_readdir_desc, spec_readdir },            /* readdir */
+       { &vop_readlink_desc, spec_readlink },          /* readlink */
+       { &vop_abortop_desc, spec_abortop },            /* abortop */
+       { &vop_inactive_desc, lfs_inactive },           /* inactive */
+       { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, spec_bmap },                  /* bmap */
+       { &vop_strategy_desc, spec_strategy },          /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, spec_pathconf },          /* pathconf */
+       { &vop_advlock_desc, spec_advlock },            /* advlock */
+       { &vop_bwrite_desc, vn_bwrite },                /* bwrite */
+       { &vop_getpages_desc, spec_getpages },          /* getpages */
+       { &vop_putpages_desc, spec_putpages },          /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_specop_opv_desc =
+       { &lfs_specop_p, lfs_specop_entries };
+
+int (**lfs_fifoop_p)(void *);
+const struct vnodeopv_entry_desc lfs_fifoop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, vn_fifo_bypass },           /* lookup */
+       { &vop_create_desc, vn_fifo_bypass },           /* create */
+       { &vop_mknod_desc, vn_fifo_bypass },            /* mknod */
+       { &vop_open_desc, vn_fifo_bypass },             /* open */
+       { &vop_close_desc, lfsfifo_close },             /* close */
+       { &vop_access_desc, ufs_access },               /* access */
+       { &vop_getattr_desc, lfs_getattr },             /* getattr */
+       { &vop_setattr_desc, lfs_setattr },             /* setattr */
+       { &vop_read_desc, ufsfifo_read },               /* read */
+       { &vop_write_desc, ufsfifo_write },             /* write */
+       { &vop_ioctl_desc, vn_fifo_bypass },            /* ioctl */
+       { &vop_fcntl_desc, ufs_fcntl },                 /* fcntl */
+       { &vop_poll_desc, vn_fifo_bypass },             /* poll */
+       { &vop_kqfilter_desc, vn_fifo_bypass },         /* kqfilter */
+       { &vop_revoke_desc, vn_fifo_bypass },           /* revoke */
+       { &vop_mmap_desc, vn_fifo_bypass },             /* mmap */
+       { &vop_fsync_desc, vn_fifo_bypass },            /* fsync */
+       { &vop_seek_desc, vn_fifo_bypass },             /* seek */
+       { &vop_remove_desc, vn_fifo_bypass },           /* remove */
+       { &vop_link_desc, vn_fifo_bypass },             /* link */
+       { &vop_rename_desc, vn_fifo_bypass },           /* rename */
+       { &vop_mkdir_desc, vn_fifo_bypass },            /* mkdir */
+       { &vop_rmdir_desc, vn_fifo_bypass },            /* rmdir */
+       { &vop_symlink_desc, vn_fifo_bypass },          /* symlink */
+       { &vop_readdir_desc, vn_fifo_bypass },          /* readdir */
+       { &vop_readlink_desc, vn_fifo_bypass },         /* readlink */
+       { &vop_abortop_desc, vn_fifo_bypass },          /* abortop */
+       { &vop_inactive_desc, lfs_inactive },           /* inactive */
+       { &vop_reclaim_desc, lfs_reclaim },             /* reclaim */
+       { &vop_lock_desc, ufs_lock },                   /* lock */
+       { &vop_unlock_desc, ufs_unlock },               /* unlock */
+       { &vop_bmap_desc, vn_fifo_bypass },             /* bmap */
+       { &vop_strategy_desc, vn_fifo_bypass },         /* strategy */
+       { &vop_print_desc, ufs_print },                 /* print */
+       { &vop_islocked_desc, ufs_islocked },           /* islocked */
+       { &vop_pathconf_desc, vn_fifo_bypass },         /* pathconf */
+       { &vop_advlock_desc, vn_fifo_bypass },          /* advlock */
+       { &vop_bwrite_desc, lfs_bwrite },               /* bwrite */
+       { &vop_putpages_desc, vn_fifo_bypass },         /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc lfs_fifoop_opv_desc =
+       { &lfs_fifoop_p, lfs_fifoop_entries };
+
+static int check_dirty(struct lfs *, struct vnode *, off_t, off_t, off_t, int, int, struct vm_page **);
+
+#define        LFS_READWRITE
+#include <ufs/ufs/ufs_readwrite.c>
+#undef LFS_READWRITE
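+/*
+ * The include above instantiates lfs_read() and lfs_write() from the
+ * shared UFS read/write template; defining LFS_READWRITE selects the
+ * LFS-specific variants within that file.
+ */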
+
+/*
+ * Synch an open file.
+ */
+/* ARGSUSED */
+int
+lfs_fsync(void *v)
+{
+       struct vop_fsync_args /* {
+               struct vnode *a_vp;
+               kauth_cred_t a_cred;
+               int a_flags;
+               off_t offlo;
+               off_t offhi;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       int error, wait;
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+
+       /* If we're mounted read-only, don't try to sync. */
+       if (fs->lfs_ronly)
+               return 0;
+
+       /* If a removed vnode is being cleaned, no need to sync here. */
+       if ((ap->a_flags & FSYNC_RECLAIM) != 0 && ip->i_mode == 0)
+               return 0;
+
+       /*
+        * Trickle sync simply adds this vnode to the pager list, as if
+        * the pagedaemon had requested a pageout.
+        */
+       if (ap->a_flags & FSYNC_LAZY) {
+               if (lfs_ignore_lazy_sync == 0) {
+                       mutex_enter(&lfs_lock);
+                       if (!(ip->i_flags & IN_PAGING)) {
+                               ip->i_flags |= IN_PAGING;
+                               TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip,
+                                                 i_lfs_pchain);
+                       }
+                       wakeup(&lfs_writer_daemon);
+                       mutex_exit(&lfs_lock);
+               }
+               return 0;
+       }
+
+       /*
+        * If a vnode is being cleaned, flush it out before we try to
+        * reuse it.  This prevents the cleaner from writing files twice
+        * in the same partial segment, causing an accounting underflow.
+        */
+       if (ap->a_flags & FSYNC_RECLAIM && ip->i_flags & IN_CLEANING) {
+               lfs_vflush(vp);
+       }
+
+       wait = (ap->a_flags & FSYNC_WAIT);
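+       /*
+        * Flush the pages, retrying while VOP_PUTPAGES() reports
+        * EAGAIN; each retry naps briefly on lfs_avail to give the
+        * cleaner a chance to free space.
+        */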
+       do {
+               mutex_enter(vp->v_interlock);
+               error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
+                                    round_page(ap->a_offhi),
+                                    PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
+               if (error == EAGAIN) {
+                       mutex_enter(&lfs_lock);
+                       mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_fsync",
+                               hz / 100 + 1, &lfs_lock);
+                       mutex_exit(&lfs_lock);
+               }
+       } while (error == EAGAIN);
+       if (error)
+               return error;
+
+       if ((ap->a_flags & FSYNC_DATAONLY) == 0)
+               error = lfs_update(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+
+       if (error == 0 && ap->a_flags & FSYNC_CACHE) {
+               int l = 0;
+               error = VOP_IOCTL(ip->i_devvp, DIOCCACHESYNC, &l, FWRITE,
+                                 curlwp->l_cred);
+       }
+       if (wait && !VPISEMPTY(vp))
+               LFS_SET_UINO(ip, IN_MODIFIED);
+
+       return error;
+}
+
+/*
+ * Take IN_ADIROP off, then call ufs_inactive.
+ */
+int
+lfs_inactive(void *v)
+{
+       struct vop_inactive_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+
+       lfs_unmark_vnode(ap->a_vp);
+
+       /*
+        * The Ifile is only ever inactivated on unmount.
+        * Streamline this process by not giving it more dirty blocks.
+        */
+       if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) {
+               mutex_enter(&lfs_lock);
+               LFS_CLR_UINO(VTOI(ap->a_vp), IN_ALLMOD);
+               mutex_exit(&lfs_lock);
+               VOP_UNLOCK(ap->a_vp);
+               return 0;
+       }
+
+       return ufs_inactive(v);
+}
+
+/*
+ * These macros are used to bracket UFS directory ops, so that we can
+ * identify all the pages touched during directory ops, which must be
+ * ordered and flushed atomically so that they can be recovered.
+ *
+ * Because we have to mark nodes VU_DIROP in order to prevent
+ * the cache from reclaiming them while a dirop is in progress, we must
+ * also manage the number of nodes so marked (otherwise we can run out).
+ * We do this by setting lfs_dirvcount to the number of marked vnodes; it
+ * is decremented during segment write, when VU_DIROP is taken off.
+ */
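+/*
+ * A minimal sketch of the pattern (compare lfs_create() below), where
+ * "foo" stands in for any creating directory op:
+ *
+ *     if ((error = SET_DIROP_CREATE(dvp, vpp)) != 0) {
+ *             vput(dvp);
+ *             return error;
+ *     }
+ *     error = ufs_foo(ap);
+ *     SET_ENDOP_CREATE_AP(ap, "foo");
+ *     return error;
+ */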
+#define        MARK_VNODE(vp)                  lfs_mark_vnode(vp)
+#define        UNMARK_VNODE(vp)                lfs_unmark_vnode(vp)
+#define        SET_DIROP_CREATE(dvp, vpp)      lfs_set_dirop_create((dvp), (vpp))
+#define        SET_DIROP_REMOVE(dvp, vp)       lfs_set_dirop((dvp), (vp))
+static int lfs_set_dirop_create(struct vnode *, struct vnode **);
+static int lfs_set_dirop(struct vnode *, struct vnode *);
+
+static int
+lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
+{
+       struct lfs *fs;
+       int error;
+
+       KASSERT(VOP_ISLOCKED(dvp));
+       KASSERT(vp == NULL || VOP_ISLOCKED(vp));
+
+       fs = VTOI(dvp)->i_lfs;
+
+       ASSERT_NO_SEGLOCK(fs);
+       /*
+        * LFS_NRESERVE calculates direct and indirect blocks as well
+        * as an inode block; an overestimate in most cases.
+        */
+       if ((error = lfs_reserve(fs, dvp, vp, LFS_NRESERVE(fs))) != 0)
+               return (error);
+
+    restart:
+       mutex_enter(&lfs_lock);
+       if (fs->lfs_dirops == 0) {
+               mutex_exit(&lfs_lock);
+               lfs_check(dvp, LFS_UNUSED_LBN, 0);
+               mutex_enter(&lfs_lock);
+       }
+       while (fs->lfs_writer) {
+               error = mtsleep(&fs->lfs_dirops, (PRIBIO + 1) | PCATCH,
+                   "lfs_sdirop", 0, &lfs_lock);
+               if (error == EINTR) {
+                       mutex_exit(&lfs_lock);
+                       goto unreserve;
+               }
+       }
+       if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
+               wakeup(&lfs_writer_daemon);
+               mutex_exit(&lfs_lock);
+               preempt();
+               goto restart;
+       }
+
+       if (lfs_dirvcount > LFS_MAX_DIROP) {
+               mutex_exit(&lfs_lock);
+               DLOG((DLOG_DIROP, "lfs_set_dirop: sleeping with dirops=%d, "
+                     "dirvcount=%d\n", fs->lfs_dirops, lfs_dirvcount));
+               if ((error = mtsleep(&lfs_dirvcount,
+                   PCATCH | PUSER | PNORELOCK, "lfs_maxdirop", 0,
+                   &lfs_lock)) != 0) {
+                       goto unreserve;
+               }
+               goto restart;
+       }
+
+       ++fs->lfs_dirops;
+       fs->lfs_doifile = 1;
+       mutex_exit(&lfs_lock);
+
+       /* Hold a reference so SET_ENDOP will be happy */
+       vref(dvp);
+       if (vp) {
+               vref(vp);
+               MARK_VNODE(vp);
+       }
+
+       MARK_VNODE(dvp);
+       return 0;
+
+  unreserve:
+       lfs_reserve(fs, dvp, vp, -LFS_NRESERVE(fs));
+       return error;
+}
+
+/*
+ * Get a new vnode *before* adjusting the dirop count, to avoid a deadlock
+ * in getnewvnode(), if we have a stacked filesystem mounted on top
+ * of us.
+ *
+ * NB: this means we have to clear the new vnodes on error.  Fortunately
+ * SET_ENDOP is there to do that for us.
+ */
+static int
+lfs_set_dirop_create(struct vnode *dvp, struct vnode **vpp)
+{
+       int error;
+       struct lfs *fs;
+
+       fs = VFSTOUFS(dvp->v_mount)->um_lfs;
+       ASSERT_NO_SEGLOCK(fs);
+       if (fs->lfs_ronly)
+               return EROFS;
+       if (vpp == NULL) {
+               return lfs_set_dirop(dvp, NULL);
+       }
+       error = getnewvnode(VT_LFS, dvp->v_mount, lfs_vnodeop_p, NULL, vpp);
+       if (error) {
+               DLOG((DLOG_ALLOC, "lfs_set_dirop_create: dvp %p error %d\n",
+                     dvp, error));
+               return error;
+       }
+       if ((error = lfs_set_dirop(dvp, NULL)) != 0) {
+               ungetnewvnode(*vpp);
+               *vpp = NULL;
+               return error;
+       }
+       return 0;
+}
+
+#define        SET_ENDOP_BASE(fs, dvp, str)                                    \
+       do {                                                            \
+               mutex_enter(&lfs_lock);                         \
+               --(fs)->lfs_dirops;                                     \
+               if (!(fs)->lfs_dirops) {                                \
+                       if ((fs)->lfs_nadirop) {                        \
+                               panic("SET_ENDOP: %s: no dirops but "   \
+                                       " nadirop=%d", (str),           \
+                                       (fs)->lfs_nadirop);             \
+                       }                                               \
+                       wakeup(&(fs)->lfs_writer);                      \
+                       mutex_exit(&lfs_lock);                          \
+                       lfs_check((dvp), LFS_UNUSED_LBN, 0);            \
+               } else                                                  \
+                       mutex_exit(&lfs_lock);                          \
+       } while(0)
+#define SET_ENDOP_CREATE(fs, dvp, nvpp, str)                           \
+       do {                                                            \
+               UNMARK_VNODE(dvp);                                      \
+               if (nvpp && *nvpp)                                      \
+                       UNMARK_VNODE(*nvpp);                            \
+               /* Check for error return to stem vnode leakage */      \
+               if (nvpp && *nvpp && !((*nvpp)->v_uflag & VU_DIROP))    \
+                       ungetnewvnode(*(nvpp));                         \
+               SET_ENDOP_BASE((fs), (dvp), (str));                     \
+               lfs_reserve((fs), (dvp), NULL, -LFS_NRESERVE(fs));      \
+               vrele(dvp);                                             \
+       } while(0)
+#define SET_ENDOP_CREATE_AP(ap, str)                                   \
+       SET_ENDOP_CREATE(VTOI((ap)->a_dvp)->i_lfs, (ap)->a_dvp,         \
+                        (ap)->a_vpp, (str))
+#define SET_ENDOP_REMOVE(fs, dvp, ovp, str)                            \
+       do {                                                            \
+               UNMARK_VNODE(dvp);                                      \
+               if (ovp)                                                \
+                       UNMARK_VNODE(ovp);                              \
+               SET_ENDOP_BASE((fs), (dvp), (str));                     \
+               lfs_reserve((fs), (dvp), (ovp), -LFS_NRESERVE(fs));     \
+               vrele(dvp);                                             \
+               if (ovp)                                                \
+                       vrele(ovp);                                     \
+       } while(0)
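+
+/*
+ * Each SET_ENDOP_* variant undoes the matching SET_DIROP_*: it unmarks
+ * the vnodes, drops the dirop count (waking the writer when it reaches
+ * zero), releases the block reservation, and drops the vref()s taken
+ * at SET_DIROP time.
+ */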
+
+void
+lfs_mark_vnode(struct vnode *vp)
+{
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+
+       mutex_enter(&lfs_lock);
+       if (!(ip->i_flag & IN_ADIROP)) {
+               if (!(vp->v_uflag & VU_DIROP)) {
+                       mutex_enter(vp->v_interlock);
+                       (void)lfs_vref(vp);
+                       ++lfs_dirvcount;
+                       ++fs->lfs_dirvcount;
+                       TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+                       vp->v_uflag |= VU_DIROP;
+               }
+               ++fs->lfs_nadirop;
+               ip->i_flag |= IN_ADIROP;
+       } else
+               KASSERT(vp->v_uflag & VU_DIROP);
+       mutex_exit(&lfs_lock);
+}
+
+void
+lfs_unmark_vnode(struct vnode *vp)
+{
+       struct inode *ip = VTOI(vp);
+
+       if (ip && (ip->i_flag & IN_ADIROP)) {
+               KASSERT(vp->v_uflag & VU_DIROP);
+               mutex_enter(&lfs_lock);
+               --ip->i_lfs->lfs_nadirop;
+               mutex_exit(&lfs_lock);
+               ip->i_flag &= ~IN_ADIROP;
+       }
+}
+
+int
+lfs_symlink(void *v)
+{
+       struct vop_symlink_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+               char *a_target;
+       } */ *ap = v;
+       int error;
+
+       if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+               vput(ap->a_dvp);
+               return error;
+       }
+       error = ufs_symlink(ap);
+       SET_ENDOP_CREATE_AP(ap, "symlink");
+       return (error);
+}
+
+int
+lfs_mknod(void *v)
+{
+       struct vop_mknod_args   /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       struct vattr *vap = ap->a_vap;
+       struct vnode **vpp = ap->a_vpp;
+       struct inode *ip;
+       int error;
+       struct mount    *mp;
+       ino_t           ino;
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+       if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+               vput(ap->a_dvp);
+               return error;
+       }
+       error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+                             ap->a_dvp, ulr, vpp, ap->a_cnp);
+
+       /* Either way we're done with the dirop at this point */
+       SET_ENDOP_CREATE_AP(ap, "mknod");
+
+       if (error)
+               return (error);
+
+       ip = VTOI(*vpp);
+       mp  = (*vpp)->v_mount;
+       ino = ip->i_number;
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       if (vap->va_rdev != VNOVAL) {
+               /*
+                * Want to be able to use this to make badblock
+                * inodes, so don't truncate the dev number.
+                */
+#if 0
+               ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
+                                          UFS_MPNEEDSWAP((*vpp)->v_mount));
+#else
+               ip->i_ffs1_rdev = vap->va_rdev;
+#endif
+       }
+
+       /*
+        * Call fsync to write the vnode so that we don't have to deal with
+        * flushing it when it's marked VU_DIROP|VI_XLOCK.
+        *
+        * XXX KS - If we can't flush we also can't call vgone(), so must
+        * return.  But, that leaves this vnode in limbo, also not good.
+        * Can this ever happen (barring hardware failure)?
+        */
+       if ((error = VOP_FSYNC(*vpp, NOCRED, FSYNC_WAIT, 0, 0)) != 0) {
+               panic("lfs_mknod: couldn't fsync (ino %llu)",
+                     (unsigned long long)ino);
+               /* return (error); */
+       }
+       /*
+        * Remove vnode so that it will be reloaded by VFS_VGET and
+        * checked to see if it is an alias of an existing entry in
+        * the inode cache.
+        */
+       /* Used to be vput, but that causes us to call VOP_INACTIVE twice. */
+
+       VOP_UNLOCK(*vpp);
+       (*vpp)->v_type = VNON;
+       vgone(*vpp);
+       error = VFS_VGET(mp, ino, vpp);
+
+       if (error != 0) {
+               *vpp = NULL;
+               return (error);
+       }
+       return (0);
+}
+
+int
+lfs_create(void *v)
+{
+       struct vop_create_args  /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       int error;
+
+       if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+               vput(ap->a_dvp);
+               return error;
+       }
+       error = ufs_create(ap);
+       SET_ENDOP_CREATE_AP(ap, "create");
+       return (error);
+}
+
+int
+lfs_mkdir(void *v)
+{
+       struct vop_mkdir_args   /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+               struct vattr *a_vap;
+       } */ *ap = v;
+       int error;
+
+       if ((error = SET_DIROP_CREATE(ap->a_dvp, ap->a_vpp)) != 0) {
+               vput(ap->a_dvp);
+               return error;
+       }
+       error = ufs_mkdir(ap);
+       SET_ENDOP_CREATE_AP(ap, "mkdir");
+       return (error);
+}
+
+int
+lfs_remove(void *v)
+{
+       struct vop_remove_args  /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *dvp, *vp;
+       struct inode *ip;
+       int error;
+
+       dvp = ap->a_dvp;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if ((error = SET_DIROP_REMOVE(dvp, vp)) != 0) {
+               if (dvp == vp)
+                       vrele(vp);
+               else
+                       vput(vp);
+               vput(dvp);
+               return error;
+       }
+       error = ufs_remove(ap);
+       if (ip->i_nlink == 0)
+               lfs_orphan(ip->i_lfs, ip->i_number);
+       SET_ENDOP_REMOVE(ip->i_lfs, dvp, ap->a_vp, "remove");
+       return (error);
+}
+
+int
+lfs_rmdir(void *v)
+{
+       struct vop_rmdir_args   /* {
+               struct vnodeop_desc *a_desc;
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct inode *ip;
+       int error;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if ((error = SET_DIROP_REMOVE(ap->a_dvp, ap->a_vp)) != 0) {
+               if (ap->a_dvp == vp)
+                       vrele(ap->a_dvp);
+               else
+                       vput(ap->a_dvp);
+               vput(vp);
+               return error;
+       }
+       error = ufs_rmdir(ap);
+       if (ip->i_nlink == 0)
+               lfs_orphan(ip->i_lfs, ip->i_number);
+       SET_ENDOP_REMOVE(ip->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
+       return (error);
+}
+
+int
+lfs_link(void *v)
+{
+       struct vop_link_args    /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       int error;
+       struct vnode **vpp = NULL;
+
+       if ((error = SET_DIROP_CREATE(ap->a_dvp, vpp)) != 0) {
+               vput(ap->a_dvp);
+               return error;
+       }
+       error = ufs_link(ap);
+       SET_ENDOP_CREATE(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vpp, "link");
+       return (error);
+}
+
+int
+lfs_rename(void *v)
+{
+       struct vop_rename_args  /* {
+               struct vnode *a_fdvp;
+               struct vnode *a_fvp;
+               struct componentname *a_fcnp;
+               struct vnode *a_tdvp;
+               struct vnode *a_tvp;
+               struct componentname *a_tcnp;
+       } */ *ap = v;
+       struct vnode *tvp, *fvp, *tdvp, *fdvp;
+       struct componentname *tcnp, *fcnp;
+       int error;
+       struct lfs *fs;
+
+       fs = VTOI(ap->a_fdvp)->i_lfs;
+       tvp = ap->a_tvp;
+       tdvp = ap->a_tdvp;
+       tcnp = ap->a_tcnp;
+       fvp = ap->a_fvp;
+       fdvp = ap->a_fdvp;
+       fcnp = ap->a_fcnp;
+
+       /*
+        * Check for cross-device rename.
+        * If it is, we don't want to set dirops, just error out.
+        * (In particular note that MARK_VNODE(tdvp) will do the wrong
+        * thing on a cross-device rename.)
+        *
+        * Copied from ufs_rename.
+        */
+       if ((fvp->v_mount != tdvp->v_mount) ||
+           (tvp && (fvp->v_mount != tvp->v_mount))) {
+               error = EXDEV;
+               goto errout;
+       }
+
+       /*
+        * Check to make sure we're not renaming a vnode onto itself
+        * (deleting a hard link by renaming one name onto another);
+        * if we are we can't recursively call VOP_REMOVE since that
+        * would leave us with an unaccounted-for number of live dirops.
+        *
+        * Inline the relevant section of ufs_rename here, *before*
+        * calling SET_DIROP_REMOVE.
+        */
+       if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+                   (VTOI(tdvp)->i_flags & APPEND))) {
+               error = EPERM;
+               goto errout;
+       }
+       if (fvp == tvp) {
+               if (fvp->v_type == VDIR) {
+                       error = EINVAL;
+                       goto errout;
+               }
+
+               /* Release destination completely. */
+               VOP_ABORTOP(tdvp, tcnp);
+               vput(tdvp);
+               vput(tvp);
+
+               /* Delete source. */
+               vrele(fvp);
+               fcnp->cn_flags &= ~(MODMASK);
+               fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+               fcnp->cn_nameiop = DELETE;
+               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+               if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+                       vput(fdvp);
+                       return (error);
+               }
+               return (VOP_REMOVE(fdvp, fvp, fcnp));
+       }
+
+       if ((error = SET_DIROP_REMOVE(tdvp, tvp)) != 0)
+               goto errout;
+       MARK_VNODE(fdvp);
+       MARK_VNODE(fvp);
+
+       error = ufs_rename(ap);
+       UNMARK_VNODE(fdvp);
+       UNMARK_VNODE(fvp);
+       SET_ENDOP_REMOVE(fs, tdvp, tvp, "rename");
+       return (error);
+
+  errout:
+       VOP_ABORTOP(tdvp, ap->a_tcnp); /* XXX, why not in NFS? */
+       if (tdvp == tvp)
+               vrele(tdvp);
+       else
+               vput(tdvp);
+       if (tvp)
+               vput(tvp);
+       VOP_ABORTOP(fdvp, ap->a_fcnp); /* XXX, why not in NFS? */
+       vrele(fdvp);
+       vrele(fvp);
+       return (error);
+}
+
+/* XXX hack to avoid calling ITIMES in getattr */
+int
+lfs_getattr(void *v)
+{
+       struct vop_getattr_args /* {
+               struct vnode *a_vp;
+               struct vattr *a_vap;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct vattr *vap = ap->a_vap;
+       struct lfs *fs = ip->i_lfs;
+       /*
+        * Copy from inode table
+        */
+       vap->va_fsid = ip->i_dev;
+       vap->va_fileid = ip->i_number;
+       vap->va_mode = ip->i_mode & ~IFMT;
+       vap->va_nlink = ip->i_nlink;
+       vap->va_uid = ip->i_uid;
+       vap->va_gid = ip->i_gid;
+       vap->va_rdev = (dev_t)ip->i_ffs1_rdev;
+       vap->va_size = vp->v_size;
+       vap->va_atime.tv_sec = ip->i_ffs1_atime;
+       vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
+       vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
+       vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
+       vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
+       vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
+       vap->va_flags = ip->i_flags;
+       vap->va_gen = ip->i_gen;
+       /* this doesn't belong here */
+       if (vp->v_type == VBLK)
+               vap->va_blocksize = BLKDEV_IOSIZE;
+       else if (vp->v_type == VCHR)
+               vap->va_blocksize = MAXBSIZE;
+       else
+               vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+       vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
+       vap->va_type = vp->v_type;
+       vap->va_filerev = ip->i_modrev;
+       return (0);
+}
+
+/*
+ * Check to make sure the inode blocks won't choke the buffer
+ * cache, then call ufs_setattr as usual.
+ */
+int
+lfs_setattr(void *v)
+{
+       struct vop_setattr_args /* {
+               struct vnode *a_vp;
+               struct vattr *a_vap;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+
+       lfs_check(vp, LFS_UNUSED_LBN, 0);
+       return ufs_setattr(v);
+}
+
+/*
+ * Release the block we hold on lfs_newseg wrapping.  Called on file close,
+ * or explicitly from LFCNWRAPGO.  Called with the interlock held.
+ */
+static int
+lfs_wrapgo(struct lfs *fs, struct inode *ip, int waitfor)
+{
+       if (fs->lfs_stoplwp != curlwp)
+               return EBUSY;
+
+       fs->lfs_stoplwp = NULL;
+       cv_signal(&fs->lfs_stopcv);
+
+       KASSERT(fs->lfs_nowrap > 0);
+       if (fs->lfs_nowrap <= 0) {
+               return 0;
+       }
+
+       if (--fs->lfs_nowrap == 0) {
+               log(LOG_NOTICE, "%s: re-enabled log wrap\n", fs->lfs_fsmnt);
+               wakeup(&fs->lfs_wrappass);
+               lfs_wakeup_cleaner(fs);
+       }
+       if (waitfor) {
+               mtsleep(&fs->lfs_nextseg, PCATCH | PUSER, "segment",
+                   0, &lfs_lock);
+       }
+
+       return 0;
+}
+
+/*
+ * Close called
+ */
+/* ARGSUSED */
+int
+lfs_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode *a_vp;
+               int  a_fflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+
+       if ((ip->i_number == ROOTINO || ip->i_number == LFS_IFILE_INUM) &&
+           fs->lfs_stoplwp == curlwp) {
+               mutex_enter(&lfs_lock);
+               log(LOG_NOTICE, "lfs_close: releasing log wrap control\n");
+               lfs_wrapgo(fs, ip, 0);
+               mutex_exit(&lfs_lock);
+       }
+
+       if (vp == ip->i_lfs->lfs_ivnode &&
+           vp->v_mount->mnt_iflag & IMNT_UNMOUNT)
+               return 0;
+
+       if (vp->v_usecount > 1 && vp != ip->i_lfs->lfs_ivnode) {
+               LFS_ITIMES(ip, NULL, NULL, NULL);
+       }
+       return (0);
+}
+
+/*
+ * Close wrapper for special devices.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+lfsspec_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode    *a_vp;
+               int             a_fflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if (vp->v_usecount > 1) {
+               LFS_ITIMES(ip, NULL, NULL, NULL);
+       }
+       return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Close wrapper for fifos.
+ *
+ * Update the times on the inode then do device close.
+ */
+int
+lfsfifo_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode    *a_vp;
+               int             a_fflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if (ap->a_vp->v_usecount > 1) {
+               LFS_ITIMES(ip, NULL, NULL, NULL);
+       }
+       return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+
+int
+lfs_reclaim(void *v)
+{
+       struct vop_reclaim_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+       int error;
+
+       /*
+        * The inode must be freed and updated before being removed
+        * from its hash chain.  Other threads trying to gain a hold
+        * on the inode will be stalled because it is locked (VI_XLOCK).
+        */
+       if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
+               lfs_vfree(vp, ip->i_number, ip->i_omode);
+
+       mutex_enter(&lfs_lock);
+       LFS_CLR_UINO(ip, IN_ALLMOD);
+       mutex_exit(&lfs_lock);
+       if ((error = ufs_reclaim(vp)))
+               return (error);
+
+       /*
+        * Take us off the paging and/or dirop queues if we were on them.
+        * We shouldn't be on them.
+        */
+       mutex_enter(&lfs_lock);
+       if (ip->i_flags & IN_PAGING) {
+               log(LOG_WARNING, "%s: reclaimed vnode is IN_PAGING\n",
+                   fs->lfs_fsmnt);
+               ip->i_flags &= ~IN_PAGING;
+               TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+       }
+       if (vp->v_uflag & VU_DIROP) {
+               panic("reclaimed vnode is VU_DIROP");
+               vp->v_uflag &= ~VU_DIROP;
+               TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+       }
+       mutex_exit(&lfs_lock);
+
+       pool_put(&lfs_dinode_pool, ip->i_din.ffs1_din);
+       lfs_deregister_all(vp);
+       pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
+       ip->inode_ext.lfs = NULL;
+       genfs_node_destroy(vp);
+       pool_put(&lfs_inode_pool, vp->v_data);
+       vp->v_data = NULL;
+       return (0);
+}
+
+/*
+ * Read a block from a storage device.
+ * In order to avoid reading blocks that are in the process of being
+ * written by the cleaner---and hence are not mutexed by the normal
+ * buffer cache / page cache mechanisms---check for collisions before
+ * reading.
+ *
+ * We inline ufs_strategy to make sure that the VOP_BMAP occurs *before*
+ * the active cleaner test.
+ *
+ * XXX This code assumes that lfs_markv makes synchronous checkpoints.
+ */
+int
+lfs_strategy(void *v)
+{
+       struct vop_strategy_args /* {
+               struct vnode *a_vp;
+               struct buf *a_bp;
+       } */ *ap = v;
+       struct buf      *bp;
+       struct lfs      *fs;
+       struct vnode    *vp;
+       struct inode    *ip;
+       daddr_t         tbn;
+       int             i, sn, error, slept;
+
+       bp = ap->a_bp;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+
+       /* lfs uses its strategy routine only for read */
+       KASSERT(bp->b_flags & B_READ);
+
+       if (vp->v_type == VBLK || vp->v_type == VCHR)
+               panic("lfs_strategy: spec");
+       KASSERT(bp->b_bcount != 0);
+       if (bp->b_blkno == bp->b_lblkno) {
+               error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+                                NULL);
+               if (error) {
+                       bp->b_error = error;
+                       bp->b_resid = bp->b_bcount;
+                       biodone(bp);
+                       return (error);
+               }
+               if ((long)bp->b_blkno == -1) /* no valid data */
+                       clrbuf(bp);
+       }
+       if ((long)bp->b_blkno < 0) { /* block is not on disk */
+               bp->b_resid = bp->b_bcount;
+               biodone(bp);
+               return (0);
+       }
+
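+       /*
+        * If the block falls in a segment interval the cleaner is
+        * currently rewriting, sleep until the seglock is dropped and
+        * check again; "slept" forces one more pass after every sleep.
+        */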
+       slept = 1;
+       mutex_enter(&lfs_lock);
+       while (slept && fs->lfs_seglock) {
+               mutex_exit(&lfs_lock);
+               /*
+                * Look through list of intervals.
+                * There will only be intervals to look through
+                * if the cleaner holds the seglock.
+                * Since the cleaner is synchronous, we can trust
+                * the list of intervals to be current.
+                */
+               tbn = dbtofsb(fs, bp->b_blkno);
+               sn = dtosn(fs, tbn);
+               slept = 0;
+               for (i = 0; i < fs->lfs_cleanind; i++) {
+                       if (sn == dtosn(fs, fs->lfs_cleanint[i]) &&
+                           tbn >= fs->lfs_cleanint[i]) {
+                               DLOG((DLOG_CLEAN,
+                                     "lfs_strategy: ino %d lbn %" PRId64
+                                     " ind %d sn %d fsb %" PRIx32
+                                     " given sn %d fsb %" PRIx64 "\n",
+                                     ip->i_number, bp->b_lblkno, i,
+                                     dtosn(fs, fs->lfs_cleanint[i]),
+                                     fs->lfs_cleanint[i], sn, tbn));
+                               DLOG((DLOG_CLEAN,
+                                     "lfs_strategy: sleeping on ino %d lbn %"
+                                     PRId64 "\n", ip->i_number, bp->b_lblkno));
+                               mutex_enter(&lfs_lock);
+                               if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
+                                       /* Cleaner can't wait for itself */
+                                       mtsleep(&fs->lfs_iocount,
+                                               (PRIBIO + 1) | PNORELOCK,
+                                               "clean2", 0,
+                                               &lfs_lock);
+                                       slept = 1;
+                                       break;
+                               } else if (fs->lfs_seglock) {
+                                       mtsleep(&fs->lfs_seglock,
+                                               (PRIBIO + 1) | PNORELOCK,
+                                               "clean1", 0,
+                                               &lfs_lock);
+                                       slept = 1;
+                                       break;
+                               }
+                               mutex_exit(&lfs_lock);
+                       }
+               }
+               mutex_enter(&lfs_lock);
+       }
+       mutex_exit(&lfs_lock);
+
+       vp = ip->i_devvp;
+       VOP_STRATEGY(vp, bp);
+       return (0);
+}
+
+void
+lfs_flush_dirops(struct lfs *fs)
+{
+       struct inode *ip, *nip;
+       struct vnode *vp;
+       extern int lfs_dostats;
+       struct segment *sp;
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+       KASSERT(fs->lfs_nadirop == 0);
+
+       if (fs->lfs_ronly)
+               return;
+
+       mutex_enter(&lfs_lock);
+       if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
+               mutex_exit(&lfs_lock);
+               return;
+       } else
+               mutex_exit(&lfs_lock);
+
+       if (lfs_dostats)
+               ++lfs_stats.flush_invoked;
+
+       /*
+        * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
+        * Technically this is a checkpoint (the on-disk state is valid)
+        * even though we are leaving out all the file data.
+        */
+       lfs_imtime(fs);
+       lfs_seglock(fs, SEGM_CKP);
+       sp = fs->lfs_sp;
+
+       /*
+        * lfs_writevnodes, optimized to get dirops out of the way.
+        * Only write dirops, and don't flush files' pages, only
+        * blocks from the directories.
+        *
+        * We don't need to vref these files because they are
+        * dirops and so hold an extra reference until the
+        * segunlock clears them of that status.
+        *
+        * We don't need to check for IN_ADIROP because we know that
+        * no dirops are active.
+        *
+        */
+       mutex_enter(&lfs_lock);
+       for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
+               nip = TAILQ_NEXT(ip, i_lfs_dchain);
+               mutex_exit(&lfs_lock);
+               vp = ITOV(ip);
+
+               KASSERT((ip->i_flag & IN_ADIROP) == 0);
+
+               /*
+                * All writes to directories come from dirops; all
+                * writes to files' direct blocks go through the page
+                * cache, which we're not touching.  Reads to files
+                * and/or directories will not be affected by writing
+                * directory blocks, directory inodes and file inodes.  So we don't
+                * really need to lock.  If we don't lock, though,
+                * make sure that we don't clear IN_MODIFIED
+                * unnecessarily.
+                */
+               if (vp->v_iflag & VI_XLOCK) {
+                       mutex_enter(&lfs_lock);
+                       continue;
+               }
+               /* XXX see below
+                * waslocked = VOP_ISLOCKED(vp);
+                */
+               if (vp->v_type != VREG &&
+                   ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
+                       lfs_writefile(fs, sp, vp);
+                       if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
+                           !(ip->i_flag & IN_ALLMOD)) {
+                               mutex_enter(&lfs_lock);
+                               LFS_SET_UINO(ip, IN_MODIFIED);
+                               mutex_exit(&lfs_lock);
+                       }
+               }
+               KDASSERT(ip->i_number != LFS_IFILE_INUM);
+               (void) lfs_writeinode(fs, sp, ip);
+               mutex_enter(&lfs_lock);
+               /*
+                * XXX
+                * LK_EXCLOTHER is dead -- what is intended here?
+                * if (waslocked == LK_EXCLOTHER)
+                *      LFS_SET_UINO(ip, IN_MODIFIED);
+                */
+       }
+       mutex_exit(&lfs_lock);
+       /* We've written all the dirops there are */
+       ((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
+       lfs_finalize_fs_seguse(fs);
+       (void) lfs_writeseg(fs, sp);
+       lfs_segunlock(fs);
+}
+
+/*
+ * Flush all vnodes for which the pagedaemon has requested pageouts.
+ * Skip over any files that are marked VU_DIROP (since lfs_flush_dirops()
+ * has just run, this would be an error).  If we have to skip a vnode
+ * for any reason, just skip it; if we have to wait for the cleaner,
+ * abort.  The writer daemon will call us again later.
+ */
+void
+lfs_flush_pchain(struct lfs *fs)
+{
+       struct inode *ip, *nip;
+       struct vnode *vp;
+       extern int lfs_dostats;
+       struct segment *sp;
+       int error;
+
+       ASSERT_NO_SEGLOCK(fs);
+
+       if (fs->lfs_ronly)
+               return;
+
+       mutex_enter(&lfs_lock);
+       if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
+               mutex_exit(&lfs_lock);
+               return;
+       } else
+               mutex_exit(&lfs_lock);
+
+       /* Get dirops out of the way */
+       lfs_flush_dirops(fs);
+
+       if (lfs_dostats)
+               ++lfs_stats.flush_invoked;
+
+       /*
+        * Inline lfs_segwrite/lfs_writevnodes, but just for pageouts.
+        */
+       lfs_imtime(fs);
+       lfs_seglock(fs, 0);
+       sp = fs->lfs_sp;
+
+       /*
+        * lfs_writevnodes, optimized to clear pageout requests.
+        * Only write non-dirop files that are in the pageout queue.
+        * We're very conservative about what we write; we want to be
+        * fast and async.
+        */
+       mutex_enter(&lfs_lock);
+    top:
+       for (ip = TAILQ_FIRST(&fs->lfs_pchainhd); ip != NULL; ip = nip) {
+               nip = TAILQ_NEXT(ip, i_lfs_pchain);
+               vp = ITOV(ip);
+
+               if (!(ip->i_flags & IN_PAGING))
+                       goto top;
+
+               mutex_enter(vp->v_interlock);
+               if ((vp->v_iflag & VI_XLOCK) || (vp->v_uflag & VU_DIROP) != 0) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               if (vp->v_type != VREG) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               if (lfs_vref(vp))
+                       continue;
+               mutex_exit(&lfs_lock);
+
+               if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_RETRY) != 0) {
+                       lfs_vunref(vp);
+                       mutex_enter(&lfs_lock);
+                       continue;
+               }
+
+               error = lfs_writefile(fs, sp, vp);
+               if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
+                   !(ip->i_flag & IN_ALLMOD)) {
+                       mutex_enter(&lfs_lock);
+                       LFS_SET_UINO(ip, IN_MODIFIED);
+                       mutex_exit(&lfs_lock);
+               }
+               KDASSERT(ip->i_number != LFS_IFILE_INUM);
+               (void) lfs_writeinode(fs, sp, ip);
+
+               VOP_UNLOCK(vp);
+               lfs_vunref(vp);
+
+               if (error == EAGAIN) {
+                       lfs_writeseg(fs, sp);
+                       mutex_enter(&lfs_lock);
+                       break;
+               }
+               mutex_enter(&lfs_lock);
+       }
+       mutex_exit(&lfs_lock);
+       (void) lfs_writeseg(fs, sp);
+       lfs_segunlock(fs);
+}
+
+/*
+ * Provide a fcntl interface to sys_lfs_{segwait,bmapv,markv}.
+ */
+int
+lfs_fcntl(void *v)
+{
+       struct vop_fcntl_args /* {
+               struct vnode *a_vp;
+               u_int a_command;
+               void * a_data;
+               int  a_fflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct timeval tv;
+       struct timeval *tvp;
+       BLOCK_INFO *blkiov;
+       CLEANERINFO *cip;
+       SEGUSE *sup;
+       int blkcnt, error, oclean;
+       size_t fh_size;
+       struct lfs_fcntl_markv blkvp;
+       struct lwp *l;
+       fsid_t *fsidp;
+       struct lfs *fs;
+       struct buf *bp;
+       fhandle_t *fhp;
+       daddr_t off;
+
+       /* Only respect LFS fcntls on fs root or Ifile */
+       if (VTOI(ap->a_vp)->i_number != ROOTINO &&
+           VTOI(ap->a_vp)->i_number != LFS_IFILE_INUM) {
+               return ufs_fcntl(v);
+       }
+
+       /* Avoid locking a draining lock */
+       if (ap->a_vp->v_mount->mnt_iflag & IMNT_UNMOUNT) {
+               return ESHUTDOWN;
+       }
+
+       /* LFS control and monitoring fcntls are available only to root */
+       l = curlwp;
+       if (((ap->a_command & 0xff00) >> 8) == 'L' &&
+           (error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+                                            NULL)) != 0)
+               return (error);
+
+       fs = VTOI(ap->a_vp)->i_lfs;
+       fsidp = &ap->a_vp->v_mount->mnt_stat.f_fsidx;
+
+       error = 0;
+       switch ((int)ap->a_command) {
+           case LFCNSEGWAITALL_COMPAT_50:
+           case LFCNSEGWAITALL_COMPAT:
+               fsidp = NULL;
+               /* FALLTHROUGH */
+           case LFCNSEGWAIT_COMPAT_50:
+           case LFCNSEGWAIT_COMPAT:
+               {
+                       struct timeval50 *tvp50
+                               = (struct timeval50 *)ap->a_data;
+                       timeval50_to_timeval(tvp50, &tv);
+                       tvp = &tv;
+               }
+               goto segwait_common;
+           case LFCNSEGWAITALL:
+               fsidp = NULL;
+               /* FALLTHROUGH */
+           case LFCNSEGWAIT:
+               tvp = (struct timeval *)ap->a_data;
+segwait_common:
+               mutex_enter(&lfs_lock);
+               ++fs->lfs_sleepers;
+               mutex_exit(&lfs_lock);
+
+               error = lfs_segwait(fsidp, tvp);
+
+               mutex_enter(&lfs_lock);
+               if (--fs->lfs_sleepers == 0)
+                       wakeup(&fs->lfs_sleepers);
+               mutex_exit(&lfs_lock);
+               return error;
+
+           case LFCNBMAPV:
+           case LFCNMARKV:
+               blkvp = *(struct lfs_fcntl_markv *)ap->a_data;
+
+               blkcnt = blkvp.blkcnt;
+               if ((u_int) blkcnt > LFS_MARKV_MAXBLKCNT)
+                       return (EINVAL);
+               blkiov = lfs_malloc(fs, blkcnt * sizeof(BLOCK_INFO), LFS_NB_BLKIOV);
+               if ((error = copyin(blkvp.blkiov, blkiov,
+                    blkcnt * sizeof(BLOCK_INFO))) != 0) {
+                       lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+                       return error;
+               }
+
+               mutex_enter(&lfs_lock);
+               ++fs->lfs_sleepers;
+               mutex_exit(&lfs_lock);
+               if (ap->a_command == LFCNBMAPV)
+                       error = lfs_bmapv(l->l_proc, fsidp, blkiov, blkcnt);
+               else /* LFCNMARKV */
+                       error = lfs_markv(l->l_proc, fsidp, blkiov, blkcnt);
+               if (error == 0)
+                       error = copyout(blkiov, blkvp.blkiov,
+                                       blkcnt * sizeof(BLOCK_INFO));
+               mutex_enter(&lfs_lock);
+               if (--fs->lfs_sleepers == 0)
+                       wakeup(&fs->lfs_sleepers);
+               mutex_exit(&lfs_lock);
+               lfs_free(fs, blkiov, LFS_NB_BLKIOV);
+               return error;
+
+           case LFCNRECLAIM:
+               /*
+                * Flush dirops and write Ifile, allowing empty segments
+                * to be immediately reclaimed.
+                */
+               lfs_writer_enter(fs, "pndirop");
+               off = fs->lfs_offset;
+               lfs_seglock(fs, SEGM_FORCE_CKP | SEGM_CKP);
+               lfs_flush_dirops(fs);
+               LFS_CLEANERINFO(cip, fs, bp);
+               oclean = cip->clean;
+               LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
+               lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
+               fs->lfs_sp->seg_flags |= SEGM_PROT;
+               lfs_segunlock(fs);
+               lfs_writer_leave(fs);
+
+#ifdef DEBUG
+               LFS_CLEANERINFO(cip, fs, bp);
+               DLOG((DLOG_CLEAN, "lfs_fcntl: reclaim wrote %" PRId64
+                     " blocks, cleaned %" PRId32 " segments (activesb %d)\n",
+                     fs->lfs_offset - off, cip->clean - oclean,
+                     fs->lfs_activesb));
+               LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
+#endif
+
+               return 0;
+
+           case LFCNIFILEFH_COMPAT:
+               /* Return the filehandle of the Ifile */
+               if ((error = kauth_authorize_system(l->l_cred,
+                   KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)) != 0)
+                       return (error);
+               fhp = (struct fhandle *)ap->a_data;
+               fhp->fh_fsid = *fsidp;
+               fh_size = 16;   /* former VFS_MAXFIDSIZ */
+               return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
+
+           case LFCNIFILEFH_COMPAT2:
+           case LFCNIFILEFH:
+               /* Return the filehandle of the Ifile */
+               fhp = (struct fhandle *)ap->a_data;
+               fhp->fh_fsid = *fsidp;
+               fh_size = sizeof(struct lfs_fhandle) -
+                   offsetof(fhandle_t, fh_fid);
+               return lfs_vptofh(fs->lfs_ivnode, &(fhp->fh_fid), &fh_size);
+
+           case LFCNREWIND:
+               /* Move lfs_offset to the lowest-numbered segment */
+               return lfs_rewind(fs, *(int *)ap->a_data);
+
+           case LFCNINVAL:
+               /* Mark a segment SEGUSE_INVAL */
+               LFS_SEGENTRY(sup, fs, *(int *)ap->a_data, bp);
+               if (sup->su_nbytes > 0) {
+                       brelse(bp, 0);
+                       lfs_unset_inval_all(fs);
+                       return EBUSY;
+               }
+               sup->su_flags |= SEGUSE_INVAL;
+               VOP_BWRITE(bp->b_vp, bp);
+               return 0;
+
+           case LFCNRESIZE:
+               /* Resize the filesystem */
+               return lfs_resize_fs(fs, *(int *)ap->a_data);
+
+           case LFCNWRAPSTOP:
+           case LFCNWRAPSTOP_COMPAT:
+               /*
+                * Hold lfs_newseg at segment 0; if requested, sleep until
+                * the filesystem wraps around.  This supports external
+                * agents (dump, fsck-based regression tests) that need
+                * to look at a snapshot of the filesystem without
+                * requiring that all fs activity stop.
+                */
+               if (fs->lfs_stoplwp == curlwp)
+                       return EALREADY;
+
+               mutex_enter(&lfs_lock);
+               while (fs->lfs_stoplwp != NULL)
+                       cv_wait(&fs->lfs_stopcv, &lfs_lock);
+               fs->lfs_stoplwp = curlwp;
+               if (fs->lfs_nowrap == 0)
+                       log(LOG_NOTICE, "%s: disabled log wrap\n", fs->lfs_fsmnt);
+               ++fs->lfs_nowrap;
+               if (*(int *)ap->a_data == 1
+                   || ap->a_command == LFCNWRAPSTOP_COMPAT) {
+                       log(LOG_NOTICE, "LFCNWRAPSTOP waiting for log wrap\n");
+                       error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
+                               "segwrap", 0, &lfs_lock);
+                       log(LOG_NOTICE, "LFCNWRAPSTOP done waiting\n");
+                       if (error) {
+                               lfs_wrapgo(fs, VTOI(ap->a_vp), 0);
+                       }
+               }
+               mutex_exit(&lfs_lock);
+               return 0;
+
+           case LFCNWRAPGO:
+           case LFCNWRAPGO_COMPAT:
+               /*
+                * Having done its work, the agent wakes up the writer.
+                * If the argument is 1, it sleeps until a new segment
+                * is selected.
+                */
+               mutex_enter(&lfs_lock);
+               error = lfs_wrapgo(fs, VTOI(ap->a_vp),
+                                  ap->a_command == LFCNWRAPGO_COMPAT ? 1 :
+                                   *((int *)ap->a_data));
+               mutex_exit(&lfs_lock);
+               return error;
+
+           case LFCNWRAPPASS:
+               if ((VTOI(ap->a_vp)->i_lfs_iflags & LFSI_WRAPWAIT))
+                       return EALREADY;
+               mutex_enter(&lfs_lock);
+               if (fs->lfs_stoplwp != curlwp) {
+                       mutex_exit(&lfs_lock);
+                       return EALREADY;
+               }
+               if (fs->lfs_nowrap == 0) {
+                       mutex_exit(&lfs_lock);
+                       return EBUSY;
+               }
+               fs->lfs_wrappass = 1;
+               wakeup(&fs->lfs_wrappass);
+               /* Wait for the log to wrap, if asked */
+               if (*(int *)ap->a_data) {
+                       mutex_enter(ap->a_vp->v_interlock);
+                       lfs_vref(ap->a_vp);
+                       VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
+                       log(LOG_NOTICE, "LFCNWRAPPASS waiting for log wrap\n");
+                       error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
+                               "segwrap", 0, &lfs_lock);
+                       log(LOG_NOTICE, "LFCNWRAPPASS done waiting\n");
+                       VTOI(ap->a_vp)->i_lfs_iflags &= ~LFSI_WRAPWAIT;
+                       lfs_vunref(ap->a_vp);
+               }
+               mutex_exit(&lfs_lock);
+               return error;
+
+           case LFCNWRAPSTATUS:
+               mutex_enter(&lfs_lock);
+               *(int *)ap->a_data = fs->lfs_wrapstatus;
+               mutex_exit(&lfs_lock);
+               return 0;
+
+           default:
+               return ufs_fcntl(v);
+       }
+       return 0;
+}
+
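+/*
+ * A hypothetical userland sketch of the wrap-stop protocol handled
+ * above (paths and error handling are illustrative; LFCNWRAPSTOP and
+ * LFCNWRAPGO are the commands dispatched by lfs_fcntl): an agent opens
+ * the filesystem root, holds the log, inspects the snapshot, and then
+ * lets the writer proceed.
+ *
+ *     int fd = open("/mnt/lfs", O_RDONLY);
+ *     int wait = 0;   (0: return as soon as the log is held)
+ *     if (fcntl(fd, LFCNWRAPSTOP, &wait) == -1)
+ *             err(1, "LFCNWRAPSTOP");
+ *     (... examine the now-quiescent log ...)
+ *     if (fcntl(fd, LFCNWRAPGO, &wait) == -1)
+ *             err(1, "LFCNWRAPGO");
+ *     close(fd);
+ */
+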
+int
+lfs_getpages(void *v)
+{
+       struct vop_getpages_args /* {
+               struct vnode *a_vp;
+               voff_t a_offset;
+               struct vm_page **a_m;
+               int *a_count;
+               int a_centeridx;
+               vm_prot_t a_access_type;
+               int a_advice;
+               int a_flags;
+       } */ *ap = v;
+
+       if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM &&
+           (ap->a_access_type & VM_PROT_WRITE) != 0) {
+               return EPERM;
+       }
+       if ((ap->a_access_type & VM_PROT_WRITE) != 0) {
+               mutex_enter(&lfs_lock);
+               LFS_SET_UINO(VTOI(ap->a_vp), IN_MODIFIED);
+               mutex_exit(&lfs_lock);
+       }
+
+       /*
+        * We rely on the fact that genfs_getpages() always reads in
+        * entire filesystem blocks.
+        */
+       return genfs_getpages(v);
+}
+
+/*
+ * Wait for a page to become unbusy, possibly printing diagnostic messages
+ * as well.
+ *
+ * Called with vp->v_interlock held; return with it held.
+ */
+static void
+wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
+{
+       if ((pg->flags & PG_BUSY) == 0)
+               return;         /* Nothing to wait for! */
+
+#if defined(DEBUG) && defined(UVM_PAGE_TRKOWN)
+       static struct vm_page *lastpg;
+
+       if (label != NULL && pg != lastpg) {
+               if (pg->owner_tag) {
+                       printf("lfs_putpages[%d.%d]: %s: page %p owner %d.%d [%s]\n",
+                              curproc->p_pid, curlwp->l_lid, label,
+                              pg, pg->owner, pg->lowner, pg->owner_tag);
+               } else {
+                       printf("lfs_putpages[%d.%d]: %s: page %p unowned?!\n",
+                              curproc->p_pid, curlwp->l_lid, label, pg);
+               }
+       }
+       lastpg = pg;
+#endif
+
+       pg->flags |= PG_WANTED;
+       UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0, "lfsput", 0);
+       mutex_enter(vp->v_interlock);
+}
+
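+/*
+ * The waker's side of this handshake appears in check_dirty() below:
+ * after inspecting a block's pages it does, under vp->v_interlock,
+ *
+ *     if (pg->flags & PG_WANTED)
+ *             wakeup(pg);
+ *     pg->flags &= ~(PG_WANTED|PG_BUSY);
+ *
+ * so a sleeper here wakes once the page is unbusied.
+ */
+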
+/*
+ * This routine is called by lfs_putpages() when it can't complete the
+ * write because a page is busy.  This means that either (1) someone,
+ * possibly the pagedaemon, is looking at this page, and will give it up
+ * presently; or (2) we ourselves are holding the page busy in the
+ * process of being written (either gathered or actually on its way to
+ * disk).  We don't need to give up the segment lock, but we might need
+ * to call lfs_writeseg() to expedite the page's journey to disk.
+ *
+ * Called with vp->v_interlock held; return with it held.
+ */
+/* #define BUSYWAIT */
+static void
+write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
+              int seglocked, const char *label)
+{
+#ifndef BUSYWAIT
+       struct inode *ip = VTOI(vp);
+       struct segment *sp = fs->lfs_sp;
+       int count = 0;
+
+       if (pg == NULL)
+               return;
+
+       while (pg->flags & PG_BUSY &&
+           pg->uobject == &vp->v_uobj) {
+               mutex_exit(vp->v_interlock);
+               if (sp->cbpp - sp->bpp > 1) {
+                       /* Write gathered pages */
+                       lfs_updatemeta(sp);
+                       lfs_release_finfo(fs);
+                       (void) lfs_writeseg(fs, sp);
+
+                       /*
+                        * Reinitialize FIP
+                        */
+                       KASSERT(sp->vp == vp);
+                       lfs_acquire_finfo(fs, ip->i_number,
+                                         ip->i_gen);
+               }
+               ++count;
+               mutex_enter(vp->v_interlock);
+               wait_for_page(vp, pg, label);
+       }
+       if (label != NULL && count > 1)
+               printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
+                      label, (count > 0 ? "looping, " : ""), count);
+#else
+       preempt(1);
+#endif
+}
+
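+/*
+ * A note on the test in write_and_wait() above: "sp->cbpp - sp->bpp > 1"
+ * asks whether anything besides the segment summary has been gathered
+ * so far (bpp[0] holds the summary block); only then is it worth
+ * pushing a partial segment to disk with lfs_writeseg() before
+ * sleeping on the busy page again.
+ */
+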
+/*
+ * Make sure that for all pages in every block in the given range,
+ * either all are dirty or all are clean.  If any of the pages
+ * we've seen so far are dirty, put the vnode on the paging chain,
+ * and mark it IN_PAGING.
+ *
+ * If checkfirst != 0, don't check all the pages but return at the
+ * first dirty page.
+ */
+static int
+check_dirty(struct lfs *fs, struct vnode *vp,
+           off_t startoffset, off_t endoffset, off_t blkeof,
+           int flags, int checkfirst, struct vm_page **pgp)
+{
+       int by_list;
+       struct vm_page *curpg = NULL; /* XXX: gcc */
+       struct vm_page *pgs[MAXBSIZE / PAGE_SIZE], *pg;
+       off_t soff = 0; /* XXX: gcc */
+       voff_t off;
+       int i;
+       int nonexistent;
+       int any_dirty;  /* number of dirty pages */
+       int dirty;      /* number of dirty pages in a block */
+       int tdirty;
+       int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
+       int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
+
+       ASSERT_MAYBE_SEGLOCK(fs);
+  top:
+       by_list = (vp->v_uobj.uo_npages <=
+                  ((endoffset - startoffset) >> PAGE_SHIFT) *
+                  UVM_PAGE_TREE_PENALTY);
+       any_dirty = 0;
+
+       if (by_list) {
+               curpg = TAILQ_FIRST(&vp->v_uobj.memq);
+       } else {
+               soff = startoffset;
+       }
+       while (by_list || soff < MIN(blkeof, endoffset)) {
+               if (by_list) {
+                       /*
+                        * Find the first page in a block.  Skip
+                        * blocks outside our area of interest or beyond
+                        * the end of file.
+                        */
+                       KASSERT(curpg == NULL
+                           || (curpg->flags & PG_MARKER) == 0);
+                       if (pages_per_block > 1) {
+                               while (curpg &&
+                                   ((curpg->offset & fs->lfs_bmask) ||
+                                   curpg->offset >= vp->v_size ||
+                                   curpg->offset >= endoffset)) {
+                                       curpg = TAILQ_NEXT(curpg, listq.queue);
+                                       KASSERT(curpg == NULL ||
+                                           (curpg->flags & PG_MARKER) == 0);
+                               }
+                       }
+                       if (curpg == NULL)
+                               break;
+                       soff = curpg->offset;
+               }
+
+               /*
+                * Mark all pages in extended range busy; find out if any
+                * of them are dirty.
+                */
+               nonexistent = dirty = 0;
+               for (i = 0; i == 0 || i < pages_per_block; i++) {
+                       if (by_list && pages_per_block <= 1) {
+                               pgs[i] = pg = curpg;
+                       } else {
+                               off = soff + (i << PAGE_SHIFT);
+                               pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
+                               if (pg == NULL) {
+                                       ++nonexistent;
+                                       continue;
+                               }
+                       }
+                       KASSERT(pg != NULL);
+
+                       /*
+                        * If we're holding the segment lock, we can deadlock
+                        * against a process that has our page and is waiting
+                        * for the cleaner, while the cleaner waits for the
+                        * segment lock.  Just bail in that case.
+                        */
+                       if ((pg->flags & PG_BUSY) &&
+                           (pagedaemon || LFS_SEGLOCK_HELD(fs))) {
+                               if (i > 0)
+                                       uvm_page_unbusy(pgs, i);
+                               DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
+                               if (pgp)
+                                       *pgp = pg;
+                               return -1;
+                       }
+
+                       while (pg->flags & PG_BUSY) {
+                               wait_for_page(vp, pg, NULL);
+                               if (i > 0)
+                                       uvm_page_unbusy(pgs, i);
+                               goto top;
+                       }
+                       pg->flags |= PG_BUSY;
+                       UVM_PAGE_OWN(pg, "lfs_putpages");
+
+                       pmap_page_protect(pg, VM_PROT_NONE);
+                       tdirty = (pmap_clear_modify(pg) ||
+                                 (pg->flags & PG_CLEAN) == 0);
+                       dirty += tdirty;
+               }
+               if (pages_per_block > 0 && nonexistent >= pages_per_block) {
+                       if (by_list) {
+                               curpg = TAILQ_NEXT(curpg, listq.queue);
+                       } else {
+                               soff += fs->lfs_bsize;
+                       }
+                       continue;
+               }
+
+               any_dirty += dirty;
+               KASSERT(nonexistent == 0);
+
+               /*
+                * If any are dirty make all dirty; unbusy them,
+                * but if we were asked to clean, wire them so that
+                * the pagedaemon doesn't bother us about them while
+                * they're on their way to disk.
+                */
+               for (i = 0; i == 0 || i < pages_per_block; i++) {
+                       pg = pgs[i];
+                       KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
+                       if (dirty) {
+                               pg->flags &= ~PG_CLEAN;
+                               if (flags & PGO_FREE) {
+                                       /*
+                                        * Wire the page so that
+                                        * pdaemon doesn't see it again.
+                                        */
+                                       mutex_enter(&uvm_pageqlock);
+                                       uvm_pagewire(pg);
+                                       mutex_exit(&uvm_pageqlock);
+
+                                       /* Suspended write flag */
+                                       pg->flags |= PG_DELWRI;
+                               }
+                       }
+                       if (pg->flags & PG_WANTED)
+                               wakeup(pg);
+                       pg->flags &= ~(PG_WANTED|PG_BUSY);
+                       UVM_PAGE_OWN(pg, NULL);
+               }
+
+               if (checkfirst && any_dirty)
+                       break;
+
+               if (by_list) {
+                       curpg = TAILQ_NEXT(curpg, listq.queue);
+               } else {
+                       soff += MAX(PAGE_SIZE, fs->lfs_bsize);
+               }
+       }
+
+       return any_dirty;
+}
+
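+/*
+ * Worked example of the block/page arithmetic in check_dirty(), with
+ * assumed geometry: if lfs_bsize is 8192 and PAGE_SIZE is 4096, then
+ * pages_per_block is 2 and each pass busies the two pages backing one
+ * 8K block; if either page was modified, both are marked dirty, since
+ * LFS writes whole filesystem blocks.  The "i == 0 ||" clause keeps
+ * the inner loops running once even if the block size were smaller
+ * than a page, when pages_per_block computes to 0.
+ */
+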
+/*
+ * lfs_putpages functions like genfs_putpages except that
+ *
+ * (1) It needs to bounds-check the incoming requests to ensure that
+ *     they are block-aligned; if they are not, expand the range and
+ *     do the right thing in case, e.g., the requested range is clean
+ *     but the expanded range is dirty.
+ *
+ * (2) It needs to explicitly send blocks to be written when it is done.
+ *     If VOP_PUTPAGES is called without the seglock held, we simply take
+ *     the seglock and let lfs_segunlock wait for us.
+ *     XXX There might be a bad situation if we have to flush a vnode while
+ *     XXX lfs_markv is in operation.  As of this writing we panic in this
+ *     XXX case.
+ *
+ * Assumptions:
+ *
+ * (1) The caller does not hold any pages in this vnode busy.  If it does,
+ *     there is a danger that when we expand the page range and busy the
+ *     pages we will deadlock.
+ *
+ * (2) We are called with vp->v_interlock held; we must return with it
+ *     released.
+ *
+ * (3) We don't absolutely have to free pages right away, provided that
+ *     the request does not have PGO_SYNCIO.  When the pagedaemon gives
+ *     us a request with PGO_FREE, we take the pages out of the paging
+ *     queue and wake up the writer, which will handle freeing them for us.
+ *
+ *     We ensure that for any filesystem block, all pages for that
+ *     block are either resident or not, even if those pages are higher
+ *     than EOF; that means that we will be getting requests to free
+ *     "unused" pages above EOF all the time, and should ignore them.
+ *
+ * (4) If we are called with PGO_LOCKED, the finfo array we are to write
+ *     into has been set up for us by lfs_writefile.  If not, we will
+ *     have to handle allocating and/or freeing an finfo entry.
+ *
+ * XXX note that we're (ab)using PGO_LOCKED as "seglock held".
+ */
+
+/* How many times to loop before we should start to worry */
+#define TOOMANY 4
+
+int
+lfs_putpages(void *v)
+{
+       int error;
+       struct vop_putpages_args /* {
+               struct vnode *a_vp;
+               voff_t a_offlo;
+               voff_t a_offhi;
+               int a_flags;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct inode *ip;
+       struct lfs *fs;
+       struct segment *sp;
+       off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
+       off_t off, max_endoffset;
+       bool seglocked, sync, pagedaemon;
+       struct vm_page *pg, *busypg;
+       UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
+#ifdef DEBUG
+       int debug_n_again, debug_n_dirtyclean;
+#endif
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       fs = ip->i_lfs;
+       sync = (ap->a_flags & PGO_SYNCIO) != 0;
+       pagedaemon = (curlwp == uvm.pagedaemon_lwp);
+
+       /* Putpages does nothing for metadata. */
+       if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
+               mutex_exit(vp->v_interlock);
+               return 0;
+       }
+
+       /*
+        * If there are no pages, don't do anything.
+        */
+       if (vp->v_uobj.uo_npages == 0) {
+               if (TAILQ_EMPTY(&vp->v_uobj.memq) &&
+                   (vp->v_iflag & VI_ONWORKLST) &&
+                   LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
+                       vp->v_iflag &= ~VI_WRMAPDIRTY;
+                       vn_syncer_remove_from_worklist(vp);
+               }
+               mutex_exit(vp->v_interlock);
+               
+               /* Remove us from paging queue, if we were on it */
+               mutex_enter(&lfs_lock);
+               if (ip->i_flags & IN_PAGING) {
+                       ip->i_flags &= ~IN_PAGING;
+                       TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+               }
+               mutex_exit(&lfs_lock);
+               return 0;
+       }
+
+       blkeof = blkroundup(fs, ip->i_size);
+
+       /*
+        * Ignore requests to free pages past EOF but in the same block
+        * as EOF, unless the request is synchronous.  (If the request is
+        * sync, it comes from lfs_truncate.)
+        * XXXUBC Make these pages look "active" so the pagedaemon won't
+        * XXXUBC bother us with them again.
+        */
+       if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
+               origoffset = ap->a_offlo;
+               for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
+                       pg = uvm_pagelookup(&vp->v_uobj, off);
+                       KASSERT(pg != NULL);
+                       while (pg->flags & PG_BUSY) {
+                               pg->flags |= PG_WANTED;
+                               UVM_UNLOCK_AND_WAIT(pg, vp->v_interlock, 0,
+                                                   "lfsput2", 0);
+                               mutex_enter(vp->v_interlock);
+                       }
+                       mutex_enter(&uvm_pageqlock);
+                       uvm_pageactivate(pg);
+                       mutex_exit(&uvm_pageqlock);
+               }
+               ap->a_offlo = blkeof;
+               if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
+                       mutex_exit(vp->v_interlock);
+                       return 0;
+               }
+       }
+
+       /*
+        * Extend page range to start and end at block boundaries.
+        * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
+        */
+       origoffset = ap->a_offlo;
+       origendoffset = ap->a_offhi;
+       startoffset = origoffset & ~(fs->lfs_bmask);
+       max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
+                                              << fs->lfs_bshift;
+
+       if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
+               endoffset = max_endoffset;
+               origendoffset = endoffset;
+       } else {
+               origendoffset = round_page(ap->a_offhi);
+               endoffset = round_page(blkroundup(fs, origendoffset));
+       }
+
+       KASSERT(startoffset > 0 || endoffset >= startoffset);
+       if (startoffset == endoffset) {
+               /* Nothing to do, why were we called? */
+               mutex_exit(vp->v_interlock);
+               DLOG((DLOG_PAGE, "lfs_putpages: startoffset = endoffset = %"
+                     PRId64 "\n", startoffset));
+               return 0;
+       }
+
+       ap->a_offlo = startoffset;
+       ap->a_offhi = endoffset;
+
+       /*
+        * If not cleaning, just send the pages through genfs_putpages
+        * to be returned to the pool.
+        */
+       if (!(ap->a_flags & PGO_CLEANIT))
+               return genfs_putpages(v);
+
+       /* Set PGO_BUSYFAIL to avoid deadlocks */
+       ap->a_flags |= PGO_BUSYFAIL;
+
+       /*
+        * Likewise, if we are asked to clean but the pages are not
+        * dirty, we can just free them using genfs_putpages.
+        */
+#ifdef DEBUG
+       debug_n_dirtyclean = 0;
+#endif
+       do {
+               int r;
+
+               /* Count the number of dirty pages */
+               r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
+                               ap->a_flags, 1, NULL);
+               if (r < 0) {
+                       /* Pages are busy with another process */
+                       mutex_exit(vp->v_interlock);
+                       return EDEADLK;
+               }
+               if (r > 0) /* Some pages are dirty */
+                       break;
+
+               /*
+                * Sometimes pages are dirtied between the time that
+                * we check and the time we try to clean them.
+                * Instruct lfs_gop_write to return EDEADLK in this case
+                * so we can write them properly.
+                */
+               ip->i_lfs_iflags |= LFSI_NO_GOP_WRITE;
+               r = genfs_do_putpages(vp, startoffset, endoffset,
+                                      ap->a_flags & ~PGO_SYNCIO, &busypg);
+               ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
+               if (r != EDEADLK)
+                       return r;
+
+               /* One of the pages was busy.  Start over. */
+               mutex_enter(vp->v_interlock);
+               wait_for_page(vp, busypg, "dirtyclean");
+#ifdef DEBUG
+               ++debug_n_dirtyclean;
+#endif
+       } while (1);
+
+#ifdef DEBUG
+       if (debug_n_dirtyclean > TOOMANY)
+               printf("lfs_putpages: dirtyclean: looping, n = %d\n",
+                      debug_n_dirtyclean);
+#endif
+
+       /*
+        * Dirty and asked to clean.
+        *
+        * Pagedaemon can't actually write LFS pages; wake up
+        * the writer to take care of that.  The writer will
+        * notice the pager inode queue and act on that.
+        *
+        * XXX We must drop the vp->interlock before taking the lfs_lock or we
+        * get a nasty deadlock with lfs_flush_pchain().
+        */
+       if (pagedaemon) {
+               mutex_exit(vp->v_interlock);
+               mutex_enter(&lfs_lock);
+               if (!(ip->i_flags & IN_PAGING)) {
+                       ip->i_flags |= IN_PAGING;
+                       TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+               } 
+               wakeup(&lfs_writer_daemon);
+               mutex_exit(&lfs_lock);
+               preempt();
+               return EWOULDBLOCK;
+       }
+
+       /*
+        * If this is a file created in a recent dirop, we can't flush its
+        * inode until the dirop is complete.  Drain dirops, then flush the
+        * filesystem (taking care of any other pending dirops while we're
+        * at it).
+        */
+       if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
+           (vp->v_uflag & VU_DIROP)) {
+               int locked;
+
+               DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
+               /* XXX VOP_ISLOCKED() may not be used for lock decisions. */
+               locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+               mutex_exit(vp->v_interlock);
+               lfs_writer_enter(fs, "ppdirop");
+               if (locked)
+                       VOP_UNLOCK(vp); /* XXX why? */
+
+               mutex_enter(&lfs_lock);
+               lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
+               mutex_exit(&lfs_lock);
+
+               if (locked)
+                       VOP_LOCK(vp, LK_EXCLUSIVE);
+               mutex_enter(vp->v_interlock);
+               lfs_writer_leave(fs);
+
+               /* XXX the flush should have taken care of this one too! */
+       }
+
+       /*
+        * This is it.  We are going to write some pages.  From here on
+        * down it's all just mechanics.
+        *
+        * Don't let genfs_putpages wait; lfs_segunlock will wait for us.
+        */
+       ap->a_flags &= ~PGO_SYNCIO;
+
+       /*
+        * If we've already got the seglock, flush the node and return.
+        * The FIP has already been set up for us by lfs_writefile,
+        * and FIP cleanup and lfs_updatemeta will also be done there,
+        * unless genfs_putpages returns EDEADLK; then we must flush
+        * what we have, and correct FIP and segment header accounting.
+        */
+  get_seglock:
+       /*
+        * If we are not called with the segment locked, lock it.
+        * Account for a new FIP in the segment header, and set sp->vp.
+        * (This should duplicate the setup at the top of lfs_writefile().)
+        */
+       seglocked = (ap->a_flags & PGO_LOCKED) != 0;
+       if (!seglocked) {
+               mutex_exit(vp->v_interlock);
+               error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
+               if (error != 0)
+                       return error;
+               mutex_enter(vp->v_interlock);
+               lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
+       }
+       sp = fs->lfs_sp;
+       KASSERT(sp->vp == NULL);
+       sp->vp = vp;
+
+       /*
+        * Ensure that the partial segment is marked SS_DIROP if this
+        * vnode is a DIROP.
+        */
+       if (!seglocked && vp->v_uflag & VU_DIROP)
+               ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+       /*
+        * Loop over genfs_putpages until all pages are gathered.
+        * genfs_putpages() drops the interlock, so reacquire it if necessary.
+        * Whenever we lose the interlock we have to rerun check_dirty, as
+        * well, since more pages might have been dirtied in our absence.
+        */
+#ifdef DEBUG
+       debug_n_again = 0;
+#endif
+       do {
+               busypg = NULL;
+               if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
+                               ap->a_flags, 0, &busypg) < 0) {
+                       mutex_exit(vp->v_interlock);
+
+                       mutex_enter(vp->v_interlock);
+                       write_and_wait(fs, vp, busypg, seglocked, NULL);
+                       if (!seglocked) {
+                               mutex_exit(vp->v_interlock);
+                               lfs_release_finfo(fs);
+                               lfs_segunlock(fs);
+                               mutex_enter(vp->v_interlock);
+                       }
+                       sp->vp = NULL;
+                       goto get_seglock;
+               }
+       
+               busypg = NULL;
+               error = genfs_do_putpages(vp, startoffset, endoffset,
+                                          ap->a_flags, &busypg);
+       
+               if (error == EDEADLK || error == EAGAIN) {
+                       DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
+                             " %d ino %d off %x (seg %d)\n", error,
+                             ip->i_number, fs->lfs_offset,
+                             dtosn(fs, fs->lfs_offset)));
+
+                       mutex_enter(vp->v_interlock);
+                       write_and_wait(fs, vp, busypg, seglocked, "again");
+               }
+#ifdef DEBUG
+               ++debug_n_again;
+#endif
+       } while (error == EDEADLK);
+#ifdef DEBUG
+       if (debug_n_again > TOOMANY)
+               printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
+#endif
+
+       KASSERT(sp != NULL && sp->vp == vp);
+       if (!seglocked) {
+               sp->vp = NULL;
+
+               /* Write indirect blocks as well */
+               lfs_gather(fs, fs->lfs_sp, vp, lfs_match_indir);
+               lfs_gather(fs, fs->lfs_sp, vp, lfs_match_dindir);
+               lfs_gather(fs, fs->lfs_sp, vp, lfs_match_tindir);
+
+               KASSERT(sp->vp == NULL);
+               sp->vp = vp;
+       }
+
+       /*
+        * Blocks are now gathered into a segment waiting to be written.
+        * All that's left to do is update the metadata and write them.
+        */
+       lfs_updatemeta(sp);
+       KASSERT(sp->vp == vp);
+       sp->vp = NULL;
+
+       /*
+        * If we were called from lfs_writefile, we don't need to clean up
+        * the FIP or unlock the segment lock.  We're done.
+        */
+       if (seglocked)
+               return error;
+
+       /* Clean up FIP and send it to disk. */
+       lfs_release_finfo(fs);
+       lfs_writeseg(fs, fs->lfs_sp);
+
+       /*
+        * Remove us from paging queue if we wrote all our pages.
+        */
+       if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
+               mutex_enter(&lfs_lock);
+               if (ip->i_flags & IN_PAGING) {
+                       ip->i_flags &= ~IN_PAGING;
+                       TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+               }
+               mutex_exit(&lfs_lock);
+       }
+
+       /*
+        * XXX - with the malloc/copy writeseg, the pages are freed by now
+        * even if we don't wait (e.g. if we hold a nested lock).  This
+        * will not be true if we stop using malloc/copy.
+        */
+       KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
+       lfs_segunlock(fs);
+
+       /*
+        * Wait for v_numoutput to drop to zero.  The seglock should
+        * take care of this, but there is a slight possibility that
+        * aiodoned might not have got around to our buffers yet.
+        */
+       if (sync) {
+               mutex_enter(vp->v_interlock);
+               while (vp->v_numoutput > 0) {
+                       DLOG((DLOG_PAGE, "lfs_putpages: ino %d sleeping on"
+                             " num %d\n", ip->i_number, vp->v_numoutput));
+                       cv_wait(&vp->v_cv, vp->v_interlock);
+               }
+               mutex_exit(vp->v_interlock);
+       }
+       return error;
+}
+
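+/*
+ * Control-flow summary of lfs_putpages() above (descriptive only):
+ *
+ *     check_dirty()           round to block boundaries; busy the
+ *                             pages, marking each block dirty or clean
+ *     genfs_do_putpages()     gather the dirty pages into the segment
+ *     lfs_updatemeta()        record the blocks' new disk addresses
+ *     lfs_writeseg()          send the partial segment to disk
+ *     lfs_segunlock()         release the seglock; waits if SEGM_SYNC
+ */
+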
+/*
+ * Return the last logical file offset that should be written for this file
+ * if we're doing a write that ends at "size". If writing, we need to know
+ * about sizes on disk, i.e. fragments if there are any; if reading, we need
+ * to know about entire blocks.
+ */
+void
+lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
+{
+       struct inode *ip = VTOI(vp);
+       struct lfs *fs = ip->i_lfs;
+       daddr_t olbn, nlbn;
+
+       olbn = lblkno(fs, ip->i_size);
+       nlbn = lblkno(fs, size);
+       if (!(flags & GOP_SIZE_MEM) && nlbn < NDADDR && olbn <= nlbn) {
+               *eobp = fragroundup(fs, size);
+       } else {
+               *eobp = blkroundup(fs, size);
+       }
+}
+
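+/*
+ * Worked example for lfs_gop_size() (geometry assumed: 8K blocks, 1K
+ * fragments): extending a short file to size = 5000 within its direct
+ * blocks takes the fragment path, *eobp = fragroundup(fs, 5000) = 5120,
+ * so only five 1K fragments must exist on disk; with GOP_SIZE_MEM, or
+ * beyond the direct blocks, *eobp = blkroundup(fs, 5000) = 8192.
+ */
+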
+#ifdef DEBUG
+void lfs_dump_vop(void *);
+
+void
+lfs_dump_vop(void *v)
+{
+       struct vop_putpages_args /* {
+               struct vnode *a_vp;
+               voff_t a_offlo;
+               voff_t a_offhi;
+               int a_flags;
+       } */ *ap = v;
+
+#ifdef DDB
+       vfs_vnode_print(ap->a_vp, 0, printf);
+#endif
+       lfs_dump_dinode(VTOI(ap->a_vp)->i_din.ffs1_din);
+}
+#endif
+
+int
+lfs_mmap(void *v)
+{
+       struct vop_mmap_args /* {
+               const struct vnodeop_desc *a_desc;
+               struct vnode *a_vp;
+               vm_prot_t a_prot;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+
+       if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM)
+               return EOPNOTSUPP;
+       return ufs_mmap(v);
+}
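+
+/*
+ * Refusing to mmap the Ifile parallels lfs_getpages() above, which
+ * rejects write faults on it: the Ifile is maintained by the kernel
+ * and the cleaner, and user mappings dirtying its pages would bypass
+ * that accounting.
+ */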
diff --git a/sys/ufs/mfs/Makefile b/sys/ufs/mfs/Makefile
new file mode 100644 (file)
index 0000000..c0fdca9
--- /dev/null
@@ -0,0 +1,7 @@
+#      $NetBSD: Makefile,v 1.2 1999/07/03 18:40:32 thorpej Exp $
+
+INCSDIR= /usr/include/ufs/mfs
+
+INCS=  mfs_extern.h mfsnode.h
+
+.include <bsd.kinc.mk>
diff --git a/sys/ufs/mfs/mfs_miniroot.c b/sys/ufs/mfs/mfs_miniroot.c
new file mode 100644 (file)
index 0000000..cfd4a03
--- /dev/null
@@ -0,0 +1,68 @@
+/*     $NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $   */
+
+/*
+ * Copyright (c) 1989, 1990, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)mfs_vfsops.c        8.11 (Berkeley) 6/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_miniroot.c,v 1.1 2010/03/02 17:20:02 pooka Exp $");
+
+#include <sys/param.h>
+
+#include <ufs/mfs/mfs_extern.h>
+#include <ufs/ffs/fs.h>
+
+void * mfs_rootbase;   /* address of mini-root in kernel virtual memory */
+u_long mfs_rootsize;   /* size of mini-root in bytes */
+
+/*
+ * This is called early in boot to set the base address and size
+ * of the mini-root.
+ */
+int
+mfs_initminiroot(void *base)
+{
+       struct fs *fs = (struct fs *)((char *)base + SBLOCK_UFS1);
+       static bool inited = false;
+
+       if (inited)
+               panic("mfs_initminiroot() called more than once");
+       inited = true;
+
+       /* check for valid super block */
+       if (fs->fs_magic != FS_UFS1_MAGIC || fs->fs_bsize > MAXBSIZE ||
+           fs->fs_bsize < sizeof(struct fs))
+               return (0);
+       rootfstype = MOUNT_MFS;
+       mfs_rootbase = base;
+       mfs_rootsize = fs->fs_fsize * fs->fs_size;
+       rootdev = makedev(255, 0);
+       return (mfs_rootsize);
+}
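+
+/*
+ * Worked example of the size computation above (values assumed): a
+ * miniroot image with fs_fsize = 1024 and fs_size = 4096 fragments
+ * yields mfs_rootsize = 4 MB.  rootdev becomes the pseudo-device
+ * makedev(255, 0); minor 0 is reserved for the miniroot (see
+ * mfs_minor in mfs_vfsops.c).
+ */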
diff --git a/sys/ufs/mfs/mfs_vfsops.c b/sys/ufs/mfs/mfs_vfsops.c
new file mode 100644 (file)
index 0000000..292998d
--- /dev/null
@@ -0,0 +1,444 @@
+/*     $NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $   */
+
+/*
+ * Copyright (c) 1989, 1990, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)mfs_vfsops.c        8.11 (Berkeley) 6/19/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.103 2011/06/12 03:36:01 rmind Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_compat_netbsd.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/bufq.h>
+#include <sys/mount.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/kmem.h>
+#include <sys/module.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <ufs/mfs/mfsnode.h>
+#include <ufs/mfs/mfs_extern.h>
+
+MODULE(MODULE_CLASS_VFS, mfs, "ffs");
+
+kmutex_t mfs_lock;     /* global lock */
+
+/* used for building internal dev_t, minor == 0 reserved for miniroot */
+static int mfs_minor = 1;
+static int mfs_initcnt;
+
+extern int (**mfs_vnodeop_p)(void *);
+
+static struct sysctllog *mfs_sysctl_log;
+
+/*
+ * mfs vfs operations.
+ */
+
+extern const struct vnodeopv_desc mfs_vnodeop_opv_desc;
+
+const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = {
+       &mfs_vnodeop_opv_desc,
+       NULL,
+};
+
+struct vfsops mfs_vfsops = {
+       MOUNT_MFS,
+       sizeof (struct mfs_args),
+       mfs_mount,
+       mfs_start,
+       ffs_unmount,
+       ufs_root,
+       ufs_quotactl,
+       mfs_statvfs,
+       ffs_sync,
+       ffs_vget,
+       ffs_fhtovp,
+       ffs_vptofh,
+       mfs_init,
+       mfs_reinit,
+       mfs_done,
+       NULL,
+       (int (*)(struct mount *, struct vnode *, struct timespec *)) eopnotsupp,
+       vfs_stdextattrctl,
+       (void *)eopnotsupp,     /* vfs_suspendctl */
+       genfs_renamelock_enter,
+       genfs_renamelock_exit,
+       (void *)eopnotsupp,
+       mfs_vnodeopv_descs,
+       0,
+       { NULL, NULL },
+};
+
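+/*
+ * Note on the table above: MFS supplies only its own mount, start,
+ * statvfs and init/reinit/done entries; unmount, sync, vget, fhtovp
+ * and vptofh come straight from FFS, and root/quotactl from UFS,
+ * since an MFS volume is an ordinary FFS laid out on a memory-backed
+ * pseudo-device (hence the "ffs" dependency in MODULE() above).
+ */
+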
+static int
+mfs_modcmd(modcmd_t cmd, void *arg)
+{
+       int error;
+
+       switch (cmd) {
+       case MODULE_CMD_INIT:
+               error = vfs_attach(&mfs_vfsops);
+               if (error != 0)
+                       break;
+               sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT,
+                              CTLTYPE_NODE, "vfs", NULL,
+                              NULL, 0, NULL, 0,
+                              CTL_VFS, CTL_EOL);
+               sysctl_createv(&mfs_sysctl_log, 0, NULL, NULL,
+                              CTLFLAG_PERMANENT|CTLFLAG_ALIAS,
+                              CTLTYPE_NODE, "mfs",
+                              SYSCTL_DESCR("Memory based file system"),
+                              NULL, 1, NULL, 0,
+                              CTL_VFS, 3, CTL_EOL);
+               /*
+                * XXX the "1" and the "3" above could be dynamic, thereby
+                * eliminating one more instance of the "number to vfs"
+                * mapping problem, but they are in order as taken from
+                * sys/mount.h
+                */
+               break;
+       case MODULE_CMD_FINI:
+               error = vfs_detach(&mfs_vfsops);
+               if (error != 0)
+                       break;
+               sysctl_teardown(&mfs_sysctl_log);
+               break;
+       default:
+               error = ENOTTY;
+               break;
+       }
+
+       return (error);
+}
+
+/*
+ * Memory based filesystem initialization.
+ */
+void
+mfs_init(void)
+{
+
+       if (mfs_initcnt++ == 0) {
+               mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE);
+               ffs_init();
+       }
+}
+
+void
+mfs_reinit(void)
+{
+
+       ffs_reinit();
+}
+
+void
+mfs_done(void)
+{
+
+       if (--mfs_initcnt == 0) {
+               ffs_done();
+               mutex_destroy(&mfs_lock);
+       }
+}
+
+/*
+ * Called by main() when mfs is going to be mounted as root.
+ */
+
+int
+mfs_mountroot(void)
+{
+       struct fs *fs;
+       struct mount *mp;
+       struct lwp *l = curlwp;         /* XXX */
+       struct ufsmount *ump;
+       struct mfsnode *mfsp;
+       int error = 0;
+
+       if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) {
+               vrele(rootvp);
+               return (error);
+       }
+
+       mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
+       rootvp->v_data = mfsp;
+       rootvp->v_op = mfs_vnodeop_p;
+       rootvp->v_tag = VT_MFS;
+       mfsp->mfs_baseoff = mfs_rootbase;
+       mfsp->mfs_size = mfs_rootsize;
+       mfsp->mfs_vnode = rootvp;
+       mfsp->mfs_proc = NULL;          /* indicate kernel space */
+       mfsp->mfs_shutdown = 0;
+       cv_init(&mfsp->mfs_cv, "mfs");
+       mfsp->mfs_refcnt = 1;
+       bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
+       if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
+               vfs_unbusy(mp, false, NULL);
+               bufq_free(mfsp->mfs_buflist);
+               vfs_destroy(mp);
+               kmem_free(mfsp, sizeof(*mfsp));
+               return (error);
+       }
+       mutex_enter(&mountlist_lock);
+       CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+       mutex_exit(&mountlist_lock);
+       mp->mnt_vnodecovered = NULLVP;
+       ump = VFSTOUFS(mp);
+       fs = ump->um_fs;
+       (void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0);
+       (void)ffs_statvfs(mp, &mp->mnt_stat);
+       vfs_unbusy(mp, false, NULL);
+       return (0);
+}
+
+/*
+ * VFS Operations.
+ *
+ * mount system call
+ */
+/* ARGSUSED */
+int
+mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
+{
+       struct lwp *l = curlwp;
+       struct vnode *devvp;
+       struct mfs_args *args = data;
+       struct ufsmount *ump;
+       struct fs *fs;
+       struct mfsnode *mfsp;
+       struct proc *p;
+       int flags, error = 0;
+
+       if (*data_len < sizeof *args)
+               return EINVAL;
+
+       p = l->l_proc;
+       if (mp->mnt_flag & MNT_GETARGS) {
+               struct vnode *vp;
+
+               ump = VFSTOUFS(mp);
+               if (ump == NULL)
+                       return EIO;
+
+               vp = ump->um_devvp;
+               if (vp == NULL)
+                       return EIO;
+
+               mfsp = VTOMFS(vp);
+               if (mfsp == NULL)
+                       return EIO;
+
+               args->fspec = NULL;
+               args->base = mfsp->mfs_baseoff;
+               args->size = mfsp->mfs_size;
+               *data_len = sizeof *args;
+               return 0;
+       }
+       /*
+        * XXX turn off async to avoid hangs when writing lots of data.
+        * the problem is that MFS needs to allocate pages to clean pages,
+        * so if we wait until the last minute to clean pages then there
+        * may not be any pages available to do the cleaning.
+        * ... and since the default partially-synchronous mode turns out
+        * to not be sufficient under heavy load, make it full synchronous.
+        */
+       mp->mnt_flag &= ~MNT_ASYNC;
+       mp->mnt_flag |= MNT_SYNCHRONOUS;
+
+       /*
+        * If updating, check whether changing from read-only to
+        * read/write; if there is no device name, that's all we do.
+        */
+       if (mp->mnt_flag & MNT_UPDATE) {
+               ump = VFSTOUFS(mp);
+               fs = ump->um_fs;
+               if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+                       flags = WRITECLOSE;
+                       if (mp->mnt_flag & MNT_FORCE)
+                               flags |= FORCECLOSE;
+                       error = ffs_flushfiles(mp, flags, l);
+                       if (error)
+                               return (error);
+               }
+               if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR))
+                       fs->fs_ronly = 0;
+               if (args->fspec == NULL)
+                       return EINVAL;
+               return (0);
+       }
+       error = getnewvnode(VT_MFS, NULL, mfs_vnodeop_p, NULL, &devvp);
+       if (error)
+               return (error);
+       devvp->v_vflag |= VV_MPSAFE;
+       devvp->v_type = VBLK;
+       spec_node_init(devvp, makedev(255, mfs_minor));
+       mfs_minor++;
+       mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
+       devvp->v_data = mfsp;
+       mfsp->mfs_baseoff = args->base;
+       mfsp->mfs_size = args->size;
+       mfsp->mfs_vnode = devvp;
+       mfsp->mfs_proc = p;
+       mfsp->mfs_shutdown = 0;
+       cv_init(&mfsp->mfs_cv, "mfsidl");
+       mfsp->mfs_refcnt = 1;
+       bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
+       if ((error = ffs_mountfs(devvp, mp, l)) != 0) {
+               mfsp->mfs_shutdown = 1;
+               vrele(devvp);
+               return (error);
+       }
+       ump = VFSTOUFS(mp);
+       fs = ump->um_fs;
+       error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
+           UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
+       if (error)
+               return error;
+       (void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname,
+               sizeof(fs->fs_fsmnt));
+       fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0';
+       /* XXX: cleanup on error */
+       return 0;
+}
+
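+/*
+ * Typical usage sketch (sizes and paths illustrative): mount_mfs maps
+ * anonymous memory in the calling process and passes its base address
+ * and size in struct mfs_args, e.g.
+ *
+ *     mount_mfs -s 32m swap /tmp
+ *
+ * For such a mount, mount(2) does not return until the filesystem is
+ * unmounted: the calling process stays in the kernel in mfs_start()
+ * below, servicing I/O against its own address space.
+ */
+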
+/*
+ * Used to grab the process and keep it in the kernel to service
+ * memory filesystem I/O requests.
+ *
+ * Loop servicing I/O requests.
+ * Copy the requested data into or out of the memory filesystem
+ * address space.
+ */
+/* ARGSUSED */
+int
+mfs_start(struct mount *mp, int flags)
+{
+       struct vnode *vp;
+       struct mfsnode *mfsp;
+       struct proc *p;
+       struct buf *bp;
+       void *base;
+       int sleepreturn = 0, refcnt, error;
+       ksiginfoq_t kq;
+
+       /*
+        * Ensure that file system is still mounted when getting mfsnode.
+        * Add a reference to the mfsnode to prevent it from disappearing
+        * while we are in this routine.
+        */
+       if ((error = vfs_busy(mp, NULL)) != 0)
+               return error;
+       vp = VFSTOUFS(mp)->um_devvp;
+       mfsp = VTOMFS(vp);
+       mutex_enter(&mfs_lock);
+       mfsp->mfs_refcnt++;
+       mutex_exit(&mfs_lock);
+       vfs_unbusy(mp, false, NULL);
+
+       base = mfsp->mfs_baseoff;
+       mutex_enter(&mfs_lock);
+       while (mfsp->mfs_shutdown != 1) {
+               while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
+                       mutex_exit(&mfs_lock);
+                       mfs_doio(bp, base);
+                       mutex_enter(&mfs_lock);
+               }
+               /*
+                * If a non-ignored signal is received, try to unmount.
+                * If that fails, or the filesystem is already in the
+                * process of being unmounted, clear the signal (it has been
+                * "processed"), otherwise we will loop here, as
+                * cv_wait_sig() will keep returning EINTR/ERESTART.
+                */
+               if (sleepreturn != 0) {
+                       mutex_exit(&mfs_lock);
+                       if (dounmount(mp, 0, curlwp) != 0) {
+                               p = curproc;
+                               ksiginfo_queue_init(&kq);
+                               mutex_enter(p->p_lock);
+                               sigclearall(p, NULL, &kq);
+                               mutex_exit(p->p_lock);
+                               ksiginfo_queue_drain(&kq);
+                       }
+                       sleepreturn = 0;
+                       mutex_enter(&mfs_lock);
+                       continue;
+               }
+
+               sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock);
+       }
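+       /*
+        * Shutting down: the request queue must be drained by now;
+        * drop our reference and free the mfsnode if we are the last
+        * holder (mfs_reclaim() does the same on the vnode side).
+        */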
+       KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL);
+       refcnt = --mfsp->mfs_refcnt;
+       mutex_exit(&mfs_lock);
+       if (refcnt == 0) {
+               bufq_free(mfsp->mfs_buflist);
+               cv_destroy(&mfsp->mfs_cv);
+               kmem_free(mfsp, sizeof(*mfsp));
+       }
+       return (sleepreturn);
+}
+
+/*
+ * Get file system statistics.
+ */
+int
+mfs_statvfs(struct mount *mp, struct statvfs *sbp)
+{
+       int error;
+
+       error = ffs_statvfs(mp, sbp);
+       if (error)
+               return error;
+       (void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name,
+           sizeof(sbp->f_fstypename));
+       sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0';
+       return 0;
+}
diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c
new file mode 100644 (file)
index 0000000..53a2c58
--- /dev/null
@@ -0,0 +1,327 @@
+/*     $NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $   */
+
+/*
+ * Copyright (c) 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)mfs_vnops.c 8.11 (Berkeley) 5/22/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: mfs_vnops.c,v 1.54 2010/06/24 13:03:19 hannken Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/bufq.h>
+#include <sys/vnode.h>
+#include <sys/kmem.h>
+
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/specfs/specdev.h>
+
+#include <machine/vmparam.h>
+
+#include <ufs/mfs/mfsnode.h>
+#include <ufs/mfs/mfs_extern.h>
+
+/*
+ * mfs vnode operations.
+ */
+int (**mfs_vnodeop_p)(void *);
+const struct vnodeopv_entry_desc mfs_vnodeop_entries[] = {
+       { &vop_default_desc, vn_default_error },
+       { &vop_lookup_desc, mfs_lookup },               /* lookup */
+       { &vop_create_desc, mfs_create },               /* create */
+       { &vop_mknod_desc, mfs_mknod },                 /* mknod */
+       { &vop_open_desc, mfs_open },                   /* open */
+       { &vop_close_desc, mfs_close },                 /* close */
+       { &vop_access_desc, mfs_access },               /* access */
+       { &vop_getattr_desc, mfs_getattr },             /* getattr */
+       { &vop_setattr_desc, mfs_setattr },             /* setattr */
+       { &vop_read_desc, mfs_read },                   /* read */
+       { &vop_write_desc, mfs_write },                 /* write */
+       { &vop_ioctl_desc, mfs_ioctl },                 /* ioctl */
+       { &vop_poll_desc, mfs_poll },                   /* poll */
+       { &vop_revoke_desc, mfs_revoke },               /* revoke */
+       { &vop_mmap_desc, mfs_mmap },                   /* mmap */
+       { &vop_fsync_desc, spec_fsync },                /* fsync */
+       { &vop_seek_desc, mfs_seek },                   /* seek */
+       { &vop_remove_desc, mfs_remove },               /* remove */
+       { &vop_link_desc, mfs_link },                   /* link */
+       { &vop_rename_desc, mfs_rename },               /* rename */
+       { &vop_mkdir_desc, mfs_mkdir },                 /* mkdir */
+       { &vop_rmdir_desc, mfs_rmdir },                 /* rmdir */
+       { &vop_symlink_desc, mfs_symlink },             /* symlink */
+       { &vop_readdir_desc, mfs_readdir },             /* readdir */
+       { &vop_readlink_desc, mfs_readlink },           /* readlink */
+       { &vop_abortop_desc, mfs_abortop },             /* abortop */
+       { &vop_inactive_desc, mfs_inactive },           /* inactive */
+       { &vop_reclaim_desc, mfs_reclaim },             /* reclaim */
+       { &vop_lock_desc, genfs_nolock },               /* lock */
+       { &vop_unlock_desc, genfs_nounlock },           /* unlock */
+       { &vop_bmap_desc, mfs_bmap },                   /* bmap */
+       { &vop_strategy_desc, mfs_strategy },           /* strategy */
+       { &vop_print_desc, mfs_print },                 /* print */
+       { &vop_islocked_desc, mfs_islocked },           /* islocked */
+       { &vop_pathconf_desc, mfs_pathconf },           /* pathconf */
+       { &vop_advlock_desc, mfs_advlock },             /* advlock */
+       { &vop_bwrite_desc, mfs_bwrite },               /* bwrite */
+       { &vop_putpages_desc, mfs_putpages },           /* putpages */
+       { NULL, NULL }
+};
+const struct vnodeopv_desc mfs_vnodeop_opv_desc =
+       { &mfs_vnodeop_p, mfs_vnodeop_entries };
+
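+/*
+ * Only open, close, strategy, bmap, inactive, reclaim and print are
+ * implemented in this file; the remaining mfs_* names above are
+ * expected to be aliases (see mfs_extern.h) for generic or spec
+ * vnode routines.
+ */
+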
+/*
+ * Vnode Operations.
+ *
+ * Open is called to allow the memory filesystem to initialize and
+ * validate before actual I/O.  Record our process identifier so we
+ * can tell when we are doing I/O to ourselves.
+ */
+/* ARGSUSED */
+int
+mfs_open(void *v)
+{
+       struct vop_open_args /* {
+               struct vnode *a_vp;
+               int  a_mode;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+
+       if (ap->a_vp->v_type != VBLK) {
+               panic("mfs_open: not VBLK");
+               /* NOTREACHED */
+       }
+       return (0);
+}
+
+/*
+ * Pass I/O requests to the memory filesystem process.
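+ *
+ * There are four cases: a mini-root image (no server process) is
+ * copied directly in kernel space; a request from the server process
+ * itself is serviced inline; during shutdown the I/O is discarded
+ * (reads are flagged with a warning); otherwise the buffer is queued
+ * and the server is woken to service it via mfs_start().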
+ */
+int
+mfs_strategy(void *v)
+{
+       struct vop_strategy_args /* {
+               struct vnode *a_vp;
+               struct buf *a_bp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct buf *bp = ap->a_bp;
+       struct mfsnode *mfsp;
+
+       if (vp->v_type != VBLK || vp->v_usecount == 0)
+               panic("mfs_strategy: bad dev");
+       mfsp = VTOMFS(vp);
+       /* check for mini-root access */
+       if (mfsp->mfs_proc == NULL) {
+               void *base;
+
+               base = (char *)mfsp->mfs_baseoff + (bp->b_blkno << DEV_BSHIFT);
+               if (bp->b_flags & B_READ)
+                       memcpy(bp->b_data, base, bp->b_bcount);
+               else
+                       memcpy(base, bp->b_data, bp->b_bcount);
+               bp->b_resid = 0;
+               biodone(bp);
+       } else if (mfsp->mfs_proc == curproc) {
+               mfs_doio(bp, mfsp->mfs_baseoff);
+       } else if (doing_shutdown) {
+               /*
+                * bitbucket I/O during shutdown.
+                * Note that reads should *not* happen here, but..
+                */
+               if (bp->b_flags & B_READ)
+                       printf("warning: mfs read during shutdown\n");
+               bp->b_resid = 0;
+               biodone(bp);
+       } else {
+               mutex_enter(&mfs_lock);
+               bufq_put(mfsp->mfs_buflist, bp);
+               cv_broadcast(&mfsp->mfs_cv);
+               mutex_exit(&mfs_lock);
+       }
+       return (0);
+}
+
+/*
+ * Memory file system I/O.
+ */
+void
+mfs_doio(struct buf *bp, void *base)
+{
+
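+       /*
+        * b_blkno counts DEV_BSIZE sectors, so shifting by DEV_BSHIFT
+        * turns it into a byte offset into the memory disk.  copyin()
+        * and copyout() are used because the backing store lives in
+        * the server process's address space, in whose context we run.
+        */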
+       base = (char *)base + (bp->b_blkno << DEV_BSHIFT);
+       if (bp->b_flags & B_READ)
+               bp->b_error = copyin(base, bp->b_data, bp->b_bcount);
+       else
+               bp->b_error = copyout(bp->b_data, base, bp->b_bcount);
+       if (bp->b_error == 0)
+               bp->b_resid = 0;
+       biodone(bp);
+}
+
+/*
+ * This is a noop, simply returning what one has been given.
+ */
+int
+mfs_bmap(void *v)
+{
+       struct vop_bmap_args /* {
+               struct vnode *a_vp;
+               daddr_t  a_bn;
+               struct vnode **a_vpp;
+               daddr_t *a_bnp;
+               int *a_runp;
+       } */ *ap = v;
+
+       if (ap->a_vpp != NULL)
+               *ap->a_vpp = ap->a_vp;
+       if (ap->a_bnp != NULL)
+               *ap->a_bnp = ap->a_bn;
+       if (ap->a_runp != NULL)
+                *ap->a_runp = 0;
+       return (0);
+}
+
+/*
+ * Memory filesystem close routine
+ */
+/* ARGSUSED */
+int
+mfs_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode *a_vp;
+               int  a_fflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct mfsnode *mfsp = VTOMFS(vp);
+       struct buf *bp;
+       int error;
+
+       /*
+        * Finish any pending I/O requests.
+        */
+       mutex_enter(&mfs_lock);
+       while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
+               mutex_exit(&mfs_lock);
+               mfs_doio(bp, mfsp->mfs_baseoff);
+               mutex_enter(&mfs_lock);
+       }
+       mutex_exit(&mfs_lock);
+       /*
+        * On the last close of a memory filesystem, we must
+        * invalidate any in-core blocks so that we can free
+        * up its vnode.
+        */
+       if ((error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0)) != 0)
+               return (error);
+       /*
+        * There should be no way to have any more uses of this
+        * vnode; if we find any other uses, we panic.
+        */
+       if (bufq_peek(mfsp->mfs_buflist) != NULL)
+               panic("mfs_close");
+       /*
+        * Send a request to the filesystem server to exit.
+        */
+       mutex_enter(&mfs_lock);
+       mfsp->mfs_shutdown = 1;
+       cv_broadcast(&mfsp->mfs_cv);
+       mutex_exit(&mfs_lock);
+       return (0);
+}
+
+/*
+ * Memory filesystem inactive routine
+ */
+/* ARGSUSED */
+int
+mfs_inactive(void *v)
+{
+       struct vop_inactive_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct mfsnode *mfsp = VTOMFS(vp);
+
+       if (bufq_peek(mfsp->mfs_buflist) != NULL)
+               panic("mfs_inactive: not inactive (mfs_buflist %p)",
+                       bufq_peek(mfsp->mfs_buflist));
+       VOP_UNLOCK(vp);
+       return (0);
+}
+
+/*
+ * Reclaim a memory filesystem devvp so that it can be reused.
+ */
+int
+mfs_reclaim(void *v)
+{
+       struct vop_reclaim_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct mfsnode *mfsp = VTOMFS(vp);
+       int refcnt;
+
+       mutex_enter(&mfs_lock);
+       vp->v_data = NULL;
+       refcnt = --mfsp->mfs_refcnt;
+       mutex_exit(&mfs_lock);
+
+       if (refcnt == 0) {
+               bufq_free(mfsp->mfs_buflist);
+               cv_destroy(&mfsp->mfs_cv);
+               kmem_free(mfsp, sizeof(*mfsp));
+       }
+
+       return (0);
+}
+
+/*
+ * Print out the contents of an mfsnode.
+ */
+int
+mfs_print(void *v)
+{
+       struct vop_print_args /* {
+               struct vnode *a_vp;
+       } */ *ap = v;
+       struct mfsnode *mfsp = VTOMFS(ap->a_vp);
+
+       printf("tag VT_MFS, pid %d, base %p, size %ld\n",
+           (mfsp->mfs_proc != NULL) ? mfsp->mfs_proc->p_pid : 0,
+           mfsp->mfs_baseoff, mfsp->mfs_size);
+       return (0);
+}
diff --git a/sys/ufs/ufs/Makefile b/sys/ufs/ufs/Makefile
new file mode 100644 (file)
index 0000000..6f08db6
--- /dev/null
@@ -0,0 +1,8 @@
+#      $NetBSD: Makefile,v 1.7 2011/03/06 17:08:39 bouyer Exp $
+
+INCSDIR= /usr/include/ufs/ufs
+
+INCS=  dinode.h dir.h extattr.h inode.h quota.h quota1.h quota2.h \
+       ufs_bswap.h ufs_extern.h ufs_wapbl.h ufsmount.h
+
+.include <bsd.kinc.mk>
similarity index 100%
rename from include/ufs/ufs/dir.h
rename to sys/ufs/ufs/dir.h
similarity index 100%
rename from include/ufs/ufs/inode.h
rename to sys/ufs/ufs/inode.h
similarity index 100%
rename from include/ufs/ufs/quota.h
rename to sys/ufs/ufs/quota.h
diff --git a/sys/ufs/ufs/quota1_subr.c b/sys/ufs/ufs/quota1_subr.c
new file mode 100644 (file)
index 0000000..ff6a06c
--- /dev/null
@@ -0,0 +1,95 @@
+/* $NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: quota1_subr.c,v 1.6 2011/11/25 16:55:05 dholland Exp $");
+
+#include <sys/types.h>
+#include <machine/limits.h>
+
+#include <sys/quota.h>
+#include <quota/quotaprop.h>
+#include <ufs/ufs/quota1.h>
+
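+/*
+ * In the legacy dqblk format a limit of 0 means "no limit", while
+ * struct quotaval uses UQUAD_MAX for that; the helpers below also
+ * shift non-zero limits by one, so a dqblk limit of N maps to N - 1
+ * in quotaval terms and back again.
+ */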
+static uint64_t
+dqblk2q2e_limit(uint32_t lim)
+{
+       if (lim == 0)
+               return UQUAD_MAX;
+       else
+               return (lim - 1);
+}
+
+static uint32_t
+q2e2dqblk_limit(uint64_t lim)
+{
+       if (lim == UQUAD_MAX)
+               return 0;
+       else
+               return (lim + 1);
+}
+
+void
+dqblk_to_quotaval(const struct dqblk *dqblk, struct quotaval *qv)
+{
+       /* XXX is qv_grace getting handled correctly? */
+
+       qv[QUOTA_LIMIT_BLOCK].qv_hardlimit =
+           dqblk2q2e_limit(dqblk->dqb_bhardlimit);
+       qv[QUOTA_LIMIT_BLOCK].qv_softlimit =
+           dqblk2q2e_limit(dqblk->dqb_bsoftlimit);
+       qv[QUOTA_LIMIT_BLOCK].qv_usage       = dqblk->dqb_curblocks;
+       qv[QUOTA_LIMIT_BLOCK].qv_expiretime      = dqblk->dqb_btime;
+
+       qv[QUOTA_LIMIT_FILE].qv_hardlimit =
+           dqblk2q2e_limit(dqblk->dqb_ihardlimit);
+       qv[QUOTA_LIMIT_FILE].qv_softlimit =
+           dqblk2q2e_limit(dqblk->dqb_isoftlimit);
+       qv[QUOTA_LIMIT_FILE].qv_usage       = dqblk->dqb_curinodes;
+       qv[QUOTA_LIMIT_FILE].qv_expiretime      = dqblk->dqb_itime;
+}
+
+void
+quotaval_to_dqblk(const struct quotaval *qv, struct dqblk *dqblk)
+{
+       /* XXX is qv_grace getting handled correctly? */
+
+       dqblk->dqb_bhardlimit =
+           q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_hardlimit);
+       dqblk->dqb_bsoftlimit =
+           q2e2dqblk_limit(qv[QUOTA_LIMIT_BLOCK].qv_softlimit);
+       dqblk->dqb_curblocks  = qv[QUOTA_LIMIT_BLOCK].qv_usage;
+       dqblk->dqb_btime      = qv[QUOTA_LIMIT_BLOCK].qv_expiretime;
+
+       dqblk->dqb_ihardlimit =
+           q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_hardlimit);
+       dqblk->dqb_isoftlimit =
+           q2e2dqblk_limit(qv[QUOTA_LIMIT_FILE].qv_softlimit);
+       dqblk->dqb_curinodes  = qv[QUOTA_LIMIT_FILE].qv_usage;
+       dqblk->dqb_itime      = qv[QUOTA_LIMIT_FILE].qv_expiretime;
+}
+
diff --git a/sys/ufs/ufs/quota2_subr.c b/sys/ufs/ufs/quota2_subr.c
new file mode 100644 (file)
index 0000000..f91007f
--- /dev/null
@@ -0,0 +1,108 @@
+/* $NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */
+/*-
+ * Copyright (c) 2010 Manuel Bouyer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: quota2_subr.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/time.h>
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/quota2.h>
+
+#ifndef _KERNEL
+#include <string.h>
+#endif
+
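+/*
+ * Chain the quota entries between 'baseoff' and the end of block 'bp'
+ * onto the header's free list, storing (possibly byte-swapped)
+ * on-disk byte offsets.
+ */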
+void
+quota2_addfreeq2e(struct quota2_header *q2h, void *bp, uint64_t baseoff,
+    uint64_t bsize, int ns)
+{
+       uint64_t blkoff = baseoff % bsize;
+       int i, nq2e;
+       struct quota2_entry *q2e;
+
+       q2e = (void *)((char *)bp + blkoff);
+       nq2e = (bsize - blkoff) / sizeof(*q2e);
+       for (i = 0; i < nq2e; i++) {
+               q2e[i].q2e_next = q2h->q2h_free;
+               q2h->q2h_free = ufs_rw64(i * sizeof(*q2e) + baseoff, ns);
+       }
+}
+
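+/*
+ * Initialise block 0 of a quota2 file: write the header with its
+ * magic number and hash parameters, set the default entry (no limits,
+ * 7-day grace), and chain the rest of the block onto the free list.
+ */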
+void
+quota2_create_blk0(uint64_t bsize, void *bp, int q2h_hash_shift, int type,
+    int ns)
+{
+       struct quota2_header *q2h;
+       const int quota2_hash_size = 1 << q2h_hash_shift;
+       const int quota2_full_header_size = sizeof(struct quota2_header) +
+           sizeof(q2h->q2h_entries[0]) * quota2_hash_size;
+       int i;
+
+       memset(bp, 0, bsize);
+       q2h = bp;
+       q2h->q2h_magic_number = ufs_rw32(Q2_HEAD_MAGIC, ns);
+       q2h->q2h_type = type;
+       q2h->q2h_hash_shift = q2h_hash_shift;
+       q2h->q2h_hash_size = ufs_rw16(quota2_hash_size, ns);
+       /* set up default entry: unlimited, 7 days grace */
+       for (i = 0; i < N_QL; i++) {
+               q2h->q2h_defentry.q2e_val[i].q2v_hardlimit =
+                   q2h->q2h_defentry.q2e_val[i].q2v_softlimit =
+                   ufs_rw64(UQUAD_MAX, ns);
+               q2h->q2h_defentry.q2e_val[i].q2v_grace =
+                   ufs_rw64(7ULL * 24ULL * 3600ULL, ns);
+       }
+
+       /* first quota entry, after the hash table */
+       quota2_addfreeq2e(q2h, bp, quota2_full_header_size, bsize, ns);
+}
+
+void
+quota2_ufs_rwq2v(const struct quota2_val *s, struct quota2_val *d, int needswap)
+{
+       d->q2v_hardlimit = ufs_rw64(s->q2v_hardlimit, needswap);
+       d->q2v_softlimit = ufs_rw64(s->q2v_softlimit, needswap);
+       d->q2v_cur = ufs_rw64(s->q2v_cur, needswap);
+       d->q2v_time = ufs_rw64(s->q2v_time, needswap);
+       d->q2v_grace = ufs_rw64(s->q2v_grace, needswap);
+}
+
+void
+quota2_ufs_rwq2e(const struct quota2_entry *s, struct quota2_entry *d,
+    int needswap)
+{
+       quota2_ufs_rwq2v(&s->q2e_val[QL_BLOCK], &d->q2e_val[QL_BLOCK],
+           needswap);
+       quota2_ufs_rwq2v(&s->q2e_val[QL_FILE], &d->q2e_val[QL_FILE],
+           needswap);
+       d->q2e_uid = ufs_rw32(s->q2e_uid, needswap);
+}
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
new file mode 100644 (file)
index 0000000..3420e22
--- /dev/null
@@ -0,0 +1,405 @@
+/*     $NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $     */
+
+/*
+ * Copyright (c) 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_bmap.c  8.8 (Berkeley) 8/11/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.49 2011/03/06 17:08:39 bouyer Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/trace.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+
+static bool
+ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
+{
+
+       /* for ufs, blocks in a hole are not 'contiguous'. */
+       if (daddr0 == 0)
+               return false;
+
+       return (daddr0 + ump->um_seqinc == daddr1);
+}
+
+/*
+ * Bmap converts the logical block number of a file to its physical block
+ * number on the disk. The conversion is done by using the logical block
+ * number to index into the array of block pointers described by the dinode.
+ */
+int
+ufs_bmap(void *v)
+{
+       struct vop_bmap_args /* {
+               struct vnode *a_vp;
+               daddr_t  a_bn;
+               struct vnode **a_vpp;
+               daddr_t *a_bnp;
+               int *a_runp;
+       } */ *ap = v;
+       int error;
+
+       /*
+        * Check for underlying vnode requests and ensure that logical
+        * to physical mapping is requested.
+        */
+       if (ap->a_vpp != NULL)
+               *ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
+       if (ap->a_bnp == NULL)
+               return (0);
+
+       fstrans_start(ap->a_vp->v_mount, FSTRANS_SHARED);
+       error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
+           ap->a_runp, ufs_issequential);
+       fstrans_done(ap->a_vp->v_mount);
+       return error;
+}
+
+/*
+ * Indirect blocks are now on the vnode for the file.  They are given negative
+ * logical block numbers.  Indirect blocks are addressed by the negative
+ * address of the first data block to which they point.  Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point.  Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ufs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
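+ *
+ * As an illustration, with NDADDR direct blocks and NINDIR pointers
+ * per indirect block, the single indirect block is logical block
+ * -NDADDR and the double indirect block is -(NDADDR + NINDIR) - 1,
+ * one less than the address of the first single indirect block it
+ * points to.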
+ */
+
+int
+ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
+    int *nump, int *runp, ufs_issequential_callback_t is_sequential)
+{
+       struct inode *ip;
+       struct buf *bp, *cbp;
+       struct ufsmount *ump;
+       struct mount *mp;
+       struct indir a[NIADDR + 1], *xap;
+       daddr_t daddr;
+       daddr_t metalbn;
+       int error, maxrun = 0, num;
+
+       ip = VTOI(vp);
+       mp = vp->v_mount;
+       ump = ip->i_ump;
+#ifdef DIAGNOSTIC
+       if ((ap != NULL && nump == NULL) || (ap == NULL && nump != NULL))
+               panic("ufs_bmaparray: invalid arguments");
+#endif
+
+       if (runp) {
+               /*
+                * XXX
+                * If MAXBSIZE is the largest transfer the disks can handle,
+                * we probably want maxrun to be 1 block less so that we
+                * don't create a block larger than the device can handle.
+                */
+               *runp = 0;
+               maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
+       }
+
+       if (bn >= 0 && bn < NDADDR) {
+               if (nump != NULL)
+                       *nump = 0;
+               if (ump->um_fstype == UFS1)
+                       daddr = ufs_rw32(ip->i_ffs1_db[bn],
+                           UFS_MPNEEDSWAP(ump));
+               else
+                       daddr = ufs_rw64(ip->i_ffs2_db[bn],
+                           UFS_MPNEEDSWAP(ump));
+               *bnp = blkptrtodb(ump, daddr);
+               /*
+                * Since this is FFS independent code, we are out of
+                * scope for the definitions of BLK_NOCOPY and
+                * BLK_SNAP, but we do know that they will fall in
+                * the range 1..um_seqinc, so we use that test and
+                * return a request for a zeroed out buffer if attempts
+                * are made to read a BLK_NOCOPY or BLK_SNAP block.
+                */
+               if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
+                   && daddr > 0 &&
+                   daddr < ump->um_seqinc) {
+                       *bnp = -1;
+               } else if (*bnp == 0) {
+                       if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
+                           == SF_SNAPSHOT) {
+                               *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+                       } else {
+                               *bnp = -1;
+                       }
+               } else if (runp) {
+                       if (ump->um_fstype == UFS1) {
+                               for (++bn; bn < NDADDR && *runp < maxrun &&
+                                   is_sequential(ump,
+                                       ufs_rw32(ip->i_ffs1_db[bn - 1],
+                                           UFS_MPNEEDSWAP(ump)),
+                                       ufs_rw32(ip->i_ffs1_db[bn],
+                                           UFS_MPNEEDSWAP(ump)));
+                                   ++bn, ++*runp);
+                       } else {
+                               for (++bn; bn < NDADDR && *runp < maxrun &&
+                                   is_sequential(ump,
+                                       ufs_rw64(ip->i_ffs2_db[bn - 1],
+                                           UFS_MPNEEDSWAP(ump)),
+                                       ufs_rw64(ip->i_ffs2_db[bn],
+                                           UFS_MPNEEDSWAP(ump)));
+                                   ++bn, ++*runp);
+                       }
+               }
+               return (0);
+       }
+
+       xap = ap == NULL ? a : ap;
+       if (!nump)
+               nump = &num;
+       if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
+               return (error);
+
+       num = *nump;
+
+       /* Get disk address out of indirect block array */
+       if (ump->um_fstype == UFS1)
+               daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
+                   UFS_MPNEEDSWAP(ump));
+       else
+               daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
+                   UFS_MPNEEDSWAP(ump));
+
+       for (bp = NULL, ++xap; --num; ++xap) {
+               /*
+                * Exit the loop if there is no disk address assigned yet and
+                * the indirect block isn't in the cache, or if we were
+                * looking for an indirect block and we've found it.
+                */
+
+               metalbn = xap->in_lbn;
+               if (metalbn == bn)
+                       break;
+               if (daddr == 0) {
+                       mutex_enter(&bufcache_lock);
+                       cbp = incore(vp, metalbn);
+                       mutex_exit(&bufcache_lock);
+                       if (cbp == NULL)
+                               break;
+               }
+
+               /*
+                * If we get here, we've either got the block in the cache
+                * or we have a disk address for it, go fetch it.
+                */
+               if (bp)
+                       brelse(bp, 0);
+
+               xap->in_exists = 1;
+               bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
+               if (bp == NULL) {
+
+                       /*
+                        * getblk() above returns NULL only if we are
+                        * the pagedaemon.  See the implementation of
+                        * getblk for details.
+                        */
+
+                       return (ENOMEM);
+               }
+               if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
+                       trace(TR_BREADHIT, pack(vp, size), metalbn);
+               }
+#ifdef DIAGNOSTIC
+               else if (!daddr)
+                       panic("ufs_bmaparray: indirect block not in cache");
+#endif
+               else {
+                       trace(TR_BREADMISS, pack(vp, size), metalbn);
+                       bp->b_blkno = blkptrtodb(ump, daddr);
+                       bp->b_flags |= B_READ;
+                       BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+                       VOP_STRATEGY(vp, bp);
+                       curlwp->l_ru.ru_inblock++;      /* XXX */
+                       if ((error = biowait(bp)) != 0) {
+                               brelse(bp, 0);
+                               return (error);
+                       }
+               }
+               if (ump->um_fstype == UFS1) {
+                       daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
+                           UFS_MPNEEDSWAP(ump));
+                       if (num == 1 && daddr && runp) {
+                               for (bn = xap->in_off + 1;
+                                   bn < MNINDIR(ump) && *runp < maxrun &&
+                                   is_sequential(ump,
+                                       ufs_rw32(((int32_t *)bp->b_data)[bn-1],
+                                           UFS_MPNEEDSWAP(ump)),
+                                       ufs_rw32(((int32_t *)bp->b_data)[bn],
+                                           UFS_MPNEEDSWAP(ump)));
+                                   ++bn, ++*runp);
+                       }
+               } else {
+                       daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
+                           UFS_MPNEEDSWAP(ump));
+                       if (num == 1 && daddr && runp) {
+                               for (bn = xap->in_off + 1;
+                                   bn < MNINDIR(ump) && *runp < maxrun &&
+                                   is_sequential(ump,
+                                       ufs_rw64(((int64_t *)bp->b_data)[bn-1],
+                                           UFS_MPNEEDSWAP(ump)),
+                                       ufs_rw64(((int64_t *)bp->b_data)[bn],
+                                           UFS_MPNEEDSWAP(ump)));
+                                   ++bn, ++*runp);
+                       }
+               }
+       }
+       if (bp)
+               brelse(bp, 0);
+
+       /*
+        * Since this is FFS independent code, we are out of scope for the
+        * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+        * will fall in the range 1..um_seqinc, so we use that test and
+        * return a request for a zeroed out buffer if attempts are made
+        * to read a BLK_NOCOPY or BLK_SNAP block.
+        */
+       if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
+           && daddr > 0 && daddr < ump->um_seqinc) {
+               *bnp = -1;
+               return (0);
+       }
+       *bnp = blkptrtodb(ump, daddr);
+       if (*bnp == 0) {
+               if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
+                   == SF_SNAPSHOT) {
+                       *bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+               } else {
+                       *bnp = -1;
+               }
+       }
+       return (0);
+}
+
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block.  The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ffs1_ib and
+ * once with the offset into the page itself.
+ */
+int
+ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
+{
+       daddr_t metalbn, realbn;
+       struct ufsmount *ump;
+       int64_t blockcnt;
+       int lbc;
+       int i, numlevels, off;
+
+       ump = VFSTOUFS(vp->v_mount);
+       if (nump)
+               *nump = 0;
+       numlevels = 0;
+       realbn = bn;
+       if (bn < 0)
+               bn = -bn;
+       KASSERT(bn >= NDADDR);
+
+       /*
+        * Determine the number of levels of indirection.  After this loop
+        * is done, blockcnt indicates the number of data blocks possible
+        * at the given level of indirection, and NIADDR - i is the number
+        * of levels of indirection needed to locate the requested block.
+        */
+
+       bn -= NDADDR;
+       for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) {
+               if (i == 0)
+                       return (EFBIG);
+
+               lbc += ump->um_lognindir;
+               blockcnt = (int64_t)1 << lbc;
+
+               if (bn < blockcnt)
+                       break;
+       }
+
+       /* Calculate the address of the first meta-block. */
+       metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + NIADDR - i);
+
+       /*
+        * At each iteration, off is the offset into the bap array which is
+        * an array of disk addresses at the current level of indirection.
+        * The logical block number and the offset in that block are stored
+        * into the argument array.
+        */
+       ap->in_lbn = metalbn;
+       ap->in_off = off = NIADDR - i;
+       ap->in_exists = 0;
+       ap++;
+       for (++numlevels; i <= NIADDR; i++) {
+               /* If searching for a meta-data block, quit when found. */
+               if (metalbn == realbn)
+                       break;
+
+               lbc -= ump->um_lognindir;
+               off = (bn >> lbc) & (MNINDIR(ump) - 1);
+
+               ++numlevels;
+               ap->in_lbn = metalbn;
+               ap->in_off = off;
+               ap->in_exists = 0;
+               ++ap;
+
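+               /* Step down to the indirect block at the next level. */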
+               metalbn -= -1 + ((int64_t)off << lbc);
+       }
+       if (nump)
+               *nump = numlevels;
+       return (0);
+}
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
new file mode 100644 (file)
index 0000000..e893a93
--- /dev/null
@@ -0,0 +1,1171 @@
+/*     $NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $   */
+
+/*
+ * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.34 2009/10/05 23:48:08 rmind Exp $");
+
+/*
+ * This implements a hash-based lookup scheme for UFS directories.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/types.h>
+#include <sys/hash.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/pool.h>
+#include <sys/sysctl.h>
+#include <sys/atomic.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/dirhash.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+
+#define WRAPINCR(val, limit)   (((val) + 1 == (limit)) ? 0 : ((val) + 1))
+#define WRAPDECR(val, limit)   (((val) == 0) ? ((limit) - 1) : ((val) - 1))
+#define OFSFMT(ip)             ((ip)->i_ump->um_maxsymlinklen <= 0)
+#define BLKFREE2IDX(n)         ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
+
+static u_int ufs_dirhashminblks = 5;
+static u_int ufs_dirhashmaxmem = 2 * 1024 * 1024;
+static u_int ufs_dirhashmem;
+static u_int ufs_dirhashcheck = 0;
+
+static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
+static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
+          int dirblksiz);
+static void ufsdirhash_delslot(struct dirhash *dh, int slot);
+static int ufsdirhash_findslot(struct dirhash *dh, const char *name,
+          int namelen, doff_t offset);
+static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset,
+          int dirblksiz);
+static int ufsdirhash_recycle(int wanted);
+
+static pool_cache_t ufsdirhashblk_cache;
+static pool_cache_t ufsdirhash_cache;
+
+#define DIRHASHLIST_LOCK()             mutex_enter(&ufsdirhash_lock)
+#define DIRHASHLIST_UNLOCK()           mutex_exit(&ufsdirhash_lock)
+#define DIRHASH_LOCK(dh)               mutex_enter(&(dh)->dh_lock)
+#define DIRHASH_UNLOCK(dh)             mutex_exit(&(dh)->dh_lock)
+#define DIRHASH_BLKALLOC()             \
+    pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT)
+#define DIRHASH_BLKFREE(ptr)           \
+    pool_cache_put(ufsdirhashblk_cache, ptr)
+
+/* Dirhash list; recently-used entries are near the tail. */
+static TAILQ_HEAD(, dirhash) ufsdirhash_list;
+
+/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
+static kmutex_t ufsdirhash_lock;
+
+static struct sysctllog *ufsdirhash_sysctl_log;
+
+/*
+ * Locking order:
+ *     ufsdirhash_lock
+ *     dh_lock
+ *
+ * The dh_lock mutex should be acquired either via the inode lock, or via
+ * ufsdirhash_lock. Only the owner of the inode may free the associated
+ * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ */
+
+/*
+ * Attempt to build up a hash table for the directory contents in
+ * inode 'ip'. Returns 0 on success, or -1 if the operation failed.
+ */
+int
+ufsdirhash_build(struct inode *ip)
+{
+       struct dirhash *dh;
+       struct buf *bp = NULL;
+       struct direct *ep;
+       struct vnode *vp;
+       doff_t bmask, pos;
+       int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       /* Check if we can/should use dirhash. */
+       if (ip->i_dirhash == NULL) {
+               if (ip->i_size < (ufs_dirhashminblks * dirblksiz) || OFSFMT(ip))
+                       return (-1);
+       } else {
+               /* Hash exists, but sysctls could have changed. */
+               if (ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
+                   ufs_dirhashmem > ufs_dirhashmaxmem) {
+                       ufsdirhash_free(ip);
+                       return (-1);
+               }
+               /* Check if hash exists and is intact (note: unlocked read). */
+               if (ip->i_dirhash->dh_hash != NULL)
+                       return (0);
+               /* Free the old, recycled hash and build a new one. */
+               ufsdirhash_free(ip);
+       }
+
+       /* Don't hash removed directories. */
+       if (ip->i_nlink == 0)
+               return (-1);
+
+       vp = ip->i_vnode;
+       /* Allocate 50% more entries than this dir size could ever need. */
+       KASSERT(ip->i_size >= dirblksiz);
+       nslots = ip->i_size / DIRECTSIZ(1);
+       nslots = (nslots * 3 + 1) / 2;
+       narrays = howmany(nslots, DH_NBLKOFF);
+       nslots = narrays * DH_NBLKOFF;
+       dirblocks = howmany(ip->i_size, dirblksiz);
+       nblocks = (dirblocks * 3 + 1) / 2;
+
+       memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
+           narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+           nblocks * sizeof(*dh->dh_blkfree);
+
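+       /*
+        * Optimistically reserve the memory with an atomic add; if the
+        * total would exceed the limit, back the reservation out and
+        * try to reclaim space from other dirhashes.
+        */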
+       while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) >
+           ufs_dirhashmaxmem) {
+               atomic_add_int(&ufs_dirhashmem, -memreqd);
+               if (memreqd > ufs_dirhashmaxmem / 2)
+                       return (-1);
+               /* Try to free some space. */
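+               /*
+                * Note: ufsdirhash_recycle() is assumed to return with
+                * the list lock held on success, which is why it is
+                * dropped here before retrying the reservation.
+                */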
+               if (ufsdirhash_recycle(memreqd) != 0)
+                       return (-1);
+               else
+                       DIRHASHLIST_UNLOCK();
+       }
+
+       /*
+        * Use non-blocking mallocs so that we will revert to a linear
+        * lookup on failure rather than potentially blocking forever.
+        */
+       dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT);
+       if (dh == NULL) {
+               atomic_add_int(&ufs_dirhashmem, -memreqd);
+               return (-1);
+       }
+       memset(dh, 0, sizeof(*dh));
+       mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
+       DIRHASH_LOCK(dh);
+       dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
+       dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
+       dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
+       dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
+       if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+               goto fail;
+       for (i = 0; i < narrays; i++) {
+               if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
+                       goto fail;
+               for (j = 0; j < DH_NBLKOFF; j++)
+                       dh->dh_hash[i][j] = DIRHASH_EMPTY;
+       }
+
+       /* Initialise the hash table and block statistics. */
+       dh->dh_narrays = narrays;
+       dh->dh_hlen = nslots;
+       dh->dh_nblk = nblocks;
+       dh->dh_dirblks = dirblocks;
+       for (i = 0; i < dirblocks; i++)
+               dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
+       for (i = 0; i < DH_NFSTATS; i++)
+               dh->dh_firstfree[i] = -1;
+       dh->dh_firstfree[DH_NFSTATS] = 0;
+       dh->dh_seqopt = 0;
+       dh->dh_seqoff = 0;
+       dh->dh_score = DH_SCOREINIT;
+       ip->i_dirhash = dh;
+
+       bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+       pos = 0;
+       while (pos < ip->i_size) {
+               if ((curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+                   != 0) {
+                       preempt();
+               }
+               /* If necessary, get the next directory block. */
+               if ((pos & bmask) == 0) {
+                       if (bp != NULL)
+                               brelse(bp, 0);
+                       if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
+                               goto fail;
+               }
+
+               /* Add this entry to the hash. */
+               ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
+               if (ep->d_reclen == 0 || ep->d_reclen >
+                   dirblksiz - (pos & (dirblksiz - 1))) {
+                       /* Corrupted directory. */
+                       brelse(bp, 0);
+                       goto fail;
+               }
+               if (ep->d_ino != 0) {
+                       /* Add the entry (simplified ufsdirhash_add). */
+                       slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
+                       while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+                               slot = WRAPINCR(slot, dh->dh_hlen);
+                       dh->dh_hused++;
+                       DH_ENTRY(dh, slot) = pos;
+                       ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep, needswap),
+                           dirblksiz);
+               }
+               pos += ep->d_reclen;
+       }
+
+       if (bp != NULL)
+               brelse(bp, 0);
+       DIRHASHLIST_LOCK();
+       TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
+       dh->dh_onlist = 1;
+       DIRHASH_UNLOCK(dh);
+       DIRHASHLIST_UNLOCK();
+       return (0);
+
+fail:
+       DIRHASH_UNLOCK(dh);
+       if (dh->dh_hash != NULL) {
+               for (i = 0; i < narrays; i++)
+                       if (dh->dh_hash[i] != NULL)
+                               DIRHASH_BLKFREE(dh->dh_hash[i]);
+               kmem_free(dh->dh_hash, dh->dh_hashsz);
+       }
+       if (dh->dh_blkfree != NULL)
+               kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
+       mutex_destroy(&dh->dh_lock);
+       pool_cache_put(ufsdirhash_cache, dh);
+       ip->i_dirhash = NULL;
+       atomic_add_int(&ufs_dirhashmem, -memreqd);
+       return (-1);
+}
+
+/*
+ * Free any hash table associated with inode 'ip'.
+ */
+void
+ufsdirhash_free(struct inode *ip)
+{
+       struct dirhash *dh;
+       int i, mem;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+
+       if (dh->dh_onlist) {
+               DIRHASHLIST_LOCK();
+               if (dh->dh_onlist)
+                       TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+               DIRHASHLIST_UNLOCK();
+       }
+
+       /* The dirhash pointed to by 'dh' is exclusively ours now. */
+       mem = sizeof(*dh);
+       if (dh->dh_hash != NULL) {
+               for (i = 0; i < dh->dh_narrays; i++)
+                       DIRHASH_BLKFREE(dh->dh_hash[i]);
+               kmem_free(dh->dh_hash, dh->dh_hashsz);
+               kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
+               mem += dh->dh_hashsz;
+               mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
+               mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
+       }
+       mutex_destroy(&dh->dh_lock);
+       pool_cache_put(ufsdirhash_cache, dh);
+       ip->i_dirhash = NULL;
+
+       atomic_add_int(&ufs_dirhashmem, -mem);
+}
+
+/*
+ * Find the offset of the specified name within the given inode.
+ * Returns 0 on success, ENOENT if the entry does not exist, or
+ * EJUSTRETURN if the caller should revert to a linear search.
+ *
+ * If successful, the directory offset is stored in *offp, and a
+ * pointer to a struct buf containing the entry is stored in *bpp. If
+ * prevoffp is non-NULL, the offset of the previous entry within
+ * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
+ * is the first in a block, the start of the block is used).
+ */
+int
+ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
+    struct buf **bpp, doff_t *prevoffp)
+{
+       struct dirhash *dh, *dh_next;
+       struct direct *dp;
+       struct vnode *vp;
+       struct buf *bp;
+       doff_t blkoff, bmask, offset, prevoff;
+       int i, slot;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return (EJUSTRETURN);
+
+       /*
+        * Move this dirhash towards the end of the list if it has a
+        * score higher than the next entry, and acquire the dh_lock.
+        * Optimise the case where it's already the last by performing
+        * an unlocked read of the TAILQ_NEXT pointer.
+        *
+        * In both cases, end up holding just dh_lock.
+        */
+       if (TAILQ_NEXT(dh, dh_list) != NULL) {
+               DIRHASHLIST_LOCK();
+               DIRHASH_LOCK(dh);
+               /*
+                * If the new score will be greater than that of the next
+                * entry, then move this entry past it. With both mutexes
+                * held, dh_next won't go away, but its dh_score could
+                * change; that's not important since it is just a hint.
+                */
+               if (dh->dh_hash != NULL &&
+                   (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+                   dh->dh_score >= dh_next->dh_score) {
+                       KASSERT(dh->dh_onlist);
+                       TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+                       TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
+                           dh_list);
+               }
+               DIRHASHLIST_UNLOCK();
+       } else {
+               /* Already the last, though that could change as we wait. */
+               DIRHASH_LOCK(dh);
+       }
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return (EJUSTRETURN);
+       }
+
+       /* Update the score. */
+       if (dh->dh_score < DH_SCOREMAX)
+               dh->dh_score++;
+
+       vp = ip->i_vnode;
+       bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+       blkoff = -1;
+       bp = NULL;
+restart:
+       slot = ufsdirhash_hash(dh, name, namelen);
+
+       if (dh->dh_seqopt) {
+               /*
+                * Sequential access optimisation. dh_seqoff contains the
+                * offset of the directory entry immediately following
+                * the last entry that was looked up. Check if this offset
+                * appears in the hash chain for the name we are looking for.
+                */
+               for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
+                   i = WRAPINCR(i, dh->dh_hlen))
+                       if (offset == dh->dh_seqoff)
+                               break;
+               if (offset == dh->dh_seqoff) {
+                       /*
+                        * We found an entry with the expected offset. This
+                        * is probably the entry we want, but if not, the
+                        * code below will turn off seqoff and retry.
+                        */
+                       slot = i;
+               } else
+                       dh->dh_seqopt = 0;
+       }
+
+       for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
+           slot = WRAPINCR(slot, dh->dh_hlen)) {
+               if (offset == DIRHASH_DEL)
+                       continue;
+
+               if (offset < 0 || offset >= ip->i_size)
+                       panic("ufsdirhash_lookup: bad offset in hash array");
+               if ((offset & ~bmask) != blkoff) {
+                       if (bp != NULL)
+                               brelse(bp, 0);
+                       blkoff = offset & ~bmask;
+                       if (ufs_blkatoff(vp, (off_t)blkoff,
+                           NULL, &bp, false) != 0) {
+                               DIRHASH_UNLOCK(dh);
+                               return (EJUSTRETURN);
+                       }
+               }
+               dp = (struct direct *)((char *)bp->b_data + (offset & bmask));
+               if (dp->d_reclen == 0 || dp->d_reclen >
+                   dirblksiz - (offset & (dirblksiz - 1))) {
+                       /* Corrupted directory. */
+                       DIRHASH_UNLOCK(dh);
+                       brelse(bp, 0);
+                       return (EJUSTRETURN);
+               }
+               if (dp->d_namlen == namelen &&
+                   memcmp(dp->d_name, name, namelen) == 0) {
+                       /* Found. Get the prev offset if needed. */
+                       if (prevoffp != NULL) {
+                               if (offset & (dirblksiz - 1)) {
+                                       prevoff = ufsdirhash_getprev(dp,
+                                           offset, dirblksiz);
+                                       if (prevoff == -1) {
+                                               brelse(bp, 0);
+                                               return (EJUSTRETURN);
+                                       }
+                               } else
+                                       prevoff = offset;
+                               *prevoffp = prevoff;
+                       }
+
+                       /* Check for sequential access, and update offset. */
+                       if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
+                               dh->dh_seqopt = 1;
+                       dh->dh_seqoff = offset + DIRSIZ(0, dp, needswap);
+                       DIRHASH_UNLOCK(dh);
+
+                       *bpp = bp;
+                       *offp = offset;
+                       return (0);
+               }
+
+               if (dh->dh_hash == NULL) {
+                       DIRHASH_UNLOCK(dh);
+                       if (bp != NULL)
+                               brelse(bp, 0);
+                       ufsdirhash_free(ip);
+                       return (EJUSTRETURN);
+               }
+               /*
+                * When the name doesn't match in the seqopt case, go back
+                * and search normally.
+                */
+               if (dh->dh_seqopt) {
+                       dh->dh_seqopt = 0;
+                       goto restart;
+               }
+       }
+       DIRHASH_UNLOCK(dh);
+       if (bp != NULL)
+               brelse(bp, 0);
+       return (ENOENT);
+}
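+
+/*
+ * Usage sketch (illustrative; the prototype lives earlier in this file
+ * and the argument order shown here is assumed from the body above):
+ * a caller can try the hash first and fall back to a linear directory
+ * scan only when the hash cannot answer.
+ *
+ *	switch (ufsdirhash_lookup(ip, name, namelen, &off, &bp, &prevoff)) {
+ *	case 0:            found; bp and off identify the entry
+ *	case ENOENT:       authoritative miss; the name is not present
+ *	case EJUSTRETURN:  hash unusable or freed; do a full scan
+ *	}
+ */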
+
+/*
+ * Find a directory block with room for 'slotneeded' bytes. Returns
+ * the offset of the directory entry that begins the free space.
+ * This will either be the offset of an existing entry that has free
+ * space at the end, or the offset of an entry with d_ino == 0 at
+ * the start of a DIRBLKSIZ block.
+ *
+ * To use the space, the caller may need to compact existing entries in
+ * the directory. The total number of bytes in all of the entries involved
+ * in the compaction is stored in *slotsize. In other words, all of
+ * the entries that must be compacted are exactly contained in the
+ * region beginning at the returned offset and spanning *slotsize bytes.
+ *
+ * Returns -1 if no space was found, indicating that the directory
+ * must be extended.
+ */
+doff_t
+ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
+{
+       struct direct *dp;
+       struct dirhash *dh;
+       struct buf *bp;
+       doff_t pos, slotstart;
+       int dirblock, error, freebytes, i;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return (-1);
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return (-1);
+       }
+
+       /* Find a directory block with the desired free space. */
+       dirblock = -1;
+       for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
+               if ((dirblock = dh->dh_firstfree[i]) != -1)
+                       break;
+       if (dirblock == -1) {
+               DIRHASH_UNLOCK(dh);
+               return (-1);
+       }
+
+       KASSERT(dirblock < dh->dh_nblk &&
+           dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
+       pos = dirblock * dirblksiz;
+       error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
+       if (error) {
+               DIRHASH_UNLOCK(dh);
+               return (-1);
+       }
+       /* Find the first entry with free space. */
+       for (i = 0; i < dirblksiz; ) {
+               if (dp->d_reclen == 0) {
+                       DIRHASH_UNLOCK(dh);
+                       brelse(bp, 0);
+                       return (-1);
+               }
+               if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp, needswap))
+                       break;
+               i += dp->d_reclen;
+               dp = (struct direct *)((char *)dp + dp->d_reclen);
+       }
+       if (i > dirblksiz) {
+               DIRHASH_UNLOCK(dh);
+               brelse(bp, 0);
+               return (-1);
+       }
+       slotstart = pos + i;
+
+       /* Find the range of entries needed to get enough space */
+       freebytes = 0;
+       while (i < dirblksiz && freebytes < slotneeded) {
+               freebytes += dp->d_reclen;
+               if (dp->d_ino != 0)
+                       freebytes -= DIRSIZ(0, dp, needswap);
+               if (dp->d_reclen == 0) {
+                       DIRHASH_UNLOCK(dh);
+                       brelse(bp, 0);
+                       return (-1);
+               }
+               i += dp->d_reclen;
+               dp = (struct direct *)((char *)dp + dp->d_reclen);
+       }
+       if (i > dirblksiz) {
+               DIRHASH_UNLOCK(dh);
+               brelse(bp, 0);
+               return (-1);
+       }
+       if (freebytes < slotneeded)
+               panic("ufsdirhash_findfree: free mismatch");
+       DIRHASH_UNLOCK(dh);
+       brelse(bp, 0);
+       *slotsize = pos + i - slotstart;
+       return (slotstart);
+}
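+
+/*
+ * Worked example for the contract above (illustrative numbers): with
+ * slotneeded = 24 and a block holding entries A (8 spare bytes), B (no
+ * spare bytes) and C (20 spare bytes), the first loop stops at A, so
+ * slotstart is A's offset; the second loop then accumulates
+ * 8 + 0 + 20 = 28 free bytes across A..C, so *slotsize spans all three
+ * entries and the caller must compact A, B and C to create the slot.
+ */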
+
+/*
+ * Return the start of the unused space at the end of a directory, or
+ * -1 if there are no trailing unused blocks.
+ */
+doff_t
+ufsdirhash_enduseful(struct inode *ip)
+{
+       struct dirhash *dh;
+       int i;
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return (-1);
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return (-1);
+       }
+
+       if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
+               DIRHASH_UNLOCK(dh);
+               return (-1);
+       }
+
+       for (i = dh->dh_dirblks - 1; i >= 0; i--)
+               if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
+                       break;
+       DIRHASH_UNLOCK(dh);
+       return ((doff_t)(i + 1) * dirblksiz);
+}
+
+/*
+ * Insert information into the hash about a new directory entry. dirp
+ * points to a struct direct containing the entry, and offset specifies
+ * the offset of this entry.
+ */
+void
+ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+       struct dirhash *dh;
+       int slot;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       KASSERT(offset < dh->dh_dirblks * dirblksiz);
+       /*
+        * Normal hash usage is < 66%. If the usage gets too high then
+        * remove the hash entirely and let it be rebuilt later.
+        */
+       if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       /* Find a free hash slot (empty or deleted), and add the entry. */
+       slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
+       while (DH_ENTRY(dh, slot) >= 0)
+               slot = WRAPINCR(slot, dh->dh_hlen);
+       if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
+               dh->dh_hused++;
+       DH_ENTRY(dh, slot) = offset;
+
+       /* Update the per-block summary info. */
+       ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp, needswap), dirblksiz);
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Remove the specified directory entry from the hash. The entry to remove
+ * is defined by the name in `dirp', which must exist at the specified
+ * `offset' within the directory.
+ */
+void
+ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+       struct dirhash *dh;
+       int slot;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       KASSERT(offset < dh->dh_dirblks * dirblksiz);
+       /* Find the entry */
+       slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
+
+       /* Remove the hash entry. */
+       ufsdirhash_delslot(dh, slot);
+
+       /* Update the per-block summary info. */
+       ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp, needswap), dirblksiz);
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Change the offset associated with a directory entry in the hash. Used
+ * when compacting directory blocks.
+ */
+void
+ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
+    doff_t newoff)
+{
+       struct dirhash *dh;
+       int slot;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz &&
+           newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz);
+       /* Find the entry, and update the offset. */
+       slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
+       DH_ENTRY(dh, slot) = newoff;
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Inform dirhash that the directory has grown by one block that
+ * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
+ */
+void
+ufsdirhash_newblk(struct inode *ip, doff_t offset)
+{
+       struct dirhash *dh;
+       int block;
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       KASSERT(offset == dh->dh_dirblks * dirblksiz);
+       block = offset / dirblksiz;
+       if (block >= dh->dh_nblk) {
+               /* Out of space; must rebuild. */
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+       dh->dh_dirblks = block + 1;
+
+       /* Account for the new free block. */
+       dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
+       if (dh->dh_firstfree[DH_NFSTATS] == -1)
+               dh->dh_firstfree[DH_NFSTATS] = block;
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Inform dirhash that the directory is being truncated.
+ */
+void
+ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
+{
+       struct dirhash *dh;
+       int block, i;
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       KASSERT(offset <= dh->dh_dirblks * dirblksiz);
+       block = howmany(offset, dirblksiz);
+       /*
+        * If the directory shrinks to less than 1/8 of dh_nblk blocks
+        * (about 20% of its original size due to the 50% extra added in
+        * ufsdirhash_build) then free it, and let the caller rebuild
+        * if necessary.
+        */
+       if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       /*
+        * Remove any `first free' information pertaining to the
+        * truncated blocks. All blocks we're removing should be
+        * completely unused.
+        */
+       if (dh->dh_firstfree[DH_NFSTATS] >= block)
+               dh->dh_firstfree[DH_NFSTATS] = -1;
+       for (i = block; i < dh->dh_dirblks; i++)
+               if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
+                       panic("ufsdirhash_dirtrunc: blocks in use");
+       for (i = 0; i < DH_NFSTATS; i++)
+               if (dh->dh_firstfree[i] >= block)
+                       panic("ufsdirhash_dirtrunc: first free corrupt");
+       dh->dh_dirblks = block;
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Debugging function to check that the dirhash information about
+ * a directory block matches its actual contents. Panics if a mismatch
+ * is detected.
+ *
+ * On entry, `sbuf' should point to the start of an in-core
+ * DIRBLKSIZ-sized directory block, and `offset' should contain the
+ * offset from the start of the directory of that block.
+ */
+void
+ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
+{
+       struct dirhash *dh;
+       struct direct *dp;
+       int block, ffslot, i, nfree;
+       const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
+       int dirblksiz = ip->i_ump->um_dirblksiz;
+
+       if (!ufs_dirhashcheck)
+               return;
+       if ((dh = ip->i_dirhash) == NULL)
+               return;
+
+       DIRHASH_LOCK(dh);
+       if (dh->dh_hash == NULL) {
+               DIRHASH_UNLOCK(dh);
+               ufsdirhash_free(ip);
+               return;
+       }
+
+       block = offset / dirblksiz;
+       if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
+               panic("ufsdirhash_checkblock: bad offset");
+
+       nfree = 0;
+       for (i = 0; i < dirblksiz; i += dp->d_reclen) {
+               dp = (struct direct *)(sbuf + i);
+               if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
+                       panic("ufsdirhash_checkblock: bad dir");
+
+               if (dp->d_ino == 0) {
+#if 0
+                       /*
+                        * XXX entries with d_ino == 0 should only occur
+                        * at the start of a DIRBLKSIZ block. However the
+                        * ufs code is tolerant of such entries at other
+                        * offsets, and fsck does not fix them.
+                        */
+                       if (i != 0)
+                               panic("ufsdirhash_checkblock: bad dir inode");
+#endif
+                       nfree += dp->d_reclen;
+                       continue;
+               }
+
+               /* Check that the entry exists (will panic if it doesn't). */
+               ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
+
+               nfree += dp->d_reclen - DIRSIZ(0, dp, needswap);
+       }
+       if (i != dirblksiz)
+               panic("ufsdirhash_checkblock: bad dir end");
+
+       if (dh->dh_blkfree[block] * DIRALIGN != nfree)
+               panic("ufsdirhash_checkblock: bad free count");
+
+       ffslot = BLKFREE2IDX(nfree / DIRALIGN);
+       for (i = 0; i <= DH_NFSTATS; i++)
+               if (dh->dh_firstfree[i] == block && i != ffslot)
+                       panic("ufsdirhash_checkblock: bad first-free");
+       if (dh->dh_firstfree[ffslot] == -1)
+               panic("ufsdirhash_checkblock: missing first-free entry");
+       DIRHASH_UNLOCK(dh);
+}
+
+/*
+ * Hash the specified filename into a dirhash slot.
+ */
+static int
+ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
+{
+       u_int32_t hash;
+
+       /*
+        * We hash the name and then some other bit of data that is
+        * invariant over the dirhash's lifetime. Otherwise names
+        * differing only in the last byte are placed close to one
+        * another in the table, which is bad for linear probing.
+        */
+       hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
+       hash = hash32_buf(&dh, sizeof(dh), hash);
+       return (hash % dh->dh_hlen);
+}
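+
+/*
+ * Probe sketch (restating the pattern used by the callers above):
+ * every consumer combines this hash with linear probing, wrapping
+ * modulo dh_hlen, e.g.
+ *
+ *	slot = ufsdirhash_hash(dh, name, namelen);
+ *	while (DH_ENTRY(dh, slot) != wanted &&
+ *	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+ *		slot = WRAPINCR(slot, dh->dh_hlen);
+ *
+ * Mixing the dh pointer into the hash keeps similar names from
+ * clustering, which matters for linear probing.
+ */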
+
+/*
+ * Adjust the number of free bytes in the block containing `offset'
+ * by the value specified by `diff'.
+ *
+ * The caller must ensure we have exclusive access to `dh'; normally
+ * that means that dh_lock should be held, but this is also called
+ * from ufsdirhash_build() where exclusive access can be assumed.
+ */
+static void
+ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
+{
+       int block, i, nfidx, ofidx;
+
+       KASSERT(mutex_owned(&dh->dh_lock));
+
+       /* Update the per-block summary info. */
+       block = offset / dirblksiz;
+       KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
+       ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+       dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
+       nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+
+       /* Update the `first free' list if necessary. */
+       if (ofidx != nfidx) {
+               /* If removing, scan forward for the next block. */
+               if (dh->dh_firstfree[ofidx] == block) {
+                       for (i = block + 1; i < dh->dh_dirblks; i++)
+                               if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
+                                       break;
+                       dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
+               }
+
+               /* Make this the new `first free' if necessary */
+               if (dh->dh_firstfree[nfidx] > block ||
+                   dh->dh_firstfree[nfidx] == -1)
+                       dh->dh_firstfree[nfidx] = block;
+       }
+}
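+
+/*
+ * Worked example (illustrative): suppose block 3 drops from 4 free
+ * DIRALIGN units to 2 when an entry is added (diff < 0).  If
+ * BLKFREE2IDX maps 4 and 2 to different indices, block 3 is removed
+ * from dh_firstfree[ofidx] (replaced by the next block with that
+ * index, or -1) and installed as dh_firstfree[nfidx] if it precedes
+ * the current head there.
+ */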
+
+/*
+ * Find the specified name which should have the specified offset.
+ * Returns a slot number, and panics on failure.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ */
+static int
+ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
+    doff_t offset)
+{
+       int slot;
+
+       KASSERT(mutex_owned(&dh->dh_lock));
+
+       /* Find the entry. */
+       KASSERT(dh->dh_hused < dh->dh_hlen);
+       slot = ufsdirhash_hash(dh, name, namelen);
+       while (DH_ENTRY(dh, slot) != offset &&
+           DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+               slot = WRAPINCR(slot, dh->dh_hlen);
+       if (DH_ENTRY(dh, slot) != offset)
+               panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
+
+       return (slot);
+}
+
+/*
+ * Remove the entry corresponding to the specified slot from the hash array.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ */
+static void
+ufsdirhash_delslot(struct dirhash *dh, int slot)
+{
+       int i;
+
+       KASSERT(mutex_owned(&dh->dh_lock));
+
+       /* Mark the entry as deleted. */
+       DH_ENTRY(dh, slot) = DIRHASH_DEL;
+
+       /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
+       for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
+               i = WRAPINCR(i, dh->dh_hlen);
+       if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
+               i = WRAPDECR(i, dh->dh_hlen);
+               while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
+                       DH_ENTRY(dh, i) = DIRHASH_EMPTY;
+                       dh->dh_hused--;
+                       i = WRAPDECR(i, dh->dh_hlen);
+               }
+               KASSERT(dh->dh_hused >= 0);
+       }
+}
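+
+/*
+ * Worked example (illustrative): if the deletion above leaves slots
+ * 5..7 holding DIRHASH_DEL and slot 8 DIRHASH_EMPTY, the backward
+ * sweep converts 7, 6 and 5 back to DIRHASH_EMPTY, decrementing
+ * dh_hused for each.  Had slot 8 held a live offset instead, the DEL
+ * markers would be kept so that linear probes do not terminate early.
+ */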
+
+/*
+ * Given a directory entry and its offset, find the offset of the
+ * previous entry in the same DIRBLKSIZ-sized block. Returns an
+ * offset, or -1 if there is no previous entry in the block or some
+ * other problem occurred.
+ */
+static doff_t
+ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz)
+{
+       struct direct *dp;
+       char *blkbuf;
+       doff_t blkoff, prevoff;
+       int entrypos, i;
+
+       blkoff = offset & ~(dirblksiz - 1);     /* offset of start of block */
+       entrypos = offset & (dirblksiz - 1);    /* entry relative to block */
+       blkbuf = (char *)dirp - entrypos;
+       prevoff = blkoff;
+
+       /* If `offset' is the start of a block, there is no previous entry. */
+       if (entrypos == 0)
+               return (-1);
+
+       /* Scan from the start of the block until we get to the entry. */
+       for (i = 0; i < entrypos; i += dp->d_reclen) {
+               dp = (struct direct *)(blkbuf + i);
+               if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
+                       return (-1);    /* Corrupted directory. */
+               prevoff = blkoff + i;
+       }
+       return (prevoff);
+}
+
+/*
+ * Try to free up `wanted' bytes by stealing memory from existing
+ * dirhashes. Returns zero with list locked if successful.
+ */
+static int
+ufsdirhash_recycle(int wanted)
+{
+       struct dirhash *dh;
+       doff_t **hash;
+       u_int8_t *blkfree;
+       int i, mem, narrays;
+       size_t hashsz, blkfreesz;
+
+       DIRHASHLIST_LOCK();
+       while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
+               /* Find a dirhash, and lock it. */
+               if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
+                       DIRHASHLIST_UNLOCK();
+                       return (-1);
+               }
+               DIRHASH_LOCK(dh);
+               KASSERT(dh->dh_hash != NULL);
+
+               /* Decrement the score; only recycle if it becomes zero. */
+               if (--dh->dh_score > 0) {
+                       DIRHASH_UNLOCK(dh);
+                       DIRHASHLIST_UNLOCK();
+                       return (-1);
+               }
+
+               /* Remove it from the list and detach its memory. */
+               TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+               dh->dh_onlist = 0;
+               hash = dh->dh_hash;
+               hashsz = dh->dh_hashsz;
+               dh->dh_hash = NULL;
+               blkfree = dh->dh_blkfree;
+               blkfreesz = dh->dh_blkfreesz;
+               dh->dh_blkfree = NULL;
+               narrays = dh->dh_narrays;
+               mem = narrays * sizeof(*dh->dh_hash) +
+                   narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+                   dh->dh_nblk * sizeof(*dh->dh_blkfree);
+
+               /* Unlock everything, free the detached memory. */
+               DIRHASH_UNLOCK(dh);
+               DIRHASHLIST_UNLOCK();
+
+               for (i = 0; i < narrays; i++)
+                       DIRHASH_BLKFREE(hash[i]);
+               kmem_free(hash, hashsz);
+               kmem_free(blkfree, blkfreesz);
+
+               /* Account for the returned memory, and repeat if necessary. */
+               DIRHASHLIST_LOCK();
+               atomic_add_int(&ufs_dirhashmem, -mem);
+       }
+       /* Success. */
+       return (0);
+}
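+
+/*
+ * Calling-convention sketch (illustrative; the actual callers live
+ * earlier in this file): because success returns 0 with the list lock
+ * held, an allocation path would look roughly like
+ *
+ *	if (wanted + ufs_dirhashmem > ufs_dirhashmaxmem &&
+ *	    ufsdirhash_recycle(wanted) != 0)
+ *		return;		// over budget and nothing to steal
+ *	// allocate while still holding the list lock, then:
+ *	DIRHASHLIST_UNLOCK();
+ */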
+
+static void
+ufsdirhash_sysctl_init(void)
+{
+       const struct sysctlnode *rnode, *cnode;
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, NULL, &rnode,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "vfs", NULL,
+                      NULL, 0, NULL, 0,
+                      CTL_VFS, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "ufs",
+                      SYSCTL_DESCR("ufs"),
+                      NULL, 0, NULL, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &rnode,
+                      CTLFLAG_PERMANENT,
+                      CTLTYPE_NODE, "dirhash",
+                      SYSCTL_DESCR("dirhash"),
+                      NULL, 0, NULL, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "minblocks",
+                      SYSCTL_DESCR("minimum hashed directory size in blocks"),
+                      NULL, 0, &ufs_dirhashminblks, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "maxmem",
+                      SYSCTL_DESCR("maximum dirhash memory usage"),
+                      NULL, 0, &ufs_dirhashmaxmem, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+                      CTLTYPE_INT, "memused",
+                      SYSCTL_DESCR("current dirhash memory usage"),
+                      NULL, 0, &ufs_dirhashmem, 0,
+                      CTL_CREATE, CTL_EOL);
+
+       sysctl_createv(&ufsdirhash_sysctl_log, 0, &rnode, &cnode,
+                      CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+                      CTLTYPE_INT, "docheck",
+                      SYSCTL_DESCR("enable extra sanity checks"),
+                      NULL, 0, &ufs_dirhashcheck, 0,
+                      CTL_CREATE, CTL_EOL);
+}
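+
+/*
+ * Assuming the usual sysctl name construction, the nodes above surface
+ * as vfs.ufs.dirhash.{minblocks,maxmem,memused,docheck}; raising the
+ * memory cap would then be e.g. "sysctl -w vfs.ufs.dirhash.maxmem=N".
+ */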
+
+void
+ufsdirhash_init(void)
+{
+
+       mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
+       ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
+           0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
+       ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
+           0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
+       TAILQ_INIT(&ufsdirhash_list);
+       ufsdirhash_sysctl_init();
+}
+
+void
+ufsdirhash_done(void)
+{
+
+       KASSERT(TAILQ_EMPTY(&ufsdirhash_list));
+       pool_cache_destroy(ufsdirhashblk_cache);
+       pool_cache_destroy(ufsdirhash_cache);
+       mutex_destroy(&ufsdirhash_lock);
+       sysctl_teardown(&ufsdirhash_sysctl_log);
+}
diff --git a/sys/ufs/ufs/ufs_extattr.c b/sys/ufs/ufs/ufs_extattr.c
new file mode 100644 (file)
index 0000000..8b456b8
--- /dev/null
@@ -0,0 +1,1551 @@
+/*     $NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $    */
+
+/*-
+ * Copyright (c) 1999-2002 Robert N. M. Watson
+ * Copyright (c) 2002-2003 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Support for file system extended attributes on the UFS1 file system.
+ *
+ * Extended attributes are defined in the form name=value, where name is
+ * a nul-terminated string in the style of a file name, and value is a
+ * binary blob of zero or more bytes.  The UFS1 extended attribute service
+ * layers support for extended attributes onto a backing file, in the style
+ * of the quota implementation, meaning that it requires no underlying format
+ * changes to the file system.  This design choice exchanges simplicity,
+ * usability, and easy deployment for performance.
+ */
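+
+/*
+ * Backing-file layout sketch (inferred from ufs_extattr_get_header()
+ * and ufs_extattr_autocreate_attr() below; not a formal on-disk spec):
+ *
+ *	struct ufs_extattr_fileheader	uef_magic, uef_version, uef_size
+ *	record for inode 0:	struct ufs_extattr_header + uef_size bytes
+ *	record for inode 1:	struct ufs_extattr_header + uef_size bytes
+ *	...
+ *
+ * Each inode's record sits at a fixed offset computed from its inode
+ * number, so attribute lookup needs no index structure.
+ */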
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.35 2011/07/07 14:56:45 manu Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/reboot.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/namei.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/lwp.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/lock.h>
+#include <sys/dirent.h>
+#include <sys/extattr.h>
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+
+static MALLOC_JUSTDEFINE(M_UFS_EXTATTR, "ufs_extattr",
+    "ufs extended attribute");
+
+int ufs_extattr_sync = 1;
+int ufs_extattr_autocreate = 1024;
+
+static int     ufs_extattr_valid_attrname(int attrnamespace,
+                   const char *attrname);
+static int     ufs_extattr_enable_with_open(struct ufsmount *ump,
+                   struct vnode *vp, int attrnamespace, const char *attrname,
+                   struct lwp *l);
+static int     ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
+                   const char *attrname, struct vnode *backing_vnode,
+                   struct lwp *l);
+static int     ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
+                   const char *attrname, struct lwp *l);
+static int     ufs_extattr_get(struct vnode *vp, int attrnamespace,
+                   const char *name, struct uio *uio, size_t *size,
+                   kauth_cred_t cred, struct lwp *l);
+static int     ufs_extattr_list(struct vnode *vp, int attrnamespace,
+                   struct uio *uio, size_t *size, int flag,
+                   kauth_cred_t cred, struct lwp *l);
+static int     ufs_extattr_set(struct vnode *vp, int attrnamespace,
+                   const char *name, struct uio *uio, kauth_cred_t cred,
+                   struct lwp *l);
+static int     ufs_extattr_rm(struct vnode *vp, int attrnamespace,
+                   const char *name, kauth_cred_t cred, struct lwp *l);
+static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *,
+                   int, const char *);
+static int     ufs_extattr_get_header(struct vnode *, 
+                   struct ufs_extattr_list_entry *, 
+                   struct ufs_extattr_header *, off_t *);
+
+/*
+ * Per-FS attribute lock protecting attribute operations.
+ * XXX Right now there is a lot of lock contention due to having a single
+ * lock per-FS; really, this should be far more fine-grained.
+ */
+static void
+ufs_extattr_uepm_lock(struct ufsmount *ump)
+{
+
+       /* XXX Why does this need to be recursive? */
+       if (mutex_owned(&ump->um_extattr.uepm_lock)) {
+               ump->um_extattr.uepm_lockcnt++;
+               return;
+       }
+       mutex_enter(&ump->um_extattr.uepm_lock);
+}
+
+static void
+ufs_extattr_uepm_unlock(struct ufsmount *ump)
+{
+
+       if (ump->um_extattr.uepm_lockcnt != 0) {
+               KASSERT(mutex_owned(&ump->um_extattr.uepm_lock));
+               ump->um_extattr.uepm_lockcnt--;
+               return;
+       }
+       mutex_exit(&ump->um_extattr.uepm_lock);
+}
+
+/*-
+ * Determine whether the name passed is a valid name for an actual
+ * attribute.
+ *
+ * Invalid currently consists of:
+ *      NULL pointer for attrname
+ *      zero-length attrname (used to retrieve application attribute list)
+ */
+static int
+ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
+{
+
+       if (attrname == NULL)
+               return (0);
+       if (strlen(attrname) == 0)
+               return (0);
+       return (1);
+}
+
+/*
+ * Autocreate an attribute storage
+ */
+static struct ufs_extattr_list_entry *
+ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace,
+    const char *attrname, struct lwp *l)
+{
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct vnode *backing_vp;
+       struct nameidata nd;
+       struct pathbuf *pb;
+       char *path;
+       struct ufs_extattr_fileheader uef;
+       struct ufs_extattr_list_entry *uele;
+       int error;
+
+       path = PNBUF_GET();
+
+       /* 
+        * We only support system and user namespace autocreation
+        */ 
+       switch (attrnamespace) {
+       case EXTATTR_NAMESPACE_SYSTEM:
+               (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", 
+                              mp->mnt_stat.f_mntonname,
+                              UFS_EXTATTR_FSROOTSUBDIR,
+                              UFS_EXTATTR_SUBDIR_SYSTEM,
+                              attrname);
+               break;
+       case EXTATTR_NAMESPACE_USER:
+               (void)snprintf(path, PATH_MAX, "%s/%s/%s/%s", 
+                              mp->mnt_stat.f_mntonname,
+                              UFS_EXTATTR_FSROOTSUBDIR,
+                              UFS_EXTATTR_SUBDIR_USER,
+                              attrname);
+               break;
+       default:
+               PNBUF_PUT(path);
+               return NULL;
+       }
+
+       /*
+        * When setting an attribute on the root vnode, we get it
+        * already locked, and vn_open/namei/VFS_ROOT will try to
+        * lock it again, causing a panic. Unlock it first.
+        */ 
+       if (vp->v_vflag & VV_ROOT) {
+               KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+               VOP_UNLOCK(vp);
+       }
+       KASSERT(VOP_ISLOCKED(vp) == 0);
+
+       pb = pathbuf_create(path);
+       NDINIT(&nd, CREATE, LOCKPARENT, pb);
+       
+       error = vn_open(&nd, O_CREAT|O_RDWR, 0600);
+
+       /*
+        * Reacquire the lock on the vnode if it was root.
+        */
+       KASSERT(VOP_ISLOCKED(vp) == 0);
+       if (vp->v_vflag & VV_ROOT)
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+       KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+       if (error != 0) {
+               pathbuf_destroy(pb);
+               PNBUF_PUT(path);
+               return NULL;
+       }
+
+       KASSERT(nd.ni_vp != NULL);
+       KASSERT(VOP_ISLOCKED(nd.ni_vp) == LK_EXCLUSIVE);
+       KASSERT(VOP_ISLOCKED(nd.ni_dvp) == 0);
+
+       /*
+        * backing_vp is the backing store. 
+        */     
+       backing_vp = nd.ni_vp;
+       pathbuf_destroy(pb);
+       PNBUF_PUT(path);
+
+       uef.uef_magic = UFS_EXTATTR_MAGIC;
+       uef.uef_version = UFS_EXTATTR_VERSION;
+       uef.uef_size = ufs_extattr_autocreate;
+
+       error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0,
+                       UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND, 
+                       l->l_cred, NULL, l);
+
+       VOP_UNLOCK(backing_vp);
+
+       if (error != 0) {
+               printf("%s: write uef header failed for %s, error = %d\n", 
+                      __func__, attrname, error);
+               vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+               return NULL;
+       }
+
+       /*
+        * ufs_extattr_enable_with_open increases the vnode reference
+        * count. Not sure why, but do the same here.
+        */
+       vref(vp);
+
+       /*
+        * Now enable attribute. 
+        */
+       error = ufs_extattr_enable(ump,attrnamespace, attrname, backing_vp, l);
+       KASSERT(VOP_ISLOCKED(backing_vp) == 0);
+
+       if (error != 0) {
+               printf("%s: enable %s failed, error %d\n", 
+                      __func__, attrname, error);
+               vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+               return NULL;
+       }
+
+       uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
+       if (uele == NULL) {
+               printf("%s: attribute %s created but not found!\n",
+                      __func__, attrname);
+               vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
+               return NULL;
+       }
+
+       printf("%s: EA backing store autocreated for %s\n",
+              mp->mnt_stat.f_mntonname, attrname);
+
+       return uele;
+}
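+
+/*
+ * Path sketch (summarising the function above): the backing file is
+ * created at
+ *
+ *	<f_mntonname>/UFS_EXTATTR_FSROOTSUBDIR/<namespace subdir>/<attrname>
+ *
+ * and seeded with a ufs_extattr_fileheader whose uef_size comes from
+ * the ufs_extattr_autocreate default (1024 bytes above), before being
+ * enabled and looked up again.
+ */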
+
+/*
+ * Locate an attribute given a name and mountpoint.
+ * Must be holding uepm lock for the mount point.
+ */
+static struct ufs_extattr_list_entry *
+ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
+    const char *attrname)
+{
+       struct ufs_extattr_list_entry *search_attribute;
+
+       for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
+           search_attribute != NULL;
+           search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
+               if (!(strncmp(attrname, search_attribute->uele_attrname,
+                   UFS_EXTATTR_MAXEXTATTRNAME)) &&
+                   (attrnamespace == search_attribute->uele_attrnamespace)) {
+                       return (search_attribute);
+               }
+       }
+
+       return (NULL);
+}
+
+/*
+ * Initialize per-FS structures supporting extended attributes.  Do not
+ * start extended attributes yet.
+ */
+void
+ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
+{
+
+       uepm->uepm_flags = 0;
+       uepm->uepm_lockcnt = 0;
+
+       LIST_INIT(&uepm->uepm_list);
+       mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE);
+       uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
+}
+
+/*
+ * Destroy per-FS structures supporting extended attributes.  Assumes
+ * that EAs have already been stopped, and will panic if not.
+ */
+void
+ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
+{
+
+       if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
+               panic("ufs_extattr_uepm_destroy: not initialized");
+
+       if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+               panic("ufs_extattr_uepm_destroy: called while still started");
+
+       /*
+        * It's not clear that either order for the next two lines is
+        * ideal, and it should never be a problem if this is only called
+        * during unmount, and with vfs_busy().
+        */
+       uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
+       mutex_destroy(&uepm->uepm_lock);
+}
+
+/*
+ * Start extended attribute support on an FS.
+ */
+int
+ufs_extattr_start(struct mount *mp, struct lwp *l)
+{
+       struct ufsmount *ump;
+       int error = 0;
+
+       ump = VFSTOUFS(mp);
+
+       ufs_extattr_uepm_lock(ump);
+
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
+               error = EOPNOTSUPP;
+               goto unlock;
+       }
+       if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
+               error = EBUSY;
+               goto unlock;
+       }
+
+       ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
+
+       ump->um_extattr.uepm_ucred = l->l_cred;
+       kauth_cred_hold(ump->um_extattr.uepm_ucred);
+
+ unlock:
+       ufs_extattr_uepm_unlock(ump);
+
+       return (error);
+}
+
+/*
+ * Helper routine: given a locked parent directory and filename, return
+ * the locked vnode of the inode associated with the name.  Will not
+ * follow symlinks, may return any type of vnode.  Lock on parent will
+ * be released even in the event of a failure.  In the event that the
+ * target is the parent (i.e., "."), there will be two references and
+ * one lock, requiring the caller to possibly special-case.
+ */
+static int
+ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, const char *dirname,
+    struct vnode **vp, struct lwp *l)
+{
+       struct vop_lookup_args vargs;
+       struct componentname cnp;
+       struct vnode *target_vp;
+       char *pnbuf;
+       int error;
+
+       KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE);
+
+       pnbuf = PNBUF_GET();
+
+       memset(&cnp, 0, sizeof(cnp));
+       cnp.cn_nameiop = LOOKUP;
+       cnp.cn_flags = ISLASTCN | lockparent;
+       cnp.cn_cred = l->l_cred;
+       cnp.cn_nameptr = pnbuf;
+       error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen);
+       if (error) {
+               if (lockparent == 0) {
+                       VOP_UNLOCK(start_dvp);
+               }
+               PNBUF_PUT(pnbuf);
+               printf("ufs_extattr_lookup: copystr failed\n");
+               return (error);
+       }
+       cnp.cn_namelen--;       /* trim nul termination */
+       vargs.a_desc = NULL;
+       vargs.a_dvp = start_dvp;
+       vargs.a_vpp = &target_vp;
+       vargs.a_cnp = &cnp;
+       error = ufs_lookup(&vargs);
+       PNBUF_PUT(pnbuf);
+       if (error) {
+               if (lockparent == 0) {
+                       VOP_UNLOCK(start_dvp);
+               }
+               return (error);
+       }
+#if 0
+       if (target_vp == start_dvp)
+               panic("ufs_extattr_lookup: target_vp == start_dvp");
+#endif
+
+       if ((target_vp != start_dvp) && (lockparent == 0))
+                VOP_UNLOCK(start_dvp);
+
+       KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE);
+       *vp = target_vp;
+       return (0);
+}
+
+/*
+ * Enable an EA using the passed filesystem, backing vnode, attribute name,
+ * namespace, and proc.  Will perform a VOP_OPEN() on the vp, so expects vp
+ * to be locked when passed in.  The vnode will be returned unlocked,
+ * regardless of success/failure of the function.  As a result, the caller
+ * will always need to vrele(), but not vput().
+ */
+static int
+ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
+    int attrnamespace, const char *attrname, struct lwp *l)
+{
+       int error;
+
+       error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred);
+       if (error) {
+               printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed "
+                   "with %d\n", error);
+               VOP_UNLOCK(vp);
+               return (error);
+       }
+
+       mutex_enter(vp->v_interlock);
+       vp->v_writecount++;
+       mutex_exit(vp->v_interlock);
+
+       vref(vp);
+
+       VOP_UNLOCK(vp);
+
+       error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l);
+       if (error != 0)
+               vn_close(vp, FREAD|FWRITE, l->l_cred);
+       return (error);
+}
+
+/*
+ * Given a locked directory vnode, iterate over the names in the directory
+ * and use ufs_extattr_lookup() to retrieve locked vnodes of potential
+ * attribute files.  Then invoke ufs_extattr_enable_with_open() on each
+ * to attempt to start the attribute.  Leaves the directory locked on
+ * exit.
+ */
+static int
+ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
+    int attrnamespace, struct lwp *l)
+{
+       struct vop_readdir_args vargs;
+       struct statvfs *sbp = &ump->um_mountp->mnt_stat;
+       struct dirent *dp, *edp;
+       struct vnode *attr_vp;
+       struct uio auio;
+       struct iovec aiov;
+       char *dirbuf;
+       int error, eofflag = 0;
+
+       if (dvp->v_type != VDIR)
+               return (ENOTDIR);
+
+       dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK);
+
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       auio.uio_rw = UIO_READ;
+       auio.uio_offset = 0;
+       UIO_SETUP_SYSSPACE(&auio);
+
+       vargs.a_desc = NULL;
+       vargs.a_vp = dvp;
+       vargs.a_uio = &auio;
+       vargs.a_cred = l->l_cred;
+       vargs.a_eofflag = &eofflag;
+       vargs.a_ncookies = NULL;
+       vargs.a_cookies = NULL;
+
+       while (!eofflag) {
+               auio.uio_resid = DIRBLKSIZ;
+               aiov.iov_base = dirbuf;
+               aiov.iov_len = DIRBLKSIZ;
+               error = ufs_readdir(&vargs);
+               if (error) {
+                       printf("ufs_extattr_iterate_directory: ufs_readdir "
+                           "%d\n", error);
+                       free(dirbuf, M_TEMP);
+                       return (error);
+               }
+
+               /*
+                * XXXRW: While in UFS, we always get DIRBLKSIZ returns from
+                * the directory code on success, on other file systems this
+                * may not be the case.  For portability, we should check the
+                * read length on return from ufs_readdir().
+                */
+               edp = (struct dirent *)&dirbuf[DIRBLKSIZ];
+               for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+                       if (dp->d_reclen == 0)
+                               break;
+                       /* Skip "." and ".." */
+                       if (dp->d_name[0] == '.' &&
+                           (dp->d_name[1] == '\0' ||
+                            (dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
+                               goto next;
+                       error = ufs_extattr_lookup(dvp, LOCKPARENT,
+                           dp->d_name, &attr_vp, l);
+                       if (error == ENOENT) {
+                               goto next; /* keep silent */
+                       } else if (error) {
+                               printf("ufs_extattr_iterate_directory: lookup "
+                                   "%s %d\n", dp->d_name, error);
+                       } else if (attr_vp == dvp) {
+                               vrele(attr_vp);
+                       } else if (attr_vp->v_type != VREG) {
+                               vput(attr_vp);
+                       } else {
+                               error = ufs_extattr_enable_with_open(ump,
+                                   attr_vp, attrnamespace, dp->d_name, l);
+                               vrele(attr_vp);
+                               if (error) {
+                                       printf("ufs_extattr_iterate_directory: "
+                                           "enable %s %d\n", dp->d_name,
+                                           error);
+                               } else if (bootverbose) {
+                                       printf("%s: EA %s loaded\n",
+                                              sbp->f_mntonname, dp->d_name);
+                               }
+                       }
+ next:
+                       dp = (struct dirent *) ((char *)dp + dp->d_reclen);
+                       if (dp >= edp)
+                               break;
+               }
+       }
+       free(dirbuf, M_TEMP);
+       
+       return (0);
+}
+
+/*
+ * Auto-start of extended attributes, to be executed (optionally) at
+ * mount-time.
+ */
+int
+ufs_extattr_autostart(struct mount *mp, struct lwp *l)
+{
+       struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp;
+       int error;
+
+       /*
+        * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
+        * If so, automatically start EA's.
+        */
+       error = VFS_ROOT(mp, &rvp);
+       if (error) {
+               printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n",
+                   error);
+               return (error);
+       }
+
+       KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
+
+       error = ufs_extattr_lookup(rvp, 0,
+           UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l);
+       if (error) {
+               /* rvp ref'd but now unlocked */
+               KASSERT(VOP_ISLOCKED(rvp) == 0);
+               vrele(rvp);
+               return (error);
+       }
+       if (rvp == attr_dvp) {
+               /* Should never happen. */
+               KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
+               vrele(attr_dvp);
+               vput(rvp);
+               return (EINVAL);
+       }
+       KASSERT(VOP_ISLOCKED(rvp) == 0);
+       vrele(rvp);
+
+       KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+
+       if (attr_dvp->v_type != VDIR) {
+               printf("ufs_extattr_autostart: %s != VDIR\n",
+                   UFS_EXTATTR_FSROOTSUBDIR);
+               goto return_vput_attr_dvp;
+       }
+
+       error = ufs_extattr_start(mp, l);
+       if (error) {
+               printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n",
+                   error);
+               goto return_vput_attr_dvp;
+       }
+
+       /*
+        * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
+        * UFS_EXTATTR_SUBDIR_USER.  For each, iterate over the sub-directory,
+        * and start with appropriate type.  Failures in either don't
+        * result in an over-all failure.  attr_dvp is left locked to
+        * be cleaned up on exit.
+        */
+       error = ufs_extattr_lookup(attr_dvp, LOCKPARENT,
+           UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, l);
+       KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+       if (error == 0) {
+               KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE);
+               error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
+                   attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, l);
+               if (error)
+                       printf("ufs_extattr_iterate_directory returned %d\n",
+                           error);
+               KASSERT(VOP_ISLOCKED(attr_system_dvp) == LK_EXCLUSIVE);
+               vput(attr_system_dvp);
+       }
+
+       error = ufs_extattr_lookup(attr_dvp, LOCKPARENT,
+           UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, l);
+       KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+       if (error == 0) {
+               KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE);
+               error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
+                   attr_user_dvp, EXTATTR_NAMESPACE_USER, l);
+               if (error)
+                       printf("ufs_extattr_iterate_directory returned %d\n",
+                           error);
+               KASSERT(VOP_ISLOCKED(attr_user_dvp) == LK_EXCLUSIVE);
+               vput(attr_user_dvp);
+       }
+
+       /* Mask startup failures in sub-directories. */
+       error = 0;
+
+ return_vput_attr_dvp:
+       KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
+       vput(attr_dvp);
+
+       return (error);
+}
+
+/*
+ * Stop extended attribute support on an FS.
+ */
+void
+ufs_extattr_stop(struct mount *mp, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *uele;
+       struct ufsmount *ump = VFSTOUFS(mp);
+
+       ufs_extattr_uepm_lock(ump);
+
+       /*
+        * If we haven't been started, no big deal.  Just short-circuit
+        * the processing work.
+        */
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+               goto unlock;
+       }
+
+       while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) {
+               uele = LIST_FIRST(&ump->um_extattr.uepm_list);
+               ufs_extattr_disable(ump, uele->uele_attrnamespace,
+                   uele->uele_attrname, l);
+       }
+
+       ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
+
+       kauth_cred_free(ump->um_extattr.uepm_ucred);
+       ump->um_extattr.uepm_ucred = NULL;
+
+ unlock:
+       ufs_extattr_uepm_unlock(ump);
+}
+
+/*
+ * Enable a named attribute on the specified filesystem; provide an
+ * unlocked backing vnode to hold the attribute data.
+ */
+static int
+ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
+    const char *attrname, struct vnode *backing_vnode, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *attribute;
+       struct iovec aiov;
+       struct uio auio;
+       int error = 0;
+
+       if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+               return (EINVAL);
+       if (backing_vnode->v_type != VREG)
+               return (EINVAL);
+
+       attribute = malloc(sizeof(*attribute), M_UFS_EXTATTR,
+           M_WAITOK | M_ZERO);
+
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+               error = EOPNOTSUPP;
+               goto free_exit;
+       }
+
+       if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
+               error = EEXIST;
+               goto free_exit;
+       }
+
+       strncpy(attribute->uele_attrname, attrname,
+           UFS_EXTATTR_MAXEXTATTRNAME);
+       attribute->uele_attrnamespace = attrnamespace;
+       memset(&attribute->uele_fileheader, 0,
+           sizeof(struct ufs_extattr_fileheader));
+       
+       attribute->uele_backing_vnode = backing_vnode;
+
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       aiov.iov_base = (void *) &attribute->uele_fileheader;
+       aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
+       auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
+       auio.uio_offset = (off_t) 0;
+       auio.uio_rw = UIO_READ;
+       UIO_SETUP_SYSSPACE(&auio);
+
+       vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
+       error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
+           ump->um_extattr.uepm_ucred);
+
+       if (error)
+               goto unlock_free_exit;
+
+       if (auio.uio_resid != 0) {
+               printf("ufs_extattr_enable: malformed attribute header\n");
+               error = EINVAL;
+               goto unlock_free_exit;
+       }
+
+       /*
+        * Try to determine the byte order of the attribute file.
+        */
+       if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
+               attribute->uele_flags |= UELE_F_NEEDSWAP;
+               attribute->uele_fileheader.uef_magic =
+                   ufs_rw32(attribute->uele_fileheader.uef_magic,
+                            UELE_NEEDSWAP(attribute));
+               if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
+                       printf("ufs_extattr_enable: invalid attribute header "
+                              "magic\n");
+                       error = EINVAL;
+                       goto unlock_free_exit;
+               }
+       }
+       attribute->uele_fileheader.uef_version =
+           ufs_rw32(attribute->uele_fileheader.uef_version,
+                    UELE_NEEDSWAP(attribute));
+       attribute->uele_fileheader.uef_size =
+           ufs_rw32(attribute->uele_fileheader.uef_size,
+                    UELE_NEEDSWAP(attribute));
+
+       if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
+               printf("ufs_extattr_enable: incorrect attribute header "
+                   "version\n");
+               error = EINVAL;
+               goto unlock_free_exit;
+       }
+
+       LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute,
+           uele_entries);
+
+       VOP_UNLOCK(backing_vnode);
+       return (0);
+
+ unlock_free_exit:
+       VOP_UNLOCK(backing_vnode);
+
+ free_exit:
+       free(attribute, M_UFS_EXTATTR);
+       return (error);
+}
+
+/*
+ * Disable extended attribute support on an FS.
+ */
+static int
+ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
+    const char *attrname, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *uele;
+       int error = 0;
+
+       if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+               return (EINVAL);
+
+       uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
+       if (!uele)
+               return (ENOATTR);
+
+       LIST_REMOVE(uele, uele_entries);
+
+       error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE,
+           l->l_cred);
+
+       free(uele, M_UFS_EXTATTR);
+
+       return (error);
+}
+
+/*
+ * VFS call to manage extended attributes in UFS.  If filename_vp is
+ * non-NULL, it must be passed in locked, and regardless of errors in
+ * processing, will be unlocked.
+ */
+int
+ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
+    int attrnamespace, const char *attrname)
+{
+       struct lwp *l = curlwp;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       /*
+        * Only privileged processes can configure extended attributes.
+        */
+       if ((error = kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
+           NULL)) != 0) {
+               if (filename_vp != NULL)
+                       VOP_UNLOCK(filename_vp);
+               return (error);
+       }
+
+       switch(cmd) {
+       case UFS_EXTATTR_CMD_START:
+               if (filename_vp != NULL) {
+                       VOP_UNLOCK(filename_vp);
+                       return (EINVAL);
+               }
+               if (attrname != NULL)
+                       return (EINVAL);
+
+               error = ufs_extattr_autostart(mp, l);
+               return (error);
+               
+       case UFS_EXTATTR_CMD_STOP:
+               if (filename_vp != NULL) {
+                       VOP_UNLOCK(filename_vp);
+                       return (EINVAL);
+               }
+               if (attrname != NULL)
+                       return (EINVAL);
+
+               ufs_extattr_stop(mp, l);
+               return (0);
+
+       case UFS_EXTATTR_CMD_ENABLE:
+               if (filename_vp == NULL)
+                       return (EINVAL);
+               if (attrname == NULL) {
+                       VOP_UNLOCK(filename_vp);
+                       return (EINVAL);
+               }
+
+               /*
+                * ufs_extattr_enable_with_open() will always unlock the
+                * vnode, regardless of failure.
+                */
+               ufs_extattr_uepm_lock(ump);
+               error = ufs_extattr_enable_with_open(ump, filename_vp,
+                   attrnamespace, attrname, l);
+               ufs_extattr_uepm_unlock(ump);
+               return (error);
+
+       case UFS_EXTATTR_CMD_DISABLE:
+               if (filename_vp != NULL) {
+                       VOP_UNLOCK(filename_vp);
+                       return (EINVAL);
+               }
+               if (attrname == NULL)
+                       return (EINVAL);
+
+               ufs_extattr_uepm_lock(ump);
+               error = ufs_extattr_disable(ump, attrnamespace, attrname, l);
+               ufs_extattr_uepm_unlock(ump);
+               return (error);
+
+       default:
+               return (EINVAL);
+       }
+}
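+
+/*
+ * Argument summary for the commands above, as enforced by the checks:
+ *
+ *	UFS_EXTATTR_CMD_START	no filename_vp, no attrname
+ *	UFS_EXTATTR_CMD_STOP	no filename_vp, no attrname
+ *	UFS_EXTATTR_CMD_ENABLE	filename_vp and attrname required
+ *	UFS_EXTATTR_CMD_DISABLE	attrname required, no filename_vp
+ */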
+
+/*
+ * Read extended attribute header for a given vnode and attribute.
+ * Backing vnode should be locked and unlocked by caller.
+ */
+static int
+ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele,
+    struct ufs_extattr_header *ueh, off_t *bap)
+{
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct inode *ip = VTOI(vp);
+       off_t base_offset;
+       struct iovec aiov;
+       struct uio aio;
+       int error;
+
+       /*
+        * Find the base offset of this attribute's header in the backing
+        * file: the file header, plus (data header size + maximum data
+        * size) times the inode number.
+        */
+       base_offset = sizeof(struct ufs_extattr_fileheader) +
+           ip->i_number * (sizeof(struct ufs_extattr_header) +
+           uele->uele_fileheader.uef_size);
+
+       /*
+        * Read in the data header to see if the data is defined, and if so
+        * how much.
+        */
+       memset(ueh, 0, sizeof(struct ufs_extattr_header));
+       aiov.iov_base = ueh;
+       aiov.iov_len = sizeof(struct ufs_extattr_header);
+       aio.uio_iov = &aiov;
+       aio.uio_iovcnt = 1;
+       aio.uio_rw = UIO_READ;
+       aio.uio_offset = base_offset;
+       aio.uio_resid = sizeof(struct ufs_extattr_header);
+       UIO_SETUP_SYSSPACE(&aio);
+
+       error = VOP_READ(uele->uele_backing_vnode, &aio,
+           IO_NODELOCKED, ump->um_extattr.uepm_ucred);
+       if (error)
+               return error;
+
+       /*
+        * Attribute headers are kept in file system byte order.
+        * XXX What about the blob of data?
+        */
+       ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele));
+       ueh->ueh_len   = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele));
+       ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele));
+
+       /* Defined? */
+       if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0)
+               return ENOATTR;
+
+       /* Valid for the current inode generation? */
+       if (ueh->ueh_i_gen != ip->i_gen) {
+               /*
+                * The inode itself has a different generation number
+                * than the uele data.  For now, the best solution
+                * is to coerce this to undefined, and let it get cleaned
+                * up by the next write or extattrctl clean.
+                */
+               printf("%s (%s): inode gen inconsistency (%u, %jd)\n",
+                      __func__,  mp->mnt_stat.f_mntonname, ueh->ueh_i_gen,
+                      (intmax_t)ip->i_gen);
+               return ENOATTR;
+       }
+
+       /* Local size consistency check. */
+       if (ueh->ueh_len > uele->uele_fileheader.uef_size)
+               return ENXIO;
+
+       /* Return base offset */
+       if (bap != NULL)
+               *bap = base_offset;
+
+       return 0;
+}
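+
+/*
+ * Worked example of the offset computation above (sizes illustrative):
+ * with a 16-byte file header, a 12-byte attribute header,
+ * uef_size = 1024 and i_number = 3, the header for inode 3 lives at
+ * 16 + 3 * (12 + 1024) = 3124 and its data follows at 3124 + 12 = 3136.
+ * Each inode thus owns a fixed-size slot in the backing file.
+ */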
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+int
+ufs_getextattr(struct vop_getextattr_args *ap)
+/*
+vop_getextattr {
+       IN struct vnode *a_vp;
+       IN int a_attrnamespace;
+       IN const char *a_name;
+       INOUT struct uio *a_uio;
+       OUT size_t *a_size;
+       IN kauth_cred_t a_cred;
+};
+*/
+{
+       struct mount *mp = ap->a_vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       ufs_extattr_uepm_lock(ump);
+
+       error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+           ap->a_uio, ap->a_size, ap->a_cred, curlwp);
+
+       ufs_extattr_uepm_unlock(ump);
+
+       return (error);
+}
+
+/*
+ * Real work associated with retrieving a named attribute--assumes that
+ * the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
+    struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *attribute;
+       struct ufs_extattr_header ueh;
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       off_t base_offset;
+       size_t len, old_len;
+       int error = 0;
+
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+               return (EOPNOTSUPP);
+
+       if (strlen(name) == 0)
+               return (EINVAL);
+
+       error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD);
+       if (error)
+               return (error);
+
+       attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+       if (!attribute)
+               return (ENOATTR);
+
+       /*
+        * Allow only offsets of zero to encourage the read/replace
+        * extended attribute semantic.  Otherwise we can't guarantee
+        * atomicity, as we don't provide locks for extended attributes.
+        */
+       if (uio != NULL && uio->uio_offset != 0)
+               return (ENXIO);
+
+       /*
+        * Don't need to get a lock on the backing file if the getattr is
+        * being applied to the backing file, as the lock is already held.
+        */
+       if (attribute->uele_backing_vnode != vp)
+               vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
+
+       error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
+       if (error)
+               goto vopunlock_exit;
+
+       /* Return full data size if caller requested it. */
+       if (size != NULL)
+               *size = ueh.ueh_len;
+
+       /* Return data if the caller requested it. */
+       if (uio != NULL) {
+               /* Allow for offset into the attribute data. */
+               uio->uio_offset = base_offset + sizeof(struct
+                   ufs_extattr_header);
+
+               /*
+                * Figure out maximum to transfer -- use buffer size and
+                * local data limit.
+                */
+               len = MIN(uio->uio_resid, ueh.ueh_len);
+               old_len = uio->uio_resid;
+               uio->uio_resid = len;
+
+               error = VOP_READ(attribute->uele_backing_vnode, uio,
+                   IO_NODELOCKED, ump->um_extattr.uepm_ucred);
+               if (error)
+                       goto vopunlock_exit;
+
+               uio->uio_resid = old_len - (len - uio->uio_resid);
+       }
+
+ vopunlock_exit:
+
+       if (uio != NULL)
+               uio->uio_offset = 0;
+
+       if (attribute->uele_backing_vnode != vp)
+               VOP_UNLOCK(attribute->uele_backing_vnode);
+
+       return (error);
+}
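+
+/*
+ * Sketch of the residual bookkeeping above: if the caller supplies a
+ * 100-byte buffer (uio_resid == 100) and the attribute holds 40 bytes
+ * (ueh_len == 40), then len = MIN(100, 40) = 40.  After a full read
+ * uio_resid is 0, so the restored residual is 100 - (40 - 0) = 60,
+ * exactly the part of the caller's buffer left unfilled.
+ */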
+
+/*
+ * Vnode operation to list extended attributes for a vnode.
+ */
+int
+ufs_listextattr(struct vop_listextattr_args *ap)
+/*
+vop_listextattr {
+       IN struct vnode *a_vp;
+       IN int a_attrnamespace;
+       INOUT struct uio *a_uio;
+       OUT size_t *a_size;
+       IN int flag;
+       IN kauth_cred_t a_cred;
+       struct proc *a_p;
+};
+*/
+{
+       struct mount *mp = ap->a_vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       ufs_extattr_uepm_lock(ump);
+
+       error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace,
+           ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp);
+
+       ufs_extattr_uepm_unlock(ump);
+
+       return (error);
+}
+
+/*
+ * Real work associated with retrieving the list of attributes--assumes
+ * that the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_list(struct vnode *vp, int attrnamespace,
+    struct uio *uio, size_t *size, int flag, 
+    kauth_cred_t cred, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *uele;
+       struct ufs_extattr_header ueh;
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       size_t listsize = 0;
+       int error = 0;
+
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+               return (EOPNOTSUPP);
+
+       error = extattr_check_cred(vp, attrnamespace, cred, l, IREAD);
+       if (error)
+               return (error);
+
+       LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) {
+               unsigned char attrnamelen;
+
+               if (uele->uele_attrnamespace != attrnamespace)
+                       continue;
+
+               error = ufs_extattr_get_header(vp, uele, &ueh, NULL);
+               if (error == ENOATTR)
+                       continue;       
+               if (error != 0)
+                       return error;
+
+               /*
+                * Don't need to get a lock on the backing file if 
+                * the listattr is being applied to the backing file, 
+                * as the lock is already held.
+                */
+               if (uele->uele_backing_vnode != vp)
+                       vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
+
+               /*
+                * +1 for trailing NUL (listxattr flavor)
+                *  or leading name length (extattr_list_file flavor)
+                */
+               attrnamelen = strlen(uele->uele_attrname);
+               listsize += attrnamelen + 1;
+
+               /* Return data if the caller requested it. */
+               if (uio != NULL) {
+                       /*
+                        * We support two flavors: either NUL-terminated
+                        * strings (a la listxattr), or non-NUL-terminated
+                        * strings prefixed with a one-byte length (for
+                        * extattr_list_file). EXTATTR_LIST_LENPREFIX selects
+                        * the second behavior.
+                        */
+                       if (flag & EXTATTR_LIST_LENPREFIX) {
+                               uint8_t len = (uint8_t)attrnamelen;
+
+                               /* Copy leading name length */
+                               error = uiomove(&len, sizeof(len), uio);
+                               if (error != 0)
+                                       break;  
+                       } else {
+                               /* Include the trailing NUL */
+                               attrnamelen++;
+                       }
+
+                       error = uiomove(uele->uele_attrname, 
+                                       (size_t)attrnamelen, uio);
+                       if (error != 0)
+                               break;  
+               }
+
+               if (uele->uele_backing_vnode != vp)
+                       VOP_UNLOCK(uele->uele_backing_vnode);
+
+               if (error != 0)
+                       return error;
+       }
+
+       if (uio != NULL)
+               uio->uio_offset = 0;
+
+       /* Return full data size if caller requested it. */
+       if (size != NULL)
+               *size = listsize;
+
+       return 0;
+}
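+
+/*
+ * Example of the two list encodings above, for hypothetical attributes
+ * "md5" and "acl": the listxattr flavor yields "md5\0acl\0" (8 bytes),
+ * while the EXTATTR_LIST_LENPREFIX flavor yields "\3md5\3acl" (also
+ * 8 bytes); either way listsize counts strlen(name) + 1 per attribute.
+ */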
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+int
+ufs_deleteextattr(struct vop_deleteextattr_args *ap)
+/*
+vop_deleteextattr {
+       IN struct vnode *a_vp;
+       IN int a_attrnamespace;
+       IN const char *a_name;
+       IN kauth_cred_t a_cred;
+};
+*/
+{
+       struct mount *mp = ap->a_vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp); 
+       int error;
+
+       ufs_extattr_uepm_lock(ump);
+
+       error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+           ap->a_cred, curlwp);
+
+       ufs_extattr_uepm_unlock(ump);
+
+       return (error);
+}
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+int
+ufs_setextattr(struct vop_setextattr_args *ap)
+/*
+vop_setextattr {
+       IN struct vnode *a_vp;
+       IN int a_attrnamespace;
+       IN const char *a_name;
+       INOUT struct uio *a_uio;
+       IN kauth_cred_t a_cred;
+};
+*/
+{
+       struct mount *mp = ap->a_vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp); 
+       int error;
+
+       ufs_extattr_uepm_lock(ump);
+
+       /*
+        * XXX: No longer a supported way to delete extended attributes.
+        */
+       if (ap->a_uio == NULL) {
+               ufs_extattr_uepm_unlock(ump);
+               return (EINVAL);
+       }
+
+       error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
+           ap->a_uio, ap->a_cred, curlwp);
+
+       ufs_extattr_uepm_unlock(ump);
+
+       return (error);
+}
+
+/*
+ * Real work associated with setting a vnode's extended attributes;
+ * assumes that the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
+    struct uio *uio, kauth_cred_t cred, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *attribute;
+       struct ufs_extattr_header ueh;
+       struct iovec local_aiov;
+       struct uio local_aio;
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct inode *ip = VTOI(vp);
+       off_t base_offset;
+       int error = 0, ioflag;
+
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+               return (EROFS);
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+               return (EOPNOTSUPP);
+       if (!ufs_extattr_valid_attrname(attrnamespace, name))
+               return (EINVAL);
+
+       error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE);
+       if (error)
+               return (error);
+
+       attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+       if (!attribute) {
+               attribute =  ufs_extattr_autocreate_attr(vp, attrnamespace, 
+                                                        name, l);
+               if  (!attribute)
+                       return (ENOATTR);
+       }
+
+       /*
+        * Early rejection of invalid offsets/lengths: reject any offset
+        * but 0 (replace semantics), and any size greater than the
+        * attribute size limit.
+        */
+       if (uio->uio_offset != 0 ||
+           uio->uio_resid > attribute->uele_fileheader.uef_size)
+               return (ENXIO);
+
+       /*
+        * Find the base offset of this attribute's header in the backing
+        * file: the file header, plus (data header size + maximum data
+        * size) times the inode number.
+        */
+       base_offset = sizeof(struct ufs_extattr_fileheader) +
+           ip->i_number * (sizeof(struct ufs_extattr_header) +
+           attribute->uele_fileheader.uef_size);
+
+       /*
+        * Write out a data header for the data.
+        */
+       ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid,
+           UELE_NEEDSWAP(attribute));
+       ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE,
+                                UELE_NEEDSWAP(attribute));
+       ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute));
+       local_aiov.iov_base = &ueh;
+       local_aiov.iov_len = sizeof(struct ufs_extattr_header);
+       local_aio.uio_iov = &local_aiov;
+       local_aio.uio_iovcnt = 1;
+       local_aio.uio_rw = UIO_WRITE;
+       local_aio.uio_offset = base_offset;
+       local_aio.uio_resid = sizeof(struct ufs_extattr_header);
+       UIO_SETUP_SYSSPACE(&local_aio);
+
+       /*
+        * Don't need to get a lock on the backing file if the setattr is
+        * being applied to the backing file, as the lock is already held.
+        */
+       if (attribute->uele_backing_vnode != vp)
+               vn_lock(attribute->uele_backing_vnode, 
+                   LK_EXCLUSIVE | LK_RETRY);
+
+       ioflag = IO_NODELOCKED;
+       if (ufs_extattr_sync)
+               ioflag |= IO_SYNC;
+       error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
+           ump->um_extattr.uepm_ucred);
+       if (error)
+               goto vopunlock_exit;
+
+       if (local_aio.uio_resid != 0) {
+               error = ENXIO;
+               goto vopunlock_exit;
+       }
+
+       /*
+        * Write out user data.
+        * XXX NOT ATOMIC WITH RESPECT TO THE HEADER.
+        */
+       uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
+
+       ioflag = IO_NODELOCKED;
+       if (ufs_extattr_sync)
+               ioflag |= IO_SYNC;
+       error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
+           ump->um_extattr.uepm_ucred);
+
+ vopunlock_exit:
+       uio->uio_offset = 0;
+
+       if (attribute->uele_backing_vnode != vp)
+               VOP_UNLOCK(attribute->uele_backing_vnode);
+
+       return (error);
+}
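+
+/*
+ * Rough picture of what a successful set leaves in the backing file:
+ * at base_offset, a header marked UFS_EXTATTR_ATTR_FLAG_INUSE with
+ * ueh_len set to the new size and ueh_i_gen matching the inode,
+ * followed by the user data at base_offset + sizeof(struct
+ * ufs_extattr_header).  As the XXX above notes, the two writes are not
+ * atomic, so a crash between them can leave a header describing stale
+ * data.
+ */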
+
+/*
+ * Real work associated with removing an extended attribute from a vnode.
+ * Assumes the attribute lock has already been grabbed.
+ */
+static int
+ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
+    kauth_cred_t cred, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *attribute;
+       struct ufs_extattr_header ueh;
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct iovec local_aiov;
+       struct uio local_aio;
+       off_t base_offset;
+       int error = 0, ioflag;
+
+       if (vp->v_mount->mnt_flag & MNT_RDONLY)  
+               return (EROFS);
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+               return (EOPNOTSUPP);
+       if (!ufs_extattr_valid_attrname(attrnamespace, name))
+               return (EINVAL);
+
+       error = extattr_check_cred(vp, attrnamespace, cred, l, IWRITE);
+       if (error)
+               return (error);
+
+       attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
+       if (!attribute)
+               return (ENOATTR);
+
+       /*
+        * Don't need to get a lock on the backing file if the removal is
+        * being applied to the backing file, as the lock is already held.
+        */
+       if (attribute->uele_backing_vnode != vp)
+               vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
+
+       error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
+       if (error)
+               goto vopunlock_exit;
+
+       /* Flag it as not in use. */
+       ueh.ueh_flags = 0;              /* No need to byte swap 0 */
+       ueh.ueh_len = 0;                /* ...ditto... */
+
+       local_aiov.iov_base = &ueh;
+       local_aiov.iov_len = sizeof(struct ufs_extattr_header);
+       local_aio.uio_iov = &local_aiov;
+       local_aio.uio_iovcnt = 1;
+       local_aio.uio_rw = UIO_WRITE;
+       local_aio.uio_offset = base_offset;
+       local_aio.uio_resid = sizeof(struct ufs_extattr_header);
+       UIO_SETUP_SYSSPACE(&local_aio);
+
+       ioflag = IO_NODELOCKED;
+       if (ufs_extattr_sync)
+               ioflag |= IO_SYNC;
+       error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
+           ump->um_extattr.uepm_ucred);
+       if (error)
+               goto vopunlock_exit;
+
+       if (local_aio.uio_resid != 0)
+               error = ENXIO;
+
+ vopunlock_exit:
+       VOP_UNLOCK(attribute->uele_backing_vnode);
+
+       return (error);
+}
+
+/*
+ * Called by UFS when an inode is no longer active and should have its
+ * attributes stripped.
+ */
+void
+ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l)
+{
+       struct ufs_extattr_list_entry *uele;
+       struct mount *mp = vp->v_mount;
+       struct ufsmount *ump = VFSTOUFS(mp);
+
+       /*
+        * If the extattr machinery is not yet initialized, we cannot take
+        * the lock.  We should not have any active vnodes on the fs if it
+        * is not yet initialized but is going to be, so this can go
+        * unlocked.
+        */
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
+               return;
+
+       ufs_extattr_uepm_lock(ump);
+
+       if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+               ufs_extattr_uepm_unlock(ump);
+               return;
+       }
+
+       LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
+               ufs_extattr_rm(vp, uele->uele_attrnamespace,
+                   uele->uele_attrname, lwp0.l_cred, l);
+
+       ufs_extattr_uepm_unlock(ump);
+}
+
+void
+ufs_extattr_init(void)
+{
+
+       malloc_type_attach(M_UFS_EXTATTR);
+}
+
+void
+ufs_extattr_done(void)
+{
+
+       malloc_type_detach(M_UFS_EXTATTR);
+}
diff --git a/sys/ufs/ufs/ufs_ihash.c b/sys/ufs/ufs/ufs_ihash.c
new file mode 100644 (file)
index 0000000..213f335
--- /dev/null
@@ -0,0 +1,191 @@
+/*     $NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $     */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_ihash.c,v 1.31 2011/06/12 03:36:02 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+
+/*
+ * Structures associated with inode caching.
+ */
+static LIST_HEAD(ihashhead, inode) *ihashtbl;
+static u_long  ihash;          /* size of hash table - 1 */
+#define INOHASH(device, inum)  (((device) + (inum)) & ihash)
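+
+/*
+ * Example with illustrative numbers: for a 256-bucket table ihash is
+ * 255, so INOHASH(0x1d00, 37) is (0x1d00 + 37) & 255 == 0x25.  Adding
+ * the device and inode numbers before masking spreads inodes from
+ * different devices across the buckets.
+ */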
+
+kmutex_t       ufs_ihash_lock;
+kmutex_t       ufs_hashlock;
+
+/*
+ * Initialize inode hash table.
+ */
+void
+ufs_ihashinit(void)
+{
+
+       mutex_init(&ufs_hashlock, MUTEX_DEFAULT, IPL_NONE);
+       mutex_init(&ufs_ihash_lock, MUTEX_DEFAULT, IPL_NONE);
+       ihashtbl = hashinit(desiredvnodes, HASH_LIST, true, &ihash);
+}
+
+/*
+ * Reinitialize inode hash table.
+ */
+
+void
+ufs_ihashreinit(void)
+{
+       struct inode *ip;
+       struct ihashhead *oldhash, *hash;
+       u_long oldmask, mask, val;
+       int i;
+
+       hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+       mutex_enter(&ufs_ihash_lock);
+       oldhash = ihashtbl;
+       oldmask = ihash;
+       ihashtbl = hash;
+       ihash = mask;
+       for (i = 0; i <= oldmask; i++) {
+               while ((ip = LIST_FIRST(&oldhash[i])) != NULL) {
+                       LIST_REMOVE(ip, i_hash);
+                       val = INOHASH(ip->i_dev, ip->i_number);
+                       LIST_INSERT_HEAD(&hash[val], ip, i_hash);
+               }
+       }
+       mutex_exit(&ufs_ihash_lock);
+       hashdone(oldhash, HASH_LIST, oldmask);
+}
+
+/*
+ * Free inode hash table.
+ */
+void
+ufs_ihashdone(void)
+{
+
+       hashdone(ihashtbl, HASH_LIST, ihash);
+       mutex_destroy(&ufs_hashlock);
+       mutex_destroy(&ufs_ihash_lock);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, return it, even if it is locked.
+ */
+struct vnode *
+ufs_ihashlookup(dev_t dev, ino_t inum)
+{
+       struct inode *ip;
+       struct ihashhead *ipp;
+
+       KASSERT(mutex_owned(&ufs_ihash_lock));
+
+       ipp = &ihashtbl[INOHASH(dev, inum)];
+       LIST_FOREACH(ip, ipp, i_hash) {
+               if (inum == ip->i_number && dev == ip->i_dev)
+                       break;
+       }
+       if (ip)
+               return (ITOV(ip));
+       return (NULLVP);
+}
+
+/*
+ * Use the device/inum pair to find the incore inode, and return a pointer
+ * to it. If it is in core, but locked, wait for it.
+ */
+struct vnode *
+ufs_ihashget(dev_t dev, ino_t inum, int flags)
+{
+       struct ihashhead *ipp;
+       struct inode *ip;
+       struct vnode *vp;
+
+ loop:
+       mutex_enter(&ufs_ihash_lock);
+       ipp = &ihashtbl[INOHASH(dev, inum)];
+       LIST_FOREACH(ip, ipp, i_hash) {
+               if (inum == ip->i_number && dev == ip->i_dev) {
+                       vp = ITOV(ip);
+                       if (flags == 0) {
+                               mutex_exit(&ufs_ihash_lock);
+                       } else {
+                               mutex_enter(vp->v_interlock);
+                               mutex_exit(&ufs_ihash_lock);
+                               if (vget(vp, flags))
+                                       goto loop;
+                       }
+                       return (vp);
+               }
+       }
+       mutex_exit(&ufs_ihash_lock);
+       return (NULL);
+}
+
+/*
+ * Insert the inode into the hash table, and return it locked.
+ */
+void
+ufs_ihashins(struct inode *ip)
+{
+       struct ihashhead *ipp;
+
+       KASSERT(mutex_owned(&ufs_hashlock));
+
+       /* lock the inode, then put it on the appropriate hash list */
+       VOP_LOCK(ITOV(ip), LK_EXCLUSIVE);
+
+       mutex_enter(&ufs_ihash_lock);
+       ipp = &ihashtbl[INOHASH(ip->i_dev, ip->i_number)];
+       LIST_INSERT_HEAD(ipp, ip, i_hash);
+       mutex_exit(&ufs_ihash_lock);
+}
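+
+/*
+ * Sketch of how a vget-style caller typically pairs ufs_ihashget()
+ * with ufs_ihashins(); ufs_hashlock serializes lookup-then-insert so
+ * two threads cannot instantiate the same inode twice (allocation and
+ * error handling elided):
+ *
+ * retry:
+ *     if ((vp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
+ *             return vp;
+ *     ... allocate and initialize a fresh vnode/inode ip ...
+ *     mutex_enter(&ufs_hashlock);
+ *     if (ufs_ihashget(dev, ino, 0) != NULL) {
+ *             mutex_exit(&ufs_hashlock);
+ *             ... undo the allocation ...
+ *             goto retry;
+ *     }
+ *     ufs_ihashins(ip);               (leaves the new inode locked)
+ *     mutex_exit(&ufs_hashlock);
+ */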
+
+/*
+ * Remove the inode from the hash table.
+ */
+void
+ufs_ihashrem(struct inode *ip)
+{
+       mutex_enter(&ufs_ihash_lock);
+       LIST_REMOVE(ip, i_hash);
+       mutex_exit(&ufs_ihash_lock);
+}
diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c
new file mode 100644 (file)
index 0000000..7a9eea4
--- /dev/null
@@ -0,0 +1,311 @@
+/*     $NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $       */
+
+/*
+ * Copyright (c) 1991, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.88 2011/09/20 14:01:33 chs Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#include "opt_wapbl.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/namei.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/kmem.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#ifdef UFS_EXTATTR
+#include <ufs/ufs/extattr.h>
+#endif
+
+#include <uvm/uvm.h>
+
+extern int prtactive;
+
+/*
+ * Last reference to an inode.  If necessary, write or delete it.
+ */
+int
+ufs_inactive(void *v)
+{
+       struct vop_inactive_args /* {
+               struct vnode *a_vp;
+               struct bool *a_recycle;
+       } */ *ap = v;
+       struct vnode *vp = ap->a_vp;
+       struct inode *ip = VTOI(vp);
+       struct mount *transmp;
+       mode_t mode;
+       int error = 0;
+       int logged = 0;
+
+       UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);
+
+       transmp = vp->v_mount;
+       fstrans_start(transmp, FSTRANS_LAZY);
+       /*
+        * Ignore inodes related to stale file handles.
+        */
+       if (ip->i_mode == 0)
+               goto out;
+       if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+#ifdef UFS_EXTATTR
+               ufs_extattr_vnode_inactive(vp, curlwp);
+#endif
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error)
+                       goto out;
+               logged = 1;
+               if (ip->i_size != 0) {
+                       /*
+                        * When journaling, only truncate one indirect block
+                        * at a time
+                        */
+                       if (vp->v_mount->mnt_wapbl) {
+                               uint64_t incr = MNINDIR(ip->i_ump) <<
+                                   vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+                               uint64_t base = NDADDR <<
+                                   vp->v_mount->mnt_fs_bshift;
+                               while (!error && ip->i_size > base + incr) {
+                                       /*
+                                        * round down to next full indirect
+                                        * block boundary.
+                                        */
+                                       uint64_t nsize = base +
+                                           ((ip->i_size - base - 1) &
+                                           ~(incr - 1));
+                                       error = UFS_TRUNCATE(vp, nsize, 0,
+                                           NOCRED);
+                                       if (error)
+                                               break;
+                                       UFS_WAPBL_END(vp->v_mount);
+                                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                                       if (error)
+                                               goto out;
+                               }
+                       }
+                       if (!error)
+                               error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED);
+               }
+#if defined(QUOTA) || defined(QUOTA2)
+               (void)chkiq(ip, -1, NOCRED, 0);
+#endif
+               DIP_ASSIGN(ip, rdev, 0);
+               mode = ip->i_mode;
+               ip->i_mode = 0;
+               ip->i_omode = mode;
+               DIP_ASSIGN(ip, mode, 0);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               /*
+                * Defer final inode free and update to ufs_reclaim().
+                */
+       }
+
+       if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
+               if (!logged++) {
+                       int err;
+                       err = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (err)
+                               goto out;
+               }
+               UFS_UPDATE(vp, NULL, NULL, 0);
+       }
+       if (logged)
+               UFS_WAPBL_END(vp->v_mount);
+out:
+       /*
+        * If we are done with the inode, reclaim it
+        * so that it can be reused immediately.
+        */
+       *ap->a_recycle = (ip->i_mode == 0);
+       VOP_UNLOCK(vp);
+       fstrans_done(transmp);
+       return (error);
+}
+
+/*
+ * Reclaim an inode so that it can be used for other purposes.
+ */
+int
+ufs_reclaim(struct vnode *vp)
+{
+       struct inode *ip = VTOI(vp);
+
+       if (prtactive && vp->v_usecount > 1)
+               vprint("ufs_reclaim: pushing active", vp);
+
+       if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
+               UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
+               UFS_WAPBL_END(vp->v_mount);
+       }
+       UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
+
+       /*
+        * Remove the inode from its hash chain.
+        */
+       ufs_ihashrem(ip);
+
+       if (ip->i_devvp) {
+               vrele(ip->i_devvp);
+               ip->i_devvp = 0;
+       }
+#if defined(QUOTA) || defined(QUOTA2)
+       ufsquota_free(ip);
+#endif
+#ifdef UFS_DIRHASH
+       if (ip->i_dirhash != NULL)
+               ufsdirhash_free(ip);
+#endif
+       return (0);
+}
+
+/*
+ * allocate a range of blocks in a file.
+ * after this function returns, any page entirely contained within the range
+ * will map to invalid data and thus must be overwritten before it is made
+ * accessible to others.
+ */
+
+int
+ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred,
+    int flags)
+{
+       off_t neweof;   /* file size after the operation */
+       off_t neweob;   /* offset next to the last block after the operation */
+       off_t pagestart; /* starting offset of range covered by pgs */
+       off_t eob;      /* offset next to allocated blocks */
+       struct uvm_object *uobj;
+       int i, delta, error, npages;
+       int bshift = vp->v_mount->mnt_fs_bshift;
+       int bsize = 1 << bshift;
+       int ppb = MAX(bsize >> PAGE_SHIFT, 1);
+       struct vm_page **pgs;
+       size_t pgssize;
+       UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
+       UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x",
+                   vp, off, len, vp->v_size);
+
+       neweof = MAX(vp->v_size, off + len);
+       GOP_SIZE(vp, neweof, &neweob, 0);
+
+       error = 0;
+       uobj = &vp->v_uobj;
+
+       /*
+        * read or create pages covering the range of the allocation and
+        * keep them locked until the new block is allocated, so there
+        * will be no window where the old contents of the new block are
+        * visible to racing threads.
+        */
+
+       pagestart = trunc_page(off) & ~(bsize - 1);
+       npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
+       pgssize = npages * sizeof(struct vm_page *);
+       pgs = kmem_zalloc(pgssize, KM_SLEEP);
+
+       /*
+        * adjust off to be block-aligned.
+        */
+
+       delta = off & (bsize - 1);
+       off -= delta;
+       len += delta;
+
+       genfs_node_wrlock(vp);
+       mutex_enter(uobj->vmobjlock);
+       error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
+           VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC |
+           PGO_NOTIMESTAMP | PGO_GLOCKHELD);
+       if (error) {
+               goto out;
+       }
+
+       /*
+        * now allocate the range.
+        */
+
+       error = GOP_ALLOC(vp, off, len, flags, cred);
+       genfs_node_unlock(vp);
+
+       /*
+        * if the allocation succeeded, clear PG_CLEAN on all the pages
+        * and clear PG_RDONLY on any pages that are now fully backed
+        * by disk blocks.  if the allocation failed, we do not invalidate
+        * the pages since they might have already existed and been dirty,
+        * in which case we need to keep them around.  if we created the pages,
+        * they will be clean and read-only, and leaving such pages
+        * in the cache won't cause any problems.
+        */
+
+       GOP_SIZE(vp, off + len, &eob, 0);
+       mutex_enter(uobj->vmobjlock);
+       mutex_enter(&uvm_pageqlock);
+       for (i = 0; i < npages; i++) {
+               KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
+               if (!error) {
+                       if (off <= pagestart + (i << PAGE_SHIFT) &&
+                           pagestart + ((i + 1) << PAGE_SHIFT) <= eob) {
+                               pgs[i]->flags &= ~PG_RDONLY;
+                       }
+                       pgs[i]->flags &= ~PG_CLEAN;
+               }
+               uvm_pageactivate(pgs[i]);
+       }
+       mutex_exit(&uvm_pageqlock);
+       uvm_page_unbusy(pgs, npages);
+       mutex_exit(uobj->vmobjlock);
+
+ out:
+       kmem_free(pgs, pgssize);
+       return error;
+}
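+
+/*
+ * Worked example of the alignment step above (illustrative sizes):
+ * with bsize = 8192, a request of off = 12000, len = 100 gives
+ * delta = 12000 & 8191 = 3808, so the adjusted range becomes
+ * off = 8192, len = 3908: the allocation is widened to start on a
+ * block boundary while still covering every byte the caller asked for.
+ */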
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
new file mode 100644 (file)
index 0000000..aa395de
--- /dev/null
@@ -0,0 +1,1500 @@
+/*     $NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $        */
+
+/*
+ * Copyright (c) 1989, 1993
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_lookup.c        8.9 (Berkeley) 8/11/94
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.111 2011/07/17 22:07:59 dholland Exp $");
+
+#ifdef _KERNEL_OPT
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+#include <sys/proc.h>
+#include <sys/kmem.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#ifdef DIAGNOSTIC
+int    dirchk = 1;
+#else
+int    dirchk = 0;
+#endif
+
+#define        FSFMT(vp)       (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * Convert a component of a pathname into a pointer to a locked inode.
+ * This is a very central and rather complicated routine.
+ * If the file system is not maintained in a strict tree hierarchy,
+ * this can result in a deadlock situation (see comments in code below).
+ *
+ * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
+ * on whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it and the target of the pathname
+ * exists, lookup returns both the target and its parent directory locked.
+ * When creating or renaming and LOCKPARENT is specified, the target may
+ * not be ".".  When deleting and LOCKPARENT is specified, the target may
+ * be "."., but the caller must check to ensure it does an vrele and vput
+ * instead of two vputs.
+ *
+ * Overall outline of ufs_lookup:
+ *
+ *     check accessibility of directory
+ *     look for name in cache, if found, then if at end of path
+ *       and deleting or creating, drop it, else return name
+ *     search for name in directory, to found or notfound
+ * notfound:
+ *     if creating, return locked directory, leaving info on available slots
+ *     else return error
+ * found:
+ *     if at end of path and deleting, return information to allow delete
+ *     if at end of path and rewriting (RENAME and LOCKPARENT), lock target
+ *       inode and return info to allow rewrite
+ *     if not at end, add name to cache; if at end and neither creating
+ *       nor deleting, add name to cache
+ */
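+/*
+ * A concrete instance of the outline above (offsets illustrative): on
+ * a pure LOOKUP the search resumes at the cached ulr_diroff, say 4096,
+ * and scans to the end of the directory; if the name was not seen, it
+ * restarts at offset 0 and scans up to 4096 (this two-pass strategy is
+ * counted in nchstats.ncs_2passes).  A CREATE or RENAME of the last
+ * component instead scans from 0 while tracking free slots at least
+ * DIRECTSIZ(cn_namelen) bytes wide.
+ */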
+int
+ufs_lookup(void *v)
+{
+       struct vop_lookup_args /* {
+               struct vnode *a_dvp;
+               struct vnode **a_vpp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *vdp = ap->a_dvp;  /* vnode for directory being searched */
+       struct inode *dp = VTOI(vdp);   /* inode for directory being searched */
+       struct buf *bp;                 /* a buffer of directory entries */
+       struct direct *ep;              /* the current directory entry */
+       int entryoffsetinblock;         /* offset of ep in bp's buffer */
+       enum {NONE, COMPACT, FOUND} slotstatus;
+       doff_t slotoffset;              /* offset of area with free space */
+       int slotsize;                   /* size of area at slotoffset */
+       int slotfreespace;              /* amount of space free in slot */
+       int slotneeded;                 /* size of the entry we're seeking */
+       int numdirpasses;               /* strategy for directory search */
+       doff_t endsearch;               /* offset to end directory search */
+       doff_t prevoff;                 /* prev entry dp->i_offset */
+       struct vnode *pdp;              /* saved dp during symlink work */
+       struct vnode *tdp;              /* returned by VFS_VGET */
+       doff_t enduseful;               /* pointer past last used dir slot */
+       u_long bmask;                   /* block offset mask */
+       int namlen, error;
+       struct vnode **vpp = ap->a_vpp;
+       struct componentname *cnp = ap->a_cnp;
+       kauth_cred_t cred = cnp->cn_cred;
+       int flags;
+       int nameiop = cnp->cn_nameiop;
+       struct ufsmount *ump = dp->i_ump;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       int dirblksiz = ump->um_dirblksiz;
+       ino_t foundino;
+       struct ufs_lookup_results *results;
+
+       flags = cnp->cn_flags;
+
+       bp = NULL;
+       slotoffset = -1;
+       *vpp = NULL;
+       endsearch = 0; /* silence compiler warning */
+
+       /*
+        * Produce the auxiliary lookup results into i_crap. Increment
+        * its serial number so elsewhere we can tell if we're using
+        * stale results. This should not be done this way. XXX.
+        */
+       results = &dp->i_crap;
+       dp->i_crapcounter++;
+
+       /*
+        * Check accessibility of directory.
+        */
+       if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
+               return (error);
+
+       if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
+           (nameiop == DELETE || nameiop == RENAME))
+               return (EROFS);
+
+       /*
+        * We now have a segment name to search for, and a directory to search.
+        *
+        * Before tediously performing a linear scan of the directory,
+        * check the name cache to see if the directory/name pair
+        * we are looking for is known already.
+        */
+       if ((error = cache_lookup(vdp, vpp, cnp)) >= 0) {
+               return (error);
+       }
+
+       fstrans_start(vdp->v_mount, FSTRANS_SHARED);
+
+       /*
+        * Suppress search for slots unless creating
+        * file and at end of pathname, in which case
+        * we watch for a place to put the new file in
+        * case it doesn't already exist.
+        */
+       slotstatus = FOUND;
+       slotfreespace = slotsize = slotneeded = 0;
+       if ((nameiop == CREATE || nameiop == RENAME) &&
+           (flags & ISLASTCN)) {
+               slotstatus = NONE;
+               slotneeded = DIRECTSIZ(cnp->cn_namelen);
+       }
+
+       /*
+        * If there is cached information on a previous search of
+        * this directory, pick up where we last left off.
+        * We cache only lookups as these are the most common
+        * and have the greatest payoff. Caching CREATE has little
+        * benefit as it usually must search the entire directory
+        * to determine that the entry does not exist. Caching the
+        * location of the last DELETE or RENAME has not reduced
+        * profiling time and hence has been removed in the interest
+        * of simplicity.
+        */
+       bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
+
+#ifdef UFS_DIRHASH
+       /*
+        * Use dirhash for fast operations on large directories. The logic
+        * to determine whether to hash the directory is contained within
+        * ufsdirhash_build(); a zero return means that it decided to hash
+        * this directory and it successfully built up the hash table.
+        */
+       if (ufsdirhash_build(dp) == 0) {
+               /* Look for a free slot if needed. */
+               enduseful = dp->i_size;
+               if (slotstatus != FOUND) {
+                       slotoffset = ufsdirhash_findfree(dp, slotneeded,
+                           &slotsize);
+                       if (slotoffset >= 0) {
+                               slotstatus = COMPACT;
+                               enduseful = ufsdirhash_enduseful(dp);
+                               if (enduseful < 0)
+                                       enduseful = dp->i_size;
+                       }
+               }
+               /* Look up the component. */
+               numdirpasses = 1;
+               entryoffsetinblock = 0; /* silence compiler warning */
+               switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
+                   &results->ulr_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) {
+               case 0:
+                       ep = (struct direct *)((char *)bp->b_data +
+                           (results->ulr_offset & bmask));
+                       goto foundentry;
+               case ENOENT:
+                       results->ulr_offset = roundup(dp->i_size, dirblksiz);
+                       goto notfound;
+               default:
+                       /* Something failed; just do a linear search. */
+                       break;
+               }
+       }
+#endif /* UFS_DIRHASH */
+
+       if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
+           results->ulr_diroff >= dp->i_size) {
+               entryoffsetinblock = 0;
+               results->ulr_offset = 0;
+               numdirpasses = 1;
+       } else {
+               results->ulr_offset = results->ulr_diroff;
+               if ((entryoffsetinblock = results->ulr_offset & bmask) &&
+                   (error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
+                   NULL, &bp, false)))
+                       goto out;
+               numdirpasses = 2;
+               nchstats.ncs_2passes++;
+       }
+       prevoff = results->ulr_offset;
+       endsearch = roundup(dp->i_size, dirblksiz);
+       enduseful = 0;
+
+searchloop:
+       while (results->ulr_offset < endsearch) {
+               if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD)
+                       preempt();
+               /*
+                * If necessary, get the next directory block.
+                */
+               if ((results->ulr_offset & bmask) == 0) {
+                       if (bp != NULL)
+                               brelse(bp, 0);
+                       error = ufs_blkatoff(vdp, (off_t)results->ulr_offset, NULL,
+                           &bp, false);
+                       if (error)
+                               goto out;
+                       entryoffsetinblock = 0;
+               }
+               /*
+                * If still looking for a slot, and at a DIRBLKSIZ
+                * boundary, have to start looking for free space again.
+                */
+               if (slotstatus == NONE &&
+                   (entryoffsetinblock & (dirblksiz - 1)) == 0) {
+                       slotoffset = -1;
+                       slotfreespace = 0;
+               }
+               /*
+                * Get pointer to next entry.
+                * Full validation checks are slow, so we only check
+                * enough to ensure forward progress through the
+                * directory. Complete checks can be run by patching
+                * "dirchk" to be true.
+                */
+               KASSERT(bp != NULL);
+               ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock);
+               if (ep->d_reclen == 0 ||
+                   (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) {
+                       int i;
+
+                       ufs_dirbad(dp, results->ulr_offset, "mangled entry");
+                       i = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
+                       results->ulr_offset += i;
+                       entryoffsetinblock += i;
+                       continue;
+               }
+
+               /*
+                * If an appropriate sized slot has not yet been found,
+                * check to see if one is available. Also accumulate space
+                * in the current block so that we can determine if
+                * compaction is viable.
+                */
+               if (slotstatus != FOUND) {
+                       int size = ufs_rw16(ep->d_reclen, needswap);
+
+                       if (ep->d_ino != 0)
+                               size -= DIRSIZ(FSFMT(vdp), ep, needswap);
+                       if (size > 0) {
+                               if (size >= slotneeded) {
+                                       slotstatus = FOUND;
+                                       slotoffset = results->ulr_offset;
+                                       slotsize = ufs_rw16(ep->d_reclen,
+                                           needswap);
+                               } else if (slotstatus == NONE) {
+                                       slotfreespace += size;
+                                       if (slotoffset == -1)
+                                               slotoffset = results->ulr_offset;
+                                       if (slotfreespace >= slotneeded) {
+                                               slotstatus = COMPACT;
+                                               slotsize = results->ulr_offset +
+                                                   ufs_rw16(ep->d_reclen,
+                                                            needswap) -
+                                                   slotoffset;
+                                       }
+                               }
+                       }
+               }
+
+               /*
+                * Check for a name match.
+                */
+               if (ep->d_ino) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+                       if (FSFMT(vdp) && needswap == 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+#else
+                       if (FSFMT(vdp) && needswap != 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+#endif
+                       if (namlen == cnp->cn_namelen &&
+                           !memcmp(cnp->cn_nameptr, ep->d_name,
+                           (unsigned)namlen)) {
+#ifdef UFS_DIRHASH
+foundentry:
+#endif
+                               /*
+                                * Save directory entry's inode number and
+                                * reclen in ndp->ni_ufs area, and release
+                                * directory buffer.
+                                */
+                               if (!FSFMT(vdp) && ep->d_type == DT_WHT) {
+                                       slotstatus = FOUND;
+                                       slotoffset = results->ulr_offset;
+                                       slotsize = ufs_rw16(ep->d_reclen,
+                                           needswap);
+                                       results->ulr_reclen = slotsize;
+                                       /*
+                                        * This is used to set results->ulr_endoff,
+                                        * which may be used by ufs_direnter2()
+                                        * as a length to truncate the
+                                        * directory to.  Therefore, it must
+                                        * point past the end of the last
+                                        * non-empty directory entry.  We don't
+                                        * know where that is in this case, so
+                                        * we effectively disable shrinking by
+                                        * using the existing size of the
+                                        * directory.
+                                        *
+                                        * Note that we wouldn't expect to
+                                        * shrink the directory while rewriting
+                                        * an existing entry anyway.
+                                        */
+                                       enduseful = endsearch;
+                                       ap->a_cnp->cn_flags |= ISWHITEOUT;
+                                       numdirpasses--;
+                                       goto notfound;
+                               }
+                               foundino = ufs_rw32(ep->d_ino, needswap);
+                               results->ulr_reclen = ufs_rw16(ep->d_reclen, needswap);
+                               goto found;
+                       }
+               }
+               prevoff = results->ulr_offset;
+               results->ulr_offset += ufs_rw16(ep->d_reclen, needswap);
+               entryoffsetinblock += ufs_rw16(ep->d_reclen, needswap);
+               if (ep->d_ino)
+                       enduseful = results->ulr_offset;
+       }
+notfound:
+       /*
+        * If we started in the middle of the directory and failed
+        * to find our target, we must check the beginning as well.
+        */
+       if (numdirpasses == 2) {
+               numdirpasses--;
+               results->ulr_offset = 0;
+               endsearch = results->ulr_diroff;
+               goto searchloop;
+       }
+       if (bp != NULL)
+               brelse(bp, 0);
+       /*
+        * If creating, and at end of pathname and current
+        * directory has not been removed, then can consider
+        * allowing file to be created.
+        */
+       if ((nameiop == CREATE || nameiop == RENAME ||
+            (nameiop == DELETE &&
+             (ap->a_cnp->cn_flags & DOWHITEOUT) &&
+             (ap->a_cnp->cn_flags & ISWHITEOUT))) &&
+           (flags & ISLASTCN) && dp->i_nlink != 0) {
+               /*
+                * Access for write is interpreted as allowing
+                * creation of files in the directory.
+                */
+               error = VOP_ACCESS(vdp, VWRITE, cred);
+               if (error)
+                       goto out;
+               /*
+                * Return an indication of where the new directory
+                * entry should be put.  If we didn't find a slot,
+                * then set results->ulr_count to 0 indicating
+                * that the new slot belongs at the end of the
+                * directory. If we found a slot, then the new entry
+                * can be put in the range from results->ulr_offset to
+                * results->ulr_offset + results->ulr_count.
+                */
+               if (slotstatus == NONE) {
+                       results->ulr_offset = roundup(dp->i_size, dirblksiz);
+                       results->ulr_count = 0;
+                       enduseful = results->ulr_offset;
+               } else if (nameiop == DELETE) {
+                       results->ulr_offset = slotoffset;
+                       if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+                               results->ulr_count = 0;
+                       else
+                               results->ulr_count = results->ulr_offset - prevoff;
+               } else {
+                       results->ulr_offset = slotoffset;
+                       results->ulr_count = slotsize;
+                       if (enduseful < slotoffset + slotsize)
+                               enduseful = slotoffset + slotsize;
+               }
+               results->ulr_endoff = roundup(enduseful, dirblksiz);
+#if 0 /* commented out by dbj. none of the on disk fields changed */
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+#endif
+               /*
+                * We return with the directory locked, so that
+                * the parameters we set up above will still be
+                * valid if we actually decide to do a direnter().
+                * We return ni_vp == NULL to indicate that the entry
+                * does not currently exist; we leave a pointer to
+                * the (locked) directory inode in ndp->ni_dvp.
+                *
+                * NB - if the directory is unlocked, then this
+                * information cannot be used.
+                */
+               error = EJUSTRETURN;
+               goto out;
+       }
+       /*
+        * Insert name into cache (as non-existent) if appropriate.
+        */
+       if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+               cache_enter(vdp, *vpp, cnp);
+       error = ENOENT;
+       goto out;
+
+found:
+       if (numdirpasses == 2)
+               nchstats.ncs_pass2++;
+       /*
+        * Check that directory length properly reflects presence
+        * of this entry.
+        */
+       if (results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap) > dp->i_size) {
+               ufs_dirbad(dp, results->ulr_offset, "i_size too small");
+               dp->i_size = results->ulr_offset + DIRSIZ(FSFMT(vdp), ep, needswap);
+               DIP_ASSIGN(dp, size, dp->i_size);
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+               UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
+       }
+       brelse(bp, 0);
+
+       /*
+        * Found component in pathname.
+        * If the final component of path name, save information
+        * in the cache as to where the entry was found.
+        */
+       if ((flags & ISLASTCN) && nameiop == LOOKUP)
+               results->ulr_diroff = results->ulr_offset &~ (dirblksiz - 1);
+
+       /*
+        * If deleting, and at end of pathname, return
+        * parameters which can be used to remove file.
+        * Lock the inode, being careful with ".".
+        */
+       if (nameiop == DELETE && (flags & ISLASTCN)) {
+               /*
+                * Write access to directory required to delete files.
+                */
+               error = VOP_ACCESS(vdp, VWRITE, cred);
+               if (error)
+                       goto out;
+               /*
+                * Return pointer to current entry in results->ulr_offset,
+                * and distance past previous entry (if there
+                * is a previous entry in this block) in results->ulr_count.
+                * Save directory inode pointer in ndp->ni_dvp for dirremove().
+                */
+               if ((results->ulr_offset & (dirblksiz - 1)) == 0)
+                       results->ulr_count = 0;
+               else
+                       results->ulr_count = results->ulr_offset - prevoff;
+               if (dp->i_number == foundino) {
+                       vref(vdp);
+                       *vpp = vdp;
+                       error = 0;
+                       goto out;
+               }
+               if (flags & ISDOTDOT)
+                       VOP_UNLOCK(vdp); /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (flags & ISDOTDOT)
+                       vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error)
+                       goto out;
+               /*
+                * If directory is "sticky", then user must own
+                * the directory, or the file in it, else she
+                * may not delete it (unless she's root). This
+                * implements append-only directories.
+                */
+               if ((dp->i_mode & ISVTX) &&
+                   kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+                    NULL) != 0 &&
+                   kauth_cred_geteuid(cred) != dp->i_uid &&
+                   VTOI(tdp)->i_uid != kauth_cred_geteuid(cred)) {
+                       vput(tdp);
+                       error = EPERM;
+                       goto out;
+               }
+               *vpp = tdp;
+               error = 0;
+               goto out;
+       }
+
+       /*
+        * If rewriting (RENAME), return the inode and the
+        * information required to rewrite the present directory
+        * Must get inode of directory entry to verify it's a
+        * regular file, or empty directory.
+        */
+       if (nameiop == RENAME && (flags & ISLASTCN)) {
+               error = VOP_ACCESS(vdp, VWRITE, cred);
+               if (error)
+                       goto out;
+               /*
+                * Careful about locking second inode.
+                * This can only occur if the target is ".".
+                */
+               if (dp->i_number == foundino) {
+                       error = EISDIR;
+                       goto out;
+               }
+               if (flags & ISDOTDOT)
+                       VOP_UNLOCK(vdp); /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (flags & ISDOTDOT)
+                       vn_lock(vdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error)
+                       goto out;
+               *vpp = tdp;
+               error = 0;
+               goto out;
+       }
+
+       /*
+        * Step through the translation in the name.  We do not `vput' the
+        * directory because we may need it again if a symbolic link
+        * is relative to the current directory.  Instead we save it
+        * unlocked as "pdp".  We must get the target inode before unlocking
+        * the directory to ensure that the inode will not be removed
+        * before we get it.  We prevent deadlock by always fetching
+        * inodes from the root, moving down the directory tree. Thus
+        * when following backward pointers ".." we must unlock the
+        * parent directory before getting the requested directory.
+        * There is a potential race condition here if both the current
+        * and parent directories are removed before the VFS_VGET for the
+        * inode associated with ".." returns.  We hope that this occurs
+        * infrequently since we cannot avoid this race condition without
+        * implementing a sophisticated deadlock detection algorithm.
+        * Note also that this simple deadlock detection scheme will not
+        * work if the file system has any hard links other than ".."
+        * that point backwards in the directory structure.
+        */
+       pdp = vdp;
+       if (flags & ISDOTDOT) {
+               VOP_UNLOCK(pdp);        /* race to get the inode */
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY);
+               if (error) {
+                       goto out;
+               }
+               *vpp = tdp;
+       } else if (dp->i_number == foundino) {
+               vref(vdp);      /* we want ourself, ie "." */
+               *vpp = vdp;
+       } else {
+               error = VFS_VGET(vdp->v_mount, foundino, &tdp);
+               if (error)
+                       goto out;
+               *vpp = tdp;
+       }
+
+       /*
+        * Insert name into cache if appropriate.
+        */
+       if (cnp->cn_flags & MAKEENTRY)
+               cache_enter(vdp, *vpp, cnp);
+       error = 0;
+
+out:
+       fstrans_done(vdp->v_mount);
+       return error;
+}
+
+void
+ufs_dirbad(struct inode *ip, doff_t offset, const char *how)
+{
+       struct mount *mp;
+
+       mp = ITOV(ip)->v_mount;
+       printf("%s: bad dir ino %llu at offset %d: %s\n",
+           mp->mnt_stat.f_mntonname, (unsigned long long)ip->i_number,
+           offset, how);
+       if ((mp->mnt_stat.f_flag & MNT_RDONLY) == 0)
+               panic("bad dir");
+}
+
+/*
+ * Do consistency checking on a directory entry:
+ *     record length must be multiple of 4
+ *     entry must fit in rest of its DIRBLKSIZ block
+ *     record must be large enough to contain entry
+ *     name is not longer than FFS_MAXNAMLEN
+ *     name must be as long as advertised, and null terminated
+ */
+int
+ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock)
+{
+       int i;
+       int namlen;
+       struct ufsmount *ump = VFSTOUFS(dp->v_mount);
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       int dirblksiz = ump->um_dirblksiz;
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+       if (FSFMT(dp) && needswap == 0)
+               namlen = ep->d_type;
+       else
+               namlen = ep->d_namlen;
+#else
+       if (FSFMT(dp) && needswap != 0)
+               namlen = ep->d_type;
+       else
+               namlen = ep->d_namlen;
+#endif
+       if ((ufs_rw16(ep->d_reclen, needswap) & 0x3) != 0 ||
+           ufs_rw16(ep->d_reclen, needswap) >
+               dirblksiz - (entryoffsetinblock & (dirblksiz - 1)) ||
+           ufs_rw16(ep->d_reclen, needswap) <
+               DIRSIZ(FSFMT(dp), ep, needswap) ||
+           namlen > FFS_MAXNAMLEN) {
+               /*return (1); */
+               printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, "
+                       "flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n",
+                       ufs_rw16(ep->d_reclen, needswap),
+                       (u_long)DIRSIZ(FSFMT(dp), ep, needswap),
+                       namlen, dp->v_mount->mnt_flag, entryoffsetinblock,
+                       dirblksiz);
+               goto bad;
+       }
+       if (ep->d_ino == 0)
+               return (0);
+       for (i = 0; i < namlen; i++)
+               if (ep->d_name[i] == '\0') {
+                       /*return (1); */
+                       printf("Second bad\n");
+                       goto bad;
+               }
+       if (ep->d_name[i])
+               goto bad;
+       return (0);
+bad:
+       return (1);
+}
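+
+/*
+ * Illustrative sketch (editor's addition, not part of the imported
+ * source): the reclen checks above rest on DIRSIZ-style arithmetic.
+ * Assuming the usual 8-byte fixed header of struct direct (d_ino,
+ * d_reclen, d_type, d_namlen), the smallest legal record for a name
+ * is the header plus the NUL-terminated name, rounded up to 4 bytes:
+ */
+#if 0
+static unsigned int
+min_reclen(unsigned int namlen)
+{
+       /* 8-byte header + name + NUL, rounded to a 4-byte boundary */
+       return (8 + namlen + 1 + 3) & ~3u;
+}
+/* e.g. "." -> 12, ".." -> 12, a 5-char name -> 16 */
+#endif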
+
+/*
+ * Construct a new directory entry after a call to namei, using the
+ * name in the componentname argument cnp. The argument ip is the
+ * inode to which the new directory entry will refer.
+ */
+void
+ufs_makedirentry(struct inode *ip, struct componentname *cnp,
+    struct direct *newdirp)
+{
+       newdirp->d_ino = ip->i_number;
+       newdirp->d_namlen = cnp->cn_namelen;
+       memcpy(newdirp->d_name, cnp->cn_nameptr, (size_t)cnp->cn_namelen);
+       newdirp->d_name[cnp->cn_namelen] = '\0';
+       if (FSFMT(ITOV(ip)))
+               newdirp->d_type = 0;
+       else
+               newdirp->d_type = IFTODT(ip->i_mode);
+}
+
+/*
+ * Write a directory entry after a call to namei, using the parameters
+ * that ufs_lookup left in nameidata and in the ufs_lookup_results.
+ *
+ * DVP is the directory to be updated. It must be locked.
+ * ULR is the ufs_lookup_results structure from the final lookup step.
+ * TVP is not used. (XXX: why is it here? remove it)
+ * DIRP is the new directory entry contents.
+ * CNP is the componentname from the final lookup step.
+ * NEWDIRBP is not used and (XXX) should be removed. The previous
+ * comment here said it was used by the now-removed softupdates code.
+ *
+ * The link count of the target inode is *not* incremented; the
+ * caller does that.
+ *
+ * If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
+ * directory entry. ulr_offset, which is the place to put the entry,
+ * should be on a block boundary (and should be at the end of the
+ * directory AFAIK) and a fresh block is allocated to put the new
+ * directory entry in.
+ *
+ * If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
+ * the entry into. This slot ranges from ulr_offset to ulr_offset +
+ * ulr_count. However, this slot may already be partially populated
+ * requiring compaction. See notes below.
+ *
+ * Furthermore, if ulr_count is not zero and ulr_endoff is not the
+ * same as i_size, the directory is truncated to size ulr_endoff.
+ */
+int
+ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+    struct vnode *tvp, struct direct *dirp,
+    struct componentname *cnp, struct buf *newdirbp)
+{
+       kauth_cred_t cr;
+       struct lwp *l;
+       int newentrysize;
+       struct inode *dp;
+       struct buf *bp;
+       u_int dsize;
+       struct direct *ep, *nep;
+       int error, ret, blkoff, loc, spacefree;
+       char *dirbuf;
+       struct timespec ts;
+       struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       int dirblksiz = ump->um_dirblksiz;
+
+       UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
+       error = 0;
+       cr = cnp->cn_cred;
+       l = curlwp;
+
+       dp = VTOI(dvp);
+       newentrysize = DIRSIZ(0, dirp, 0);
+
+#if 0
+       struct ufs_lookup_results *ulr;
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+#endif
+
+       if (ulr->ulr_count == 0) {
+               /*
+                * If ulr_count is 0, then namei could find no
+                * space in the directory. Here, ulr_offset will
+                * be on a directory block boundary and we will write the
+                * new entry into a fresh block.
+                */
+               if (ulr->ulr_offset & (dirblksiz - 1))
+                       panic("ufs_direnter: newblk");
+               if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
+                   cr, B_CLRBUF | B_SYNC, &bp)) != 0) {
+                       return (error);
+               }
+               dp->i_size = ulr->ulr_offset + dirblksiz;
+               DIP_ASSIGN(dp, size, dp->i_size);
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+               uvm_vnp_setsize(dvp, dp->i_size);
+               dirp->d_reclen = ufs_rw16(dirblksiz, needswap);
+               dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
+               if (FSFMT(dvp)) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+                       if (needswap == 0) {
+#else
+                       if (needswap != 0) {
+#endif
+                               u_char tmp = dirp->d_namlen;
+                               dirp->d_namlen = dirp->d_type;
+                               dirp->d_type = tmp;
+                       }
+               }
+               blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
+               memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
+#ifdef UFS_DIRHASH
+               if (dp->i_dirhash != NULL) {
+                       ufsdirhash_newblk(dp, ulr->ulr_offset);
+                       ufsdirhash_add(dp, dirp, ulr->ulr_offset);
+                       ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
+                           ulr->ulr_offset);
+               }
+#endif
+               error = VOP_BWRITE(bp->b_vp, bp);
+               vfs_timestamp(&ts);
+               ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP);
+               if (error == 0)
+                       return (ret);
+               return (error);
+       }
+
+       /*
+        * If ulr_count is non-zero, then namei found space for the new
+        * entry in the range ulr_offset to ulr_offset + ulr_count
+        * in the directory. To use this space, we may have to compact
+        * the entries located there, by copying them together towards the
+        * beginning of the block, leaving the free space in one usable
+        * chunk at the end.
+        */
+
+       /*
+        * Increase size of directory if entry eats into new space.
+        * This should never push the size past a new multiple of
+        * DIRBLKSIZ.
+        *
+        * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
+        */
+       if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
+#ifdef DIAGNOSTIC
+               printf("ufs_direnter: reached 4.2-only block, "
+                      "not supposed to happen\n");
+#endif
+               dp->i_size = ulr->ulr_offset + ulr->ulr_count;
+               DIP_ASSIGN(dp, size, dp->i_size);
+               dp->i_flag |= IN_CHANGE | IN_UPDATE;
+               UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+       }
+       /*
+        * Get the block containing the space for the new directory entry.
+        */
+       error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
+       if (error) {
+               return (error);
+       }
+       /*
+        * Find space for the new entry. In the simple case, the entry
+        * at ulr->ulr_offset will have the space. If it does not, then
+        * namei arranged that compacting the region ulr->ulr_offset to
+        * ulr->ulr_offset + ulr->ulr_count would yield the space.
+        */
+       ep = (struct direct *)dirbuf;
+       dsize = (ep->d_ino != 0) ?  DIRSIZ(FSFMT(dvp), ep, needswap) : 0;
+       spacefree = ufs_rw16(ep->d_reclen, needswap) - dsize;
+       for (loc = ufs_rw16(ep->d_reclen, needswap); loc < ulr->ulr_count; ) {
+               uint16_t reclen;
+
+               nep = (struct direct *)(dirbuf + loc);
+
+               /* Trim the existing slot (NB: dsize may be zero). */
+               ep->d_reclen = ufs_rw16(dsize, needswap);
+               ep = (struct direct *)((char *)ep + dsize);
+
+               reclen = ufs_rw16(nep->d_reclen, needswap);
+               loc += reclen;
+               if (nep->d_ino == 0) {
+                       /*
+                        * A mid-block unused entry. Such entries are
+                        * never created by the kernel, but fsck_ffs
+                        * can create them (and it doesn't fix them).
+                        *
+                        * Add up the free space, and initialise the
+                        * relocated entry since we don't memcpy it.
+                        */
+                       spacefree += reclen;
+                       ep->d_ino = 0;
+                       dsize = 0;
+                       continue;
+               }
+               dsize = DIRSIZ(FSFMT(dvp), nep, needswap);
+               spacefree += reclen - dsize;
+#ifdef UFS_DIRHASH
+               if (dp->i_dirhash != NULL)
+                       ufsdirhash_move(dp, nep,
+                           ulr->ulr_offset + ((char *)nep - dirbuf),
+                           ulr->ulr_offset + ((char *)ep - dirbuf));
+#endif
+               memcpy((void *)ep, (void *)nep, dsize);
+       }
+       /*
+        * Here, `ep' points to a directory entry containing `dsize' in-use
+        * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
+        * then the entry is completely unused (dsize == 0). The value
+        * of ep->d_reclen is always indeterminate.
+        *
+        * Update the pointer fields in the previous entry (if any),
+        * copy in the new entry, and write out the block.
+        */
+       if (ep->d_ino == 0 ||
+           (ufs_rw32(ep->d_ino, needswap) == WINO &&
+            memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
+               if (spacefree + dsize < newentrysize)
+                       panic("ufs_direnter: compact1");
+               dirp->d_reclen = spacefree + dsize;
+       } else {
+               if (spacefree < newentrysize)
+                       panic("ufs_direnter: compact2");
+               dirp->d_reclen = spacefree;
+               ep->d_reclen = ufs_rw16(dsize, needswap);
+               ep = (struct direct *)((char *)ep + dsize);
+       }
+       dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap);
+       dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
+       if (FSFMT(dvp)) {
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+               if (needswap == 0) {
+#else
+               if (needswap != 0) {
+#endif
+                       u_char tmp = dirp->d_namlen;
+                       dirp->d_namlen = dirp->d_type;
+                       dirp->d_type = tmp;
+               }
+       }
+#ifdef UFS_DIRHASH
+       if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
+           dirp->d_reclen == spacefree))
+               ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
+#endif
+       memcpy((void *)ep, (void *)dirp, (u_int)newentrysize);
+#ifdef UFS_DIRHASH
+       if (dp->i_dirhash != NULL)
+               ufsdirhash_checkblock(dp, dirbuf -
+                   (ulr->ulr_offset & (dirblksiz - 1)),
+                   ulr->ulr_offset & ~(dirblksiz - 1));
+#endif
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= IN_CHANGE | IN_UPDATE;
+       /*
+        * If all went well, and the directory can be shortened, proceed
+        * with the truncation. Note that we have to unlock the inode for
+        * the entry that we just entered, as the truncation may need to
+        * lock other inodes which can lead to deadlock if we also hold a
+        * lock on the newly entered node.
+        */
+       if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
+#ifdef UFS_DIRHASH
+               if (dp->i_dirhash != NULL)
+                       ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
+#endif
+               (void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
+       }
+       UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+       return (error);
+}
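+
+/*
+ * Illustrative sketch (editor's addition): the compaction loop above
+ * keeps `ep' at the head of a region holding `dsize' live bytes
+ * followed by `spacefree' free bytes.  Ignoring byte order, the free
+ * space that compaction gathers into one chunk is simply:
+ */
+#if 0
+struct slot { unsigned reclen, used; }; /* used = DIRSIZ(entry), 0 if free */
+
+static unsigned
+gathered_space(const struct slot *s, unsigned n)
+{
+       unsigned i, spacefree = 0;
+
+       for (i = 0; i < n; i++)         /* slots covering ulr_count bytes */
+               spacefree += s[i].reclen - s[i].used;
+       return spacefree;               /* contiguous once entries are packed */
+}
+#endif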
+
+/*
+ * Remove a directory entry after a call to namei, using the
+ * parameters that ufs_lookup left in nameidata and in the
+ * ufs_lookup_results.
+ *
+ * DVP is the directory to be updated. It must be locked.
+ * ULR is the ufs_lookup_results structure from the final lookup step.
+ * IP, if not null, is the inode being unlinked.
+ * FLAGS may contain DOWHITEOUT.
+ * ISRMDIR is not used and (XXX) should be removed.
+ *
+ * If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
+ * instead of being cleared.
+ *
+ * ulr->ulr_offset contains the position of the directory entry
+ * to be removed.
+ *
+ * ulr->ulr_reclen contains the size of the directory entry to be
+ * removed.
+ *
+ * ulr->ulr_count contains the size of the *previous* directory
+ * entry. This allows finding it, for free space management. If
+ * ulr_count is 0, the target entry is at the beginning of the
+ * directory. (Does this ever happen? The first entry should be ".",
+ * which should only be removed at rmdir time. Does rmdir come here
+ * to clear out the "." and ".." entries? Perhaps, but I doubt it.)
+ *
+ * The space is marked free by adding it to the record length (not
+ * name length) of the preceding entry. If the first entry becomes
+ * free, it is marked free by setting the inode number to 0.
+ *
+ * The link count of IP is decremented. Note that this is not the
+ * inverse behavior of ufs_direnter, which does not adjust link
+ * counts. Sigh.
+ */
+int
+ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
+             struct inode *ip, int flags, int isrmdir)
+{
+       struct inode *dp = VTOI(dvp);
+       struct direct *ep;
+       struct buf *bp;
+       int error;
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
+#endif
+
+       UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
+       if (flags & DOWHITEOUT) {
+               /*
+                * Whiteout entry: set d_ino to WINO.
+                */
+               error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, (void *)&ep,
+                                    &bp, true);
+               if (error)
+                       return (error);
+               ep->d_ino = ufs_rw32(WINO, needswap);
+               ep->d_type = DT_WHT;
+               goto out;
+       }
+
+       if ((error = ufs_blkatoff(dvp,
+           (off_t)(ulr->ulr_offset - ulr->ulr_count), (void *)&ep, &bp, true)) != 0)
+               return (error);
+
+#ifdef UFS_DIRHASH
+       /*
+        * Remove the dirhash entry. This is complicated by the fact
+        * that `ep' is the previous entry when dp->i_count != 0.
+        */
+       if (dp->i_dirhash != NULL)
+               ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
+                  (struct direct *)((char *)ep +
+                  ufs_rw16(ep->d_reclen, needswap)), ulr->ulr_offset);
+#endif
+
+       if (ulr->ulr_count == 0) {
+               /*
+                * First entry in block: set d_ino to zero.
+                */
+               ep->d_ino = 0;
+       } else {
+               /*
+                * Collapse new free space into previous entry.
+                */
+               ep->d_reclen =
+                   ufs_rw16(ufs_rw16(ep->d_reclen, needswap) + ulr->ulr_reclen,
+                       needswap);
+       }
+
+#ifdef UFS_DIRHASH
+       if (dp->i_dirhash != NULL) {
+               int dirblksiz = ip->i_ump->um_dirblksiz;
+               ufsdirhash_checkblock(dp, (char *)ep -
+                   ((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
+                   ulr->ulr_offset & ~(dirblksiz - 1));
+       }
+#endif
+
+out:
+       if (ip) {
+               ip->i_nlink--;
+               DIP_ASSIGN(ip, nlink, ip->i_nlink);
+               ip->i_flag |= IN_CHANGE;
+               UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
+       }
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= IN_CHANGE | IN_UPDATE;
+       /*
+        * If the last named reference to a snapshot goes away,
+        * drop its snapshot reference so that it will be reclaimed
+        * when last open reference goes away.
+        */
+       if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
+           ip->i_nlink == 0)
+               ffs_snapgone(ip);
+       UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
+       return (error);
+}
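+
+/*
+ * Illustrative sketch (editor's addition): as the comment above says,
+ * removal never moves bytes; it grows the preceding record to swallow
+ * the dead entry (byte-swapping omitted here):
+ */
+#if 0
+static void
+collapse_into_prev(uint16_t *prev_reclen, uint16_t dead_reclen)
+{
+       *prev_reclen += dead_reclen;    /* freed space now belongs to prev */
+}
+#endif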
+
+/*
+ * Rewrite an existing directory entry to point at the inode supplied.
+ *
+ * DP is the directory to update.
+ * OFFSET is the position of the entry in question. It may come
+ * from ulr_offset of a ufs_lookup_results.
+ * OIP is the old inode the directory previously pointed to.
+ * NEWINUM is the number of the new inode.
+ * NEWTYPE is the new value for the type field of the directory entry.
+ * (This is ignored if the fs doesn't support that.)
+ * ISRMDIR is not used and (XXX) should be removed.
+ * IFLAGS are added to DP's inode flags.
+ *
+ * The link count of OIP is decremented. Note that the link count of
+ * the new inode is *not* incremented. Yay for symmetry.
+ */
+int
+ufs_dirrewrite(struct inode *dp, off_t offset,
+    struct inode *oip, ino_t newinum, int newtype,
+    int isrmdir, int iflags)
+{
+       struct buf *bp;
+       struct direct *ep;
+       struct vnode *vdp = ITOV(dp);
+       int error;
+
+       error = ufs_blkatoff(vdp, offset, (void *)&ep, &bp, true);
+       if (error)
+               return (error);
+       ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
+       if (!FSFMT(vdp))
+               ep->d_type = newtype;
+       oip->i_nlink--;
+       DIP_ASSIGN(oip, nlink, oip->i_nlink);
+       oip->i_flag |= IN_CHANGE;
+       UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
+       error = VOP_BWRITE(bp->b_vp, bp);
+       dp->i_flag |= iflags;
+       /*
+        * If the last named reference to a snapshot goes away,
+        * drop its snapshot reference so that it will be reclaimed
+        * when last open reference goes away.
+        */
+       if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0)
+               ffs_snapgone(oip);
+       UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
+       return (error);
+}
+
+/*
+ * Check if a directory is empty or not.
+ * Inode supplied must be locked.
+ *
+ * Using a struct dirtemplate here is not precisely
+ * what we want, but better than using a struct direct.
+ *
+ * NB: does not handle corrupted directories.
+ */
+int
+ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
+{
+       doff_t off;
+       struct dirtemplate dbuf;
+       struct direct *dp = (struct direct *)&dbuf;
+       int error, namlen;
+       size_t count;
+       const int needswap = UFS_IPNEEDSWAP(ip);
+#define        MINDIRSIZ (sizeof (struct dirtemplate) / 2)
+
+       for (off = 0; off < ip->i_size;
+           off += ufs_rw16(dp->d_reclen, needswap)) {
+               error = vn_rdwr(UIO_READ, ITOV(ip), (void *)dp, MINDIRSIZ, off,
+                  UIO_SYSSPACE, IO_NODELOCKED, cred, &count, NULL);
+               /*
+                * Since we read MINDIRSIZ, residual must
+                * be 0 unless we're at end of file.
+                */
+               if (error || count != 0)
+                       return (0);
+               /* avoid infinite loops */
+               if (dp->d_reclen == 0)
+                       return (0);
+               /* skip empty entries */
+               if (dp->d_ino == 0 || ufs_rw32(dp->d_ino, needswap) == WINO)
+                       continue;
+               /* accept only "." and ".." */
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+               if (FSFMT(ITOV(ip)) && needswap == 0)
+                       namlen = dp->d_type;
+               else
+                       namlen = dp->d_namlen;
+#else
+               if (FSFMT(ITOV(ip)) && needswap != 0)
+                       namlen = dp->d_type;
+               else
+                       namlen = dp->d_namlen;
+#endif
+               if (namlen > 2)
+                       return (0);
+               if (dp->d_name[0] != '.')
+                       return (0);
+               /*
+                * At this point namlen must be 1 or 2.
+                * 1 implies ".", 2 implies ".." if second
+                * char is also "."
+                */
+               if (namlen == 1 &&
+                   ufs_rw32(dp->d_ino, needswap) == ip->i_number)
+                       continue;
+               if (dp->d_name[1] == '.' &&
+                   ufs_rw32(dp->d_ino, needswap) == parentino)
+                       continue;
+               return (0);
+       }
+       return (1);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
+int
+ufs_checkpath(struct inode *source, struct inode *target, kauth_cred_t cred)
+{
+       struct vnode *nextvp, *vp;
+       int error, rootino, namlen;
+       struct dirtemplate dirbuf;
+       const int needswap = UFS_MPNEEDSWAP(target->i_ump);
+
+       vp = ITOV(target);
+       if (target->i_number == source->i_number) {
+               error = EEXIST;
+               goto out;
+       }
+       rootino = ROOTINO;
+       error = 0;
+       if (target->i_number == rootino)
+               goto out;
+
+       for (;;) {
+               if (vp->v_type != VDIR) {
+                       error = ENOTDIR;
+                       break;
+               }
+               error = vn_rdwr(UIO_READ, vp, (void *)&dirbuf,
+                   sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
+                   IO_NODELOCKED, cred, NULL, NULL);
+               if (error != 0)
+                       break;
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+               if (FSFMT(vp) && needswap == 0)
+                       namlen = dirbuf.dotdot_type;
+               else
+                       namlen = dirbuf.dotdot_namlen;
+#else
+               if (FSFMT(vp) && needswap != 0)
+                       namlen = dirbuf.dotdot_type;
+               else
+                       namlen = dirbuf.dotdot_namlen;
+#endif
+               if (namlen != 2 ||
+                   dirbuf.dotdot_name[0] != '.' ||
+                   dirbuf.dotdot_name[1] != '.') {
+                       error = ENOTDIR;
+                       break;
+               }
+               if (ufs_rw32(dirbuf.dotdot_ino, needswap) == source->i_number) {
+                       error = EINVAL;
+                       break;
+               }
+               if (ufs_rw32(dirbuf.dotdot_ino, needswap) == rootino)
+                       break;
+               VOP_UNLOCK(vp);
+               error = VFS_VGET(vp->v_mount,
+                   ufs_rw32(dirbuf.dotdot_ino, needswap), &nextvp);
+               vrele(vp);
+               if (error) {
+                       vp = NULL;
+                       break;
+               }
+               vp = nextvp;
+       }
+
+out:
+       if (error == ENOTDIR)
+               printf("checkpath: .. not a directory\n");
+       if (vp != NULL)
+               vput(vp);
+       return (error);
+}
+
+/*
+ * Extract the inode number of ".." from a directory.
+ * Helper for ufs_parentcheck.
+ */
+static int
+ufs_readdotdot(struct vnode *vp, int needswap, kauth_cred_t cred, ino_t *result)
+{
+       struct dirtemplate dirbuf;
+       int namlen, error;
+
+       error = vn_rdwr(UIO_READ, vp, &dirbuf,
+                   sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE,
+                   IO_NODELOCKED, cred, NULL, NULL);
+       if (error) {
+               return error;
+       }
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+       if (FSFMT(vp) && needswap == 0)
+               namlen = dirbuf.dotdot_type;
+       else
+               namlen = dirbuf.dotdot_namlen;
+#else
+       if (FSFMT(vp) && needswap != 0)
+               namlen = dirbuf.dotdot_type;
+       else
+               namlen = dirbuf.dotdot_namlen;
+#endif
+       if (namlen != 2 ||
+           dirbuf.dotdot_name[0] != '.' ||
+           dirbuf.dotdot_name[1] != '.') {
+               printf("ufs_readdotdot: directory %llu contains "
+                      "garbage instead of ..\n",
+                      (unsigned long long) VTOI(vp)->i_number);
+               return ENOTDIR;
+       }
+       *result = ufs_rw32(dirbuf.dotdot_ino, needswap);
+       return 0;
+}
+
+/*
+ * Check if LOWER is a descendant of UPPER. If we find UPPER, return
+ * nonzero in FOUND and return a reference to the immediate descendant
+ * of UPPER in UPPERCHILD. If we don't find UPPER (that is, if we
+ * reach the volume root and that isn't UPPER), return zero in FOUND
+ * and null in UPPERCHILD.
+ *
+ * Neither UPPER nor LOWER should be locked.
+ *
+ * On error (such as a permissions error checking up the directory
+ * tree) fail entirely.
+ *
+ * Note that UPPER and LOWER must be on the same volume, and because
+ * we inspect only that volume NEEDSWAP can be constant.
+ */
+int
+ufs_parentcheck(struct vnode *upper, struct vnode *lower, kauth_cred_t cred,
+               int *found_ret, struct vnode **upperchild_ret)
+{
+       const int needswap = UFS_MPNEEDSWAP(VTOI(lower)->i_ump);
+       ino_t upper_ino, found_ino;
+       struct vnode *current, *next;
+       int error;
+
+       if (upper == lower) {
+               vref(upper);
+               *found_ret = 1;
+               *upperchild_ret = upper;
+               return 0;
+       }
+       if (VTOI(lower)->i_number == ROOTINO) {
+               *found_ret = 0;
+               *upperchild_ret = NULL;
+               return 0;
+       }
+
+       upper_ino = VTOI(upper)->i_number;
+
+       current = lower;
+       vref(current);
+       vn_lock(current, LK_EXCLUSIVE | LK_RETRY);
+
+       for (;;) {
+               error = ufs_readdotdot(current, needswap, cred, &found_ino);
+               if (error) {
+                       vput(current);
+                       return error;
+               }
+               if (found_ino == upper_ino) {
+                       VOP_UNLOCK(current);
+                       *found_ret = 1;
+                       *upperchild_ret = current;
+                       return 0;
+               }
+               if (found_ino == ROOTINO) {
+                       vput(current);
+                       *found_ret = 0;
+                       *upperchild_ret = NULL;
+                       return 0;
+               }
+               VOP_UNLOCK(current);
+               error = VFS_VGET(current->v_mount, found_ino, &next);
+               if (error) {
+                       vrele(current);
+                       return error;
+               }
+               KASSERT(VOP_ISLOCKED(next));
+               if (next->v_type != VDIR) {
+                       printf("ufs_parentcheck: inode %llu reached via .. of "
+                              "inode %llu is not a directory\n",
+                           (unsigned long long)VTOI(next)->i_number,
+                           (unsigned long long)VTOI(current)->i_number);
+                       vput(next);
+                       vrele(current);
+                       return ENOTDIR;
+               }
+               vrele(current);
+               current = next;
+       }
+
+       return 0;
+}
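+
+/*
+ * Illustrative sketch (editor's addition): a rename(2)-style caller
+ * might use this to refuse moving a directory under its own subtree.
+ * fvp, tdvp and cred are assumed to be in scope; the EINVAL mirrors
+ * ufs_checkpath above.
+ */
+#if 0
+int found;
+struct vnode *child;
+
+error = ufs_parentcheck(fvp, tdvp, cred, &found, &child);
+if (error == 0 && found) {
+       vrele(child);           /* drop the returned reference */
+       error = EINVAL;         /* target lies below the source */
+}
+#endif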
+
+#define        UFS_DIRRABLKS 0
+int ufs_dirrablks = UFS_DIRRABLKS;
+
+/*
+ * ufs_blkatoff: Return buffer with the contents of block "offset" from
+ * the beginning of directory "vp".  If "res" is non-zero, fill it in with
+ * a pointer to the remaining space in the directory.  If the caller intends
+ * to modify the buffer returned, "modify" must be true.
+ */
+
+int
+ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp,
+    bool modify)
+{
+       struct inode *ip;
+       struct buf *bp;
+       daddr_t lbn;
+       const int dirrablks = ufs_dirrablks;
+       daddr_t *blks;
+       int *blksizes;
+       int run, error;
+       struct mount *mp = vp->v_mount;
+       const int bshift = mp->mnt_fs_bshift;
+       const int bsize = 1 << bshift;
+       off_t eof;
+
+       blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
+       blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
+       ip = VTOI(vp);
+       KASSERT(vp->v_size == ip->i_size);
+       GOP_SIZE(vp, vp->v_size, &eof, 0);
+       lbn = offset >> bshift;
+
+       for (run = 0; run <= dirrablks;) {
+               const off_t curoff = lbn << bshift;
+               const int size = MIN(eof - curoff, bsize);
+
+               if (size == 0) {
+                       break;
+               }
+               KASSERT(curoff < eof);
+               blks[run] = lbn;
+               blksizes[run] = size;
+               lbn++;
+               run++;
+               if (size != bsize) {
+                       break;
+               }
+       }
+       KASSERT(run >= 1);
+       error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1],
+           run - 1, NOCRED, (modify ? B_MODIFY : 0), &bp);
+       if (error != 0) {
+               brelse(bp, 0);
+               *bpp = NULL;
+               goto out;
+       }
+       if (res) {
+               *res = (char *)bp->b_data + (offset & (bsize - 1));
+       }
+       *bpp = bp;
+
+ out:
+       kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
+       kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
+       return error;
+}
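+
+/*
+ * Illustrative sketch (editor's addition): typical use, as in
+ * ufs_direnter above -- map a directory offset to kernel memory,
+ * modify it, and write the buffer back:
+ */
+#if 0
+char *dirbuf;
+struct buf *bp;
+
+error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
+if (error == 0) {
+       /* ... edit struct direct entries at dirbuf ... */
+       error = VOP_BWRITE(bp->b_vp, bp);
+}
+#endif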
diff --git a/sys/ufs/ufs/ufs_quota.c b/sys/ufs/ufs/ufs_quota.c
new file mode 100644 (file)
index 0000000..78cef57
--- /dev/null
@@ -0,0 +1,877 @@
+/*     $NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $    */
+
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Robert Elz at The University of Melbourne.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota.c,v 1.70 2011/03/24 17:05:46 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#endif 
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+#include <quota/quotaprop.h>
+
+kmutex_t dqlock;
+kcondvar_t dqcv;
+
+/*
+ * Code pertaining to management of the in-core dquot data structures.
+ */
+#define DQHASH(dqvp, id) \
+       (((((long)(dqvp)) >> 8) + id) & dqhash)
+static LIST_HEAD(dqhashhead, dquot) *dqhashtbl;
+static u_long dqhash;
+static pool_cache_t dquot_cache;
+
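+/*
+ * Illustrative sketch (editor's addition): DQHASH folds the quota
+ * file's vnode pointer with the user/group id, so dquots for distinct
+ * quota files land in distinct buckets even when ids collide.  A
+ * lookup (dqvp and id assumed in scope) would walk one bucket:
+ */
+#if 0
+struct dqhashhead *dqh = &dqhashtbl[DQHASH(dqvp, id)];
+struct dquot *dq;
+
+LIST_FOREACH(dq, dqh, dq_hash)
+       if (dq->dq_id == id &&
+           dq->dq_ump->um_quotas[dq->dq_type] == dqvp)
+               break;                  /* cached dquot found */
+#endif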
+
+static int quota_handle_cmd_get_version(struct mount *, struct lwp *,
+    prop_dictionary_t, prop_array_t);
+static int quota_handle_cmd_get(struct mount *, struct lwp *,
+    prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_set(struct mount *, struct lwp *,
+    prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_getall(struct mount *, struct lwp *,
+    prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_clear(struct mount *, struct lwp *,
+    prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_quotaon(struct mount *, struct lwp *, 
+    prop_dictionary_t, int, prop_array_t);
+static int quota_handle_cmd_quotaoff(struct mount *, struct lwp *, 
+    prop_dictionary_t, int, prop_array_t);
+/*
+ * Initialize the quota fields of an inode.
+ */
+void
+ufsquota_init(struct inode *ip)
+{
+       int i;
+
+       for (i = 0; i < MAXQUOTAS; i++)
+               ip->i_dquot[i] = NODQUOT;
+}
+
+/*
+ * Release the quota fields from an inode.
+ */
+void
+ufsquota_free(struct inode *ip)
+{
+       int i;
+
+       for (i = 0; i < MAXQUOTAS; i++) {
+               dqrele(ITOV(ip), ip->i_dquot[i]);
+               ip->i_dquot[i] = NODQUOT;
+       }
+}
+
+/*
+ * Update disk usage, and take corrective action.
+ */
+int
+chkdq(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+       /* do not track snapshot usage, or we will deadlock */
+       if ((ip->i_flags & SF_SNAPSHOT) != 0)
+               return 0;
+
+#ifdef QUOTA
+       if (ip->i_ump->um_flags & UFS_QUOTA)
+               return chkdq1(ip, change, cred, flags);
+#endif
+#ifdef QUOTA2
+       if (ip->i_ump->um_flags & UFS_QUOTA2)
+               return chkdq2(ip, change, cred, flags);
+#endif
+       return 0;
+}
+
+/*
+ * Check the inode limit, applying corrective action.
+ */
+int
+chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+       /* do not track snapshot usage, or we will deadlock */
+       if ((ip->i_flags & SF_SNAPSHOT) != 0)
+               return 0;
+#ifdef QUOTA
+       if (ip->i_ump->um_flags & UFS_QUOTA)
+               return chkiq1(ip, change, cred, flags);
+#endif
+#ifdef QUOTA2
+       if (ip->i_ump->um_flags & UFS_QUOTA2)
+               return chkiq2(ip, change, cred, flags);
+#endif
+       return 0;
+}
+
+int
+quota_handle_cmd(struct mount *mp, struct lwp *l, prop_dictionary_t cmddict)
+{
+       int error = 0;
+       const char *cmd, *type;
+       prop_array_t datas;
+       int q2type;
+
+       if (!prop_dictionary_get_cstring_nocopy(cmddict, "command", &cmd))
+               return EINVAL;
+       if (!prop_dictionary_get_cstring_nocopy(cmddict, "type", &type))
+               return EINVAL;
+       if (!strcmp(type, QUOTADICT_CLASS_USER)) {
+               q2type = USRQUOTA;
+       } else if (!strcmp(type, QUOTADICT_CLASS_GROUP)) {
+               q2type = GRPQUOTA;
+       } else
+               return EOPNOTSUPP;
+       datas = prop_dictionary_get(cmddict, "data");
+       if (datas == NULL || prop_object_type(datas) != PROP_TYPE_ARRAY)
+               return EINVAL;
+
+       prop_object_retain(datas);
+       prop_dictionary_remove(cmddict, "data"); /* prepare for return */
+
+       if (strcmp(cmd, "get version") == 0) {
+               error = quota_handle_cmd_get_version(mp, l, cmddict, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "quotaon") == 0) {
+               error = quota_handle_cmd_quotaon(mp, l, cmddict,
+                   q2type, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "quotaoff") == 0) {
+               error = quota_handle_cmd_quotaoff(mp, l, cmddict,
+                   q2type, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "get") == 0) {
+               error = quota_handle_cmd_get(mp, l, cmddict, q2type, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "set") == 0) {
+               error = quota_handle_cmd_set(mp, l, cmddict, q2type, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "getall") == 0) {
+               error = quota_handle_cmd_getall(mp, l, cmddict, q2type, datas);
+               goto end;
+       }
+       if (strcmp(cmd, "clear") == 0) {
+               error = quota_handle_cmd_clear(mp, l, cmddict, q2type, datas);
+               goto end;
+       }
+       error = EOPNOTSUPP;
+end:
+       error = (prop_dictionary_set_int8(cmddict, "return",
+           error) ? 0 : ENOMEM);
+       prop_object_release(datas);
+       return error;
+}
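+
+/*
+ * Illustrative sketch (editor's addition): a caller-side view of the
+ * command dictionary this dispatcher expects, built only from the keys
+ * read above ("command", "type", "data").  prop_dictionary_set_cstring
+ * is assumed available alongside the proplib calls used in this file.
+ */
+#if 0
+prop_dictionary_t cmd = prop_dictionary_create();
+prop_array_t data = prop_array_create();
+
+prop_dictionary_set_cstring(cmd, "command", "get version");
+prop_dictionary_set_cstring(cmd, "type", QUOTADICT_CLASS_USER);
+prop_dictionary_set_and_rel(cmd, "data", data); /* empty for this cmd */
+/*
+ * After quota_handle_cmd(mp, l, cmd), "data" holds the replies and
+ * an int8 "return" carries the error code.
+ */
+#endif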
+
+static int 
+quota_handle_cmd_get_version(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, prop_array_t datas)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       prop_array_t replies;
+       prop_dictionary_t data;
+       int error = 0;
+
+       if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+               return EOPNOTSUPP;
+
+       replies = prop_array_create();
+       if (replies == NULL)
+               return ENOMEM;
+
+       data = prop_dictionary_create();
+       if (data == NULL) {
+               prop_object_release(replies);
+               return ENOMEM;
+       }
+
+#ifdef QUOTA
+       if (ump->um_flags & UFS_QUOTA) {
+               if (!prop_dictionary_set_int8(data, "version", 1))
+                       error = ENOMEM;
+       } else
+#endif
+#ifdef QUOTA2
+       if (ump->um_flags & UFS_QUOTA2) {
+               if (!prop_dictionary_set_int8(data, "version", 2))
+                       error = ENOMEM;
+       } else
+#endif
+               error = 0;
+       if (error)
+               prop_object_release(data);
+       else if (!prop_array_add_and_rel(replies, data))
+               error = ENOMEM;
+       if (error)
+               prop_object_release(replies);
+       else if (!prop_dictionary_set_and_rel(cmddict, "data", replies))
+               error = ENOMEM;
+       return error;
+}
+
+/* XXX shouldn't all this be in kauth ? */
+static int
+quota_get_auth(struct mount *mp, struct lwp *l, uid_t id)
+{
+       /* A user can always query their own quota. */
+       if (id == kauth_cred_getuid(l->l_cred))
+               return 0;
+       return kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+           KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(id), NULL);
+}
+
+static int 
+quota_handle_cmd_get(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       prop_array_t replies;
+       prop_object_iterator_t iter;
+       prop_dictionary_t data;
+       uint32_t id;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error, defaultq = 0;
+       const char *idstr;
+
+       if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+               return EOPNOTSUPP;
+       
+       replies = prop_array_create();
+       if (replies == NULL)
+               return ENOMEM;
+
+       iter = prop_array_iterator(datas);
+       if (iter == NULL) {
+               prop_object_release(replies);
+               return ENOMEM;
+       }
+       while ((data = prop_object_iterator_next(iter)) != NULL) {
+               if (!prop_dictionary_get_uint32(data, "id", &id)) {
+                       if (!prop_dictionary_get_cstring_nocopy(data, "id",
+                           &idstr))
+                               continue;
+                       if (strcmp(idstr, "default")) {
+                               error = EINVAL;
+                               goto err;
+                       }
+                       id = 0;
+                       defaultq = 1;
+               } else {
+                       defaultq = 0;
+               }
+               error = quota_get_auth(mp, l, id);
+               if (error == EPERM)
+                       continue;
+               if (error != 0) 
+                       goto err;
+#ifdef QUOTA
+               if (ump->um_flags & UFS_QUOTA)
+                       error = quota1_handle_cmd_get(ump, type, id, defaultq,
+                           replies);
+               else
+#endif
+#ifdef QUOTA2
+               if (ump->um_flags & UFS_QUOTA2) {
+                       error = quota2_handle_cmd_get(ump, type, id, defaultq,
+                           replies);
+               } else
+#endif
+                       panic("quota_handle_cmd_get: no support ?");
+               
+               if (error == ENOENT)
+                       continue;
+               if (error != 0)
+                       goto err;
+       }
+       prop_object_iterator_release(iter);
+       if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+               error = ENOMEM;
+       } else {
+               error = 0;
+       }
+       return error;
+err:
+       prop_object_iterator_release(iter);
+       prop_object_release(replies);
+       return error;
+}
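+
+/*
+ * Illustrative sketch (editor's addition): each element of "data"
+ * names one id, either numeric or the default record, matching the
+ * prop_dictionary_get_uint32/"default" handling above:
+ */
+#if 0
+prop_dictionary_t d1 = prop_dictionary_create();
+prop_dictionary_t d2 = prop_dictionary_create();
+
+prop_dictionary_set_uint32(d1, "id", 1000);            /* uid 1000 */
+prop_dictionary_set_cstring(d2, "id", "default");      /* default quota */
+prop_array_add_and_rel(datas, d1);
+prop_array_add_and_rel(datas, d2);
+#endif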
+
+static int 
+quota_handle_cmd_set(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       prop_array_t replies;
+       prop_object_iterator_t iter;
+       prop_dictionary_t data;
+       uint32_t id;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error, defaultq = 0;
+       const char *idstr;
+
+       if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0)
+               return EOPNOTSUPP;
+       
+       replies = prop_array_create();
+       if (replies == NULL)
+               return ENOMEM;
+
+       iter = prop_array_iterator(datas);
+       if (iter == NULL) {
+               prop_object_release(replies);
+               return ENOMEM;
+       }
+       while ((data = prop_object_iterator_next(iter)) != NULL) {
+               if (!prop_dictionary_get_uint32(data, "id", &id)) {
+                       if (!prop_dictionary_get_cstring_nocopy(data, "id",
+                           &idstr))
+                               continue;
+                       if (strcmp(idstr, "default"))
+                               continue;
+                       id = 0;
+                       defaultq = 1;
+               } else {
+                       defaultq = 0;
+               }
+               error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL);
+               if (error != 0)
+                       goto err;
+#ifdef QUOTA
+               if (ump->um_flags & UFS_QUOTA)
+                       error = quota1_handle_cmd_set(ump, type, id, defaultq,
+                           data);
+               else
+#endif
+#ifdef QUOTA2
+               if (ump->um_flags & UFS_QUOTA2) {
+                       error = quota2_handle_cmd_set(ump, type, id, defaultq,
+                           data);
+               } else
+#endif
+                       panic("quota_handle_cmd_set: no support ?");
+               
+               if (error && error != ENOENT)
+                       goto err;
+       }
+       prop_object_iterator_release(iter);
+       if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+               error = ENOMEM;
+       } else {
+               error = 0;
+       }
+       return error;
+err:
+       prop_object_iterator_release(iter);
+       prop_object_release(replies);
+       return error;
+}
+
+static int 
+quota_handle_cmd_clear(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       prop_array_t replies;
+       prop_object_iterator_t iter;
+       prop_dictionary_t data;
+       uint32_t id;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error, defaultq = 0;
+       const char *idstr;
+
+       if ((ump->um_flags & UFS_QUOTA2) == 0)
+               return EOPNOTSUPP;
+       
+       replies = prop_array_create();
+       if (replies == NULL)
+               return ENOMEM;
+
+       iter = prop_array_iterator(datas);
+       if (iter == NULL) {
+               prop_object_release(replies);
+               return ENOMEM;
+       }
+       while ((data = prop_object_iterator_next(iter)) != NULL) {
+               if (!prop_dictionary_get_uint32(data, "id", &id)) {
+                       if (!prop_dictionary_get_cstring_nocopy(data, "id",
+                           &idstr))
+                               continue;
+                       if (strcmp(idstr, "default"))
+                               continue;
+                       id = 0;
+                       defaultq = 1;
+               } else {
+                       defaultq = 0;
+               }
+               error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(id), NULL);
+               if (error != 0)
+                       goto err;
+#ifdef QUOTA2
+               if (ump->um_flags & UFS_QUOTA2) {
+                       error = quota2_handle_cmd_clear(ump, type, id, defaultq,
+                           data);
+               } else
+#endif
+                       panic("quota_handle_cmd_clear: no support ?");
+               
+               if (error && error != ENOENT)
+                       goto err;
+       }
+       prop_object_iterator_release(iter);
+       if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+               error = ENOMEM;
+       } else {
+               error = 0;
+       }
+       return error;
+err:
+       prop_object_iterator_release(iter);
+       prop_object_release(replies);
+       return error;
+}
+
+static int 
+quota_handle_cmd_getall(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       prop_array_t replies;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       if ((ump->um_flags & UFS_QUOTA2) == 0)
+               return EOPNOTSUPP;
+       
+       error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+           KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, NULL, NULL);
+       if (error)
+               return error;
+               
+       replies = prop_array_create();
+       if (replies == NULL)
+               return ENOMEM;
+
+#ifdef QUOTA2
+       if (ump->um_flags & UFS_QUOTA2) {
+               error = quota2_handle_cmd_getall(ump, type, replies);
+       } else
+#endif
+               panic("quota_handle_cmd_getall: no support ?");
+       if (!prop_dictionary_set_and_rel(cmddict, "data", replies)) {
+               error = ENOMEM;
+       } else {
+               error = 0;
+       }
+       return error;
+}
+
+static int 
+quota_handle_cmd_quotaon(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       prop_dictionary_t data;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+       const char *qfile;
+
+       if ((ump->um_flags & UFS_QUOTA2) != 0)
+               return EBUSY;
+       
+       if (prop_array_count(datas) != 1)
+               return EINVAL;
+
+       data = prop_array_get(datas, 0);
+       if (data == NULL)
+               return ENOMEM;
+       if (!prop_dictionary_get_cstring_nocopy(data, "quotafile",
+           &qfile))
+               return EINVAL;
+
+       error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+           KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+       if (error != 0) {
+               return error;
+       }
+#ifdef QUOTA
+       error = quota1_handle_cmd_quotaon(l, ump, type, qfile);
+#else
+       error = EOPNOTSUPP;
+#endif
+       
+       return error;
+}
+
+static int 
+quota_handle_cmd_quotaoff(struct mount *mp, struct lwp *l, 
+    prop_dictionary_t cmddict, int type, prop_array_t datas)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       if ((ump->um_flags & UFS_QUOTA2) != 0)
+               return EOPNOTSUPP;
+       
+       if (prop_array_count(datas) != 0)
+               return EINVAL;
+
+       error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+           KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+       if (error != 0) {
+               return error;
+       }
+#ifdef QUOTA
+       error = quota1_handle_cmd_quotaoff(l, ump, type);
+#else
+       error = EOPNOTSUPP;
+#endif
+       
+       return error;
+}
+
+/*
+ * Initialize the quota system.
+ */
+void
+dqinit(void)
+{
+
+       mutex_init(&dqlock, MUTEX_DEFAULT, IPL_NONE);
+       cv_init(&dqcv, "quota");
+       dqhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &dqhash);
+       dquot_cache = pool_cache_init(sizeof(struct dquot), 0, 0, 0, "ufsdq",
+           NULL, IPL_NONE, NULL, NULL, NULL);
+}
+
+void
+dqreinit(void)
+{
+       struct dquot *dq;
+       struct dqhashhead *oldhash, *hash;
+       struct vnode *dqvp;
+       u_long oldmask, mask, hashval;
+       int i;
+
+       hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
+       mutex_enter(&dqlock);
+       oldhash = dqhashtbl;
+       oldmask = dqhash;
+       dqhashtbl = hash;
+       dqhash = mask;
+       for (i = 0; i <= oldmask; i++) {
+               while ((dq = LIST_FIRST(&oldhash[i])) != NULL) {
+                       dqvp = dq->dq_ump->um_quotas[dq->dq_type];
+                       LIST_REMOVE(dq, dq_hash);
+                       hashval = DQHASH(dqvp, dq->dq_id);
+                       LIST_INSERT_HEAD(&dqhashtbl[hashval], dq, dq_hash);
+               }
+       }
+       mutex_exit(&dqlock);
+       hashdone(oldhash, HASH_LIST, oldmask);
+}
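+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * rehashing dqreinit() performs above, reduced to a minimal userland
+ * sketch.  It assumes power-of-two bucket counts, so "key & mask"
+ * selects a chain; dqreinit() does the same move under dqlock, which
+ * the sketch leaves out.  All names here are invented.
+ */
+#if 0
+#include <stdlib.h>
+
+struct entry {
+       struct entry *next;
+       unsigned long key;
+};
+
+static struct entry **
+rehash(struct entry **oldtbl, unsigned long oldmask, unsigned long newmask)
+{
+       struct entry **newtbl = calloc(newmask + 1, sizeof(*newtbl));
+       struct entry *e;
+       unsigned long i;
+
+       if (newtbl == NULL)
+               return NULL;
+       for (i = 0; i <= oldmask; i++) {
+               while ((e = oldtbl[i]) != NULL) {
+                       oldtbl[i] = e->next;    /* unlink from old chain */
+                       e->next = newtbl[e->key & newmask];
+                       newtbl[e->key & newmask] = e;   /* push on new chain */
+               }
+       }
+       free(oldtbl);
+       return newtbl;
+}
+#endif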
+
+/*
+ * Free resources held by quota system.
+ */
+void
+dqdone(void)
+{
+
+       pool_cache_destroy(dquot_cache);
+       hashdone(dqhashtbl, HASH_LIST, dqhash);
+       cv_destroy(&dqcv);
+       mutex_destroy(&dqlock);
+}
+
+/*
+ * Set up the quotas for an inode.
+ *
+ * This routine completely defines the semantics of quotas.
+ * If other criteria are to be used to establish quotas, the
+ * MAXQUOTAS value in quotas.h should be increased, and the
+ * additional dquots set up here.
+ */
+int
+getinoquota(struct inode *ip)
+{
+       struct ufsmount *ump = ip->i_ump;
+       struct vnode *vp = ITOV(ip);
+       int i, error;
+       u_int32_t ino_ids[MAXQUOTAS];
+
+       /*
+        * To avoid deadlocks, never update quotas for quota files
+        * on the same file system.
+        */
+       for (i = 0; i < MAXQUOTAS; i++)
+               if (vp == ump->um_quotas[i])
+                       return 0;
+
+       ino_ids[USRQUOTA] = ip->i_uid;
+       ino_ids[GRPQUOTA] = ip->i_gid;
+       for (i = 0; i < MAXQUOTAS; i++) {
+               /*
+                * If the file id changed, the quota needs updating.
+                */
+               if (ip->i_dquot[i] != NODQUOT &&
+                   ip->i_dquot[i]->dq_id != ino_ids[i]) {
+                       dqrele(ITOV(ip), ip->i_dquot[i]);
+                       ip->i_dquot[i] = NODQUOT;
+               }
+               /*
+                * Set up the quota based on file id.
+                * ENODEV means that quotas are not enabled.
+                */
+               if (ip->i_dquot[i] == NODQUOT &&
+                   (error = dqget(vp, ino_ids[i], ump, i, &ip->i_dquot[i])) &&
+                   error != ENODEV)
+                       return (error);
+       }
+       return 0;
+}
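+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * comment above getinoquota() says new quota classes are added by
+ * growing MAXQUOTAS.  A sketch of the id mapping, with a purely
+ * hypothetical project class left commented out (PRJQUOTA is an
+ * invented name, not a NetBSD constant).
+ */
+#if 0
+#include <stdint.h>
+
+#define USRQUOTA  0            /* user quotas, keyed by i_uid */
+#define GRPQUOTA  1            /* group quotas, keyed by i_gid */
+/* #define PRJQUOTA  2 */      /* hypothetical extra class */
+#define MAXQUOTAS 2            /* sizes um_quotas[], i_dquot[], ino_ids[] */
+
+static void
+inode_quota_ids(uint32_t uid, uint32_t gid, uint32_t ids[MAXQUOTAS])
+{
+       ids[USRQUOTA] = uid;
+       ids[GRPQUOTA] = gid;
+}
+#endif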
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file
+ * reading the information from the file if necessary.
+ */
+int
+dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type,
+    struct dquot **dqp)
+{
+       struct dquot *dq, *ndq;
+       struct dqhashhead *dqh;
+       struct vnode *dqvp;
+       int error = 0; /* XXX gcc */
+
+       /* Lock to see an up-to-date value for QTF_CLOSING. */
+       mutex_enter(&dqlock);
+       if ((ump->um_flags & (UFS_QUOTA|UFS_QUOTA2)) == 0) {
+               mutex_exit(&dqlock);
+               *dqp = NODQUOT;
+               return (ENODEV);
+       }
+       dqvp = ump->um_quotas[type];
+#ifdef QUOTA
+       if (ump->um_flags & UFS_QUOTA) {
+               if (dqvp == NULLVP || (ump->umq1_qflags[type] & QTF_CLOSING)) {
+                       mutex_exit(&dqlock);
+                       *dqp = NODQUOT;
+                       return (ENODEV);
+               }
+       }
+#endif
+#ifdef QUOTA2
+       if (ump->um_flags & UFS_QUOTA2) {
+               if (dqvp == NULLVP) {
+                       mutex_exit(&dqlock);
+                       *dqp = NODQUOT;
+                       return (ENODEV);
+               }
+       }
+#endif
+       KASSERT(dqvp != vp);
+       /*
+        * Check the cache first.
+        */
+       dqh = &dqhashtbl[DQHASH(dqvp, id)];
+       LIST_FOREACH(dq, dqh, dq_hash) {
+               if (dq->dq_id != id ||
+                   dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+                       continue;
+               KASSERT(dq->dq_cnt > 0);
+               dqref(dq);
+               mutex_exit(&dqlock);
+               *dqp = dq;
+               return (0);
+       }
+       /*
+        * Not in cache, allocate a new one.
+        */
+       mutex_exit(&dqlock);
+       ndq = pool_cache_get(dquot_cache, PR_WAITOK);
+       /*
+        * Initialize the contents of the dquot structure.
+        */
+       memset((char *)ndq, 0, sizeof *ndq);
+       ndq->dq_flags = 0;
+       ndq->dq_id = id;
+       ndq->dq_ump = ump;
+       ndq->dq_type = type;
+       mutex_init(&ndq->dq_interlock, MUTEX_DEFAULT, IPL_NONE);
+       mutex_enter(&dqlock);
+       dqh = &dqhashtbl[DQHASH(dqvp, id)];
+       LIST_FOREACH(dq, dqh, dq_hash) {
+               if (dq->dq_id != id ||
+                   dq->dq_ump->um_quotas[dq->dq_type] != dqvp)
+                       continue;
+               /*
+                * Another thread beat us to allocating this dquot.
+                */
+               KASSERT(dq->dq_cnt > 0);
+               dqref(dq);
+               mutex_exit(&dqlock);
+               mutex_destroy(&ndq->dq_interlock);
+               pool_cache_put(dquot_cache, ndq);
+               *dqp = dq;
+               return 0;
+       }
+       dq = ndq;
+       LIST_INSERT_HEAD(dqh, dq, dq_hash);
+       dqref(dq);
+       mutex_enter(&dq->dq_interlock);
+       mutex_exit(&dqlock);
+#ifdef QUOTA
+       if (ump->um_flags & UFS_QUOTA)
+               error = dq1get(dqvp, id, ump, type, dq);
+#endif
+#ifdef QUOTA2
+       if (ump->um_flags & UFS_QUOTA2)
+               error = dq2get(dqvp, id, ump, type, dq);
+#endif
+       /*
+        * I/O error in reading quota file, release
+        * quota structure and reflect problem to caller.
+        */
+       if (error) {
+               mutex_enter(&dqlock);
+               LIST_REMOVE(dq, dq_hash);
+               mutex_exit(&dqlock);
+               mutex_exit(&dq->dq_interlock);
+               dqrele(vp, dq);
+               *dqp = NODQUOT;
+               return (error);
+       }
+       mutex_exit(&dq->dq_interlock);
+       *dqp = dq;
+       return (0);
+}
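+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: dqget()
+ * drops dqlock to allocate, then re-checks the hash chain before
+ * inserting, discarding its allocation if another thread won the race.
+ * A compact pthread analogue of that allocate-unlocked, re-check-locked
+ * pattern; every name below is invented.
+ */
+#if 0
+#include <pthread.h>
+#include <stdlib.h>
+
+struct obj {
+       struct obj *next;
+       unsigned long id;
+       int refs;
+};
+
+static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
+static struct obj *cache;      /* a single chain, for brevity */
+
+static struct obj *
+lookup_locked(unsigned long id)
+{
+       struct obj *o;
+
+       for (o = cache; o != NULL; o = o->next)
+               if (o->id == id)
+                       return o;
+       return NULL;
+}
+
+static struct obj *
+obj_get(unsigned long id)
+{
+       struct obj *o, *n;
+
+       pthread_mutex_lock(&cache_lock);
+       if ((o = lookup_locked(id)) != NULL) {
+               o->refs++;
+               pthread_mutex_unlock(&cache_lock);
+               return o;
+       }
+       pthread_mutex_unlock(&cache_lock);
+
+       /* Allocate without the lock held... */
+       if ((n = calloc(1, sizeof(*n))) == NULL)
+               return NULL;
+       n->id = id;
+
+       pthread_mutex_lock(&cache_lock);
+       /* ...then re-check: another thread may have inserted meanwhile. */
+       if ((o = lookup_locked(id)) != NULL) {
+               o->refs++;
+               pthread_mutex_unlock(&cache_lock);
+               free(n);                /* lose the race gracefully */
+               return o;
+       }
+       n->refs = 1;
+       n->next = cache;
+       cache = n;
+       pthread_mutex_unlock(&cache_lock);
+       return n;
+}
+#endif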
+
+/*
+ * Obtain a reference to a dquot.
+ */
+void
+dqref(struct dquot *dq)
+{
+
+       KASSERT(mutex_owned(&dqlock));
+       dq->dq_cnt++;
+       KASSERT(dq->dq_cnt > 0);
+}
+
+/*
+ * Release a reference to a dquot.
+ */
+void
+dqrele(struct vnode *vp, struct dquot *dq)
+{
+
+       if (dq == NODQUOT)
+               return;
+       mutex_enter(&dq->dq_interlock);
+       for (;;) {
+               mutex_enter(&dqlock);
+               if (dq->dq_cnt > 1) {
+                       dq->dq_cnt--;
+                       mutex_exit(&dqlock);
+                       mutex_exit(&dq->dq_interlock);
+                       return;
+               }
+               if ((dq->dq_flags & DQ_MOD) == 0)
+                       break;
+               mutex_exit(&dqlock);
+#ifdef QUOTA
+               if (dq->dq_ump->um_flags & UFS_QUOTA)
+                       (void) dq1sync(vp, dq);
+#endif
+#ifdef QUOTA2
+               if (dq->dq_ump->um_flags & UFS_QUOTA2)
+                       (void) dq2sync(vp, dq);
+#endif
+       }
+       KASSERT(dq->dq_cnt == 1 && (dq->dq_flags & DQ_MOD) == 0);
+       LIST_REMOVE(dq, dq_hash);
+       mutex_exit(&dqlock);
+       mutex_exit(&dq->dq_interlock);
+       mutex_destroy(&dq->dq_interlock);
+       pool_cache_put(dquot_cache, dq);
+}
+
+int
+qsync(struct mount *mp)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+#ifdef QUOTA
+       if (ump->um_flags & UFS_QUOTA)
+               return q1sync(mp);
+#endif
+#ifdef QUOTA2
+       if (ump->um_flags & UFS_QUOTA2)
+               return q2sync(mp);
+#endif
+       return 0;
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * Check the hash chains for stray dquot's.
+ */
+void
+dqflush(struct vnode *vp)
+{
+       struct dquot *dq;
+       int i;
+
+       mutex_enter(&dqlock);
+       for (i = 0; i <= dqhash; i++)
+               LIST_FOREACH(dq, &dqhashtbl[i], dq_hash)
+                       KASSERT(dq->dq_ump->um_quotas[dq->dq_type] != vp);
+       mutex_exit(&dqlock);
+}
+#endif
diff --git a/sys/ufs/ufs/ufs_quota1.c b/sys/ufs/ufs/ufs_quota1.c
new file mode 100644 (file)
index 0000000..4fdb57c
--- /dev/null
@@ -0,0 +1,885 @@
+/*     $NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $  */
+
+/*
+ * Copyright (c) 1982, 1986, 1990, 1993, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Robert Elz at The University of Melbourne.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota1.c,v 1.6 2011/11/25 16:55:05 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kauth.h>
+
+#include <quota/quotaprop.h>
+#include <ufs/ufs/quota1.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+
+static int chkdqchg(struct inode *, int64_t, kauth_cred_t, int);
+static int chkiqchg(struct inode *, int32_t, kauth_cred_t, int);
+
+/*
+ * Update disk usage, and take corrective action.
+ */
+int
+chkdq1(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+       struct dquot *dq;
+       int i;
+       int ncurblocks, error;
+
+       if ((error = getinoquota(ip)) != 0)
+               return error;
+       if (change == 0)
+               return (0);
+       if (change < 0) {
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       if ((dq = ip->i_dquot[i]) == NODQUOT)
+                               continue;
+                       mutex_enter(&dq->dq_interlock);
+                       ncurblocks = dq->dq_curblocks + change;
+                       if (ncurblocks >= 0)
+                               dq->dq_curblocks = ncurblocks;
+                       else
+                               dq->dq_curblocks = 0;
+                       dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+                       dq->dq_flags |= DQ_MOD;
+                       mutex_exit(&dq->dq_interlock);
+               }
+               return (0);
+       }
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if ((dq = ip->i_dquot[i]) == NODQUOT)
+                       continue;
+               if ((flags & FORCE) == 0 &&
+                   kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT, KAUTH_ARG(i),
+                   KAUTH_ARG(QL_BLOCK), NULL) != 0) {
+                       mutex_enter(&dq->dq_interlock);
+                       error = chkdqchg(ip, change, cred, i);
+                       mutex_exit(&dq->dq_interlock);
+                       if (error != 0)
+                               return (error);
+               }
+       }
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if ((dq = ip->i_dquot[i]) == NODQUOT)
+                       continue;
+               mutex_enter(&dq->dq_interlock);
+               dq->dq_curblocks += change;
+               dq->dq_flags |= DQ_MOD;
+               mutex_exit(&dq->dq_interlock);
+       }
+       return (0);
+}
+
+/*
+ * Check for a valid change to a user's allocation.
+ * Issue an error message if appropriate.
+ */
+static int
+chkdqchg(struct inode *ip, int64_t change, kauth_cred_t cred, int type)
+{
+       struct dquot *dq = ip->i_dquot[type];
+       long ncurblocks = dq->dq_curblocks + change;
+
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       /*
+        * If user would exceed their hard limit, disallow space allocation.
+        */
+       if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
+               if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 &&
+                   ip->i_uid == kauth_cred_geteuid(cred)) {
+                       uprintf("\n%s: write failed, %s disk limit reached\n",
+                           ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                           quotatypes[type]);
+                       dq->dq_flags |= DQ_WARN(QL_BLOCK);
+               }
+               return (EDQUOT);
+       }
+       /*
+        * If user is over their soft limit for too long, disallow space
+        * allocation. Reset time limit as they cross their soft limit.
+        */
+       if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) {
+               if (dq->dq_curblocks < dq->dq_bsoftlimit) {
+                       dq->dq_btime =
+                           time_second + ip->i_ump->umq1_btime[type];
+                       if (ip->i_uid == kauth_cred_geteuid(cred))
+                               uprintf("\n%s: warning, %s %s\n",
+                                   ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                                   quotatypes[type], "disk quota exceeded");
+                       return (0);
+               }
+               if (time_second > dq->dq_btime) {
+                       if ((dq->dq_flags & DQ_WARN(QL_BLOCK)) == 0 &&
+                           ip->i_uid == kauth_cred_geteuid(cred)) {
+                               uprintf("\n%s: write failed, %s %s\n",
+                                   ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                                   quotatypes[type],
+                                   "disk quota exceeded for too long");
+                               dq->dq_flags |= DQ_WARN(QL_BLOCK);
+                       }
+                       return (EDQUOT);
+               }
+       }
+       return (0);
+}
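+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * block-limit policy of chkdqchg() distilled into a pure function.
+ * A hard limit always denies; crossing a soft limit starts a grace
+ * timer; an expired timer denies.  The return value 1 stands in for
+ * EDQUOT, and all parameter names are invented.
+ */
+#if 0
+#include <stdint.h>
+
+static int
+quota_block_check(int64_t cur, int64_t change, int64_t soft, int64_t hard,
+    long now, long *btime, long grace)
+{
+       int64_t next = cur + change;
+
+       if (hard != 0 && next >= hard)
+               return 1;               /* hard limit: always deny */
+       if (soft != 0 && next >= soft) {
+               if (cur < soft) {       /* just crossed: start grace */
+                       *btime = now + grace;
+                       return 0;
+               }
+               if (now > *btime)
+                       return 1;       /* grace expired: deny */
+       }
+       return 0;                       /* under limits, or still in grace */
+}
+#endif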
+
+/*
+ * Check the inode limit, applying corrective action.
+ */
+int
+chkiq1(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+       struct dquot *dq;
+       int i;
+       int ncurinodes, error;
+
+       if ((error = getinoquota(ip)) != 0)
+               return error;
+       if (change == 0)
+               return (0);
+       if (change < 0) {
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       if ((dq = ip->i_dquot[i]) == NODQUOT)
+                               continue;
+                       mutex_enter(&dq->dq_interlock);
+                       ncurinodes = dq->dq_curinodes + change;
+                       if (ncurinodes >= 0)
+                               dq->dq_curinodes = ncurinodes;
+                       else
+                               dq->dq_curinodes = 0;
+                       dq->dq_flags &= ~DQ_WARN(QL_FILE);
+                       dq->dq_flags |= DQ_MOD;
+                       mutex_exit(&dq->dq_interlock);
+               }
+               return (0);
+       }
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if ((dq = ip->i_dquot[i]) == NODQUOT)
+                       continue;
+               if ((flags & FORCE) == 0 && kauth_authorize_system(cred,
+                   KAUTH_SYSTEM_FS_QUOTA, KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT,
+                   KAUTH_ARG(i), KAUTH_ARG(QL_FILE), NULL) != 0) {
+                       mutex_enter(&dq->dq_interlock);
+                       error = chkiqchg(ip, change, cred, i);
+                       mutex_exit(&dq->dq_interlock);
+                       if (error != 0)
+                               return (error);
+               }
+       }
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if ((dq = ip->i_dquot[i]) == NODQUOT)
+                       continue;
+               mutex_enter(&dq->dq_interlock);
+               dq->dq_curinodes += change;
+               dq->dq_flags |= DQ_MOD;
+               mutex_exit(&dq->dq_interlock);
+       }
+       return (0);
+}
+
+/*
+ * Check for a valid change to a user's allocation.
+ * Issue an error message if appropriate.
+ */
+static int
+chkiqchg(struct inode *ip, int32_t change, kauth_cred_t cred, int type)
+{
+       struct dquot *dq = ip->i_dquot[type];
+       long ncurinodes = dq->dq_curinodes + change;
+
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       /*
+        * If user would exceed their hard limit, disallow inode allocation.
+        */
+       if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
+               if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 &&
+                   ip->i_uid == kauth_cred_geteuid(cred)) {
+                       uprintf("\n%s: write failed, %s inode limit reached\n",
+                           ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                           quotatypes[type]);
+                       dq->dq_flags |= DQ_WARN(QL_FILE);
+               }
+               return (EDQUOT);
+       }
+       /*
+        * If user is over their soft limit for too long, disallow inode
+        * allocation. Reset time limit as they cross their soft limit.
+        */
+       if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) {
+               if (dq->dq_curinodes < dq->dq_isoftlimit) {
+                       dq->dq_itime =
+                           time_second + ip->i_ump->umq1_itime[type];
+                       if (ip->i_uid == kauth_cred_geteuid(cred))
+                               uprintf("\n%s: warning, %s %s\n",
+                                   ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                                   quotatypes[type], "inode quota exceeded");
+                       return (0);
+               }
+               if (time_second > dq->dq_itime) {
+                       if ((dq->dq_flags & DQ_WARN(QL_FILE)) == 0 &&
+                           ip->i_uid == kauth_cred_geteuid(cred)) {
+                               uprintf("\n%s: write failed, %s %s\n",
+                                   ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+                                   quotatypes[type],
+                                   "inode quota exceeded for too long");
+                               dq->dq_flags |= DQ_WARN(QL_FILE);
+                       }
+                       return (EDQUOT);
+               }
+       }
+       return (0);
+}
+
+int
+quota1_umount(struct mount *mp, int flags)
+{
+       int i, error;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct lwp *l = curlwp;
+
+       if ((ump->um_flags & UFS_QUOTA) == 0)
+               return 0;
+
+       if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0)
+               return (error);
+
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if (ump->um_quotas[i] != NULLVP) {
+                       quota1_handle_cmd_quotaoff(l, ump, i);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Code to process quotactl commands.
+ */
+
+/*
+ * set up a quota file for a particular file system.
+ */
+int
+quota1_handle_cmd_quotaon(struct lwp *l, struct ufsmount *ump, int type,
+    const char *fname)
+{
+       struct mount *mp = ump->um_mountp;
+       struct vnode *vp, **vpp, *mvp;
+       struct dquot *dq;
+       int error;
+       struct pathbuf *pb;
+       struct nameidata nd;
+
+       if (ump->um_flags & UFS_QUOTA2) {
+               uprintf("%s: quotas v2 already enabled\n",
+                   mp->mnt_stat.f_mntonname);
+               return (EBUSY);
+       }
+               
+       if (mp->mnt_wapbl != NULL) {
+               printf("%s: quota v1 cannot be used with -o log\n",
+                   mp->mnt_stat.f_mntonname);
+               return (EOPNOTSUPP);
+       }
+
+       vpp = &ump->um_quotas[type];
+
+       pb = pathbuf_create(fname);
+       if (pb == NULL) {
+               return ENOMEM;
+       }
+       NDINIT(&nd, LOOKUP, FOLLOW, pb);
+       if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
+               pathbuf_destroy(pb);
+               return error;
+       }
+       vp = nd.ni_vp;
+       pathbuf_destroy(pb);
+
+       VOP_UNLOCK(vp);
+       if (vp->v_type != VREG) {
+               (void) vn_close(vp, FREAD|FWRITE, l->l_cred);
+               return (EACCES);
+       }
+       if (*vpp != vp)
+               quota1_handle_cmd_quotaoff(l, ump, type);
+       mutex_enter(&dqlock);
+       while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0)
+               cv_wait(&dqcv, &dqlock);
+       ump->umq1_qflags[type] |= QTF_OPENING;
+       mutex_exit(&dqlock);
+       mp->mnt_flag |= MNT_QUOTA;
+       vp->v_vflag |= VV_SYSTEM;       /* XXXSMP */
+       *vpp = vp;
+       /*
+        * Save the credential of the process that turned on quotas.
+        * Set up the time limits for this quota.
+        */
+       kauth_cred_hold(l->l_cred);
+       ump->um_cred[type] = l->l_cred;
+       ump->umq1_btime[type] = MAX_DQ_TIME;
+       ump->umq1_itime[type] = MAX_IQ_TIME;
+       if (dqget(NULLVP, 0, ump, type, &dq) == 0) {
+               if (dq->dq_btime > 0)
+                       ump->umq1_btime[type] = dq->dq_btime;
+               if (dq->dq_itime > 0)
+                       ump->umq1_itime[type] = dq->dq_itime;
+               dqrele(NULLVP, dq);
+       }
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+       /*
+        * Search vnodes associated with this mount point,
+        * adding references to quota file being opened.
+        * NB: only need to add dquot's for inodes being modified.
+        */
+       mutex_enter(&mntvnode_lock);
+again:
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               mutex_enter(vp->v_interlock);
+               if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+                   vp->v_type == VNON || vp->v_writecount == 0 ||
+                   (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               mutex_exit(&mntvnode_lock);
+               if (vget(vp, LK_EXCLUSIVE)) {
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       goto again;
+               }
+               if ((error = getinoquota(VTOI(vp))) != 0) {
+                       vput(vp);
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       break;
+               }
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       vnfree(mvp);
+
+       mutex_enter(&dqlock);
+       ump->umq1_qflags[type] &= ~QTF_OPENING;
+       cv_broadcast(&dqcv);
+       if (error == 0)
+               ump->um_flags |= UFS_QUOTA;
+       mutex_exit(&dqlock);
+       if (error)
+               quota1_handle_cmd_quotaoff(l, ump, type);
+       return (error);
+}
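+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * vmark()/vunmark() idiom used above, loosely restated in userland
+ * terms.  A marker node is parked after the current element before the
+ * list lock is dropped, so the walk can resume even if neighbours were
+ * added or removed meanwhile.  Names are invented, and the lock
+ * acquire/release points are left as comments.
+ */
+#if 0
+#include <stdbool.h>
+#include <sys/queue.h>
+
+struct node {
+       TAILQ_ENTRY(node) entries;
+       bool marker;
+};
+TAILQ_HEAD(nlist, node);
+
+static void
+walk_with_marker(struct nlist *list, struct node *mark,
+    void (*process)(struct node *))
+{
+       struct node *n, *next;
+
+       mark->marker = true;
+       /* list lock held from here on, except around process() */
+       for (n = TAILQ_FIRST(list); n != NULL; n = next) {
+               if (n->marker) {        /* skip other walkers' markers */
+                       next = TAILQ_NEXT(n, entries);
+                       continue;
+               }
+               TAILQ_INSERT_AFTER(list, n, mark, entries);
+               /* ... drop list lock ... */
+               process(n);
+               /* ... retake list lock ... */
+               next = TAILQ_NEXT(mark, entries);
+               TAILQ_REMOVE(list, mark, entries);
+       }
+}
+#endif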
+
+/*
+ * turn off disk quotas for a filesystem.
+ */
+int
+quota1_handle_cmd_quotaoff(struct lwp *l, struct ufsmount *ump, int type)
+{
+       struct mount *mp = ump->um_mountp;
+       struct vnode *vp;
+       struct vnode *qvp, *mvp;
+       struct dquot *dq;
+       struct inode *ip;
+       kauth_cred_t cred;
+       int i, error;
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+
+       mutex_enter(&dqlock);
+       while ((ump->umq1_qflags[type] & (QTF_CLOSING | QTF_OPENING)) != 0)
+               cv_wait(&dqcv, &dqlock);
+       if ((qvp = ump->um_quotas[type]) == NULLVP) {
+               mutex_exit(&dqlock);
+               vnfree(mvp);
+               return (0);
+       }
+       ump->umq1_qflags[type] |= QTF_CLOSING;
+       ump->um_flags &= ~UFS_QUOTA;
+       mutex_exit(&dqlock);
+       /*
+        * Search vnodes associated with this mount point,
+        * deleting any references to quota file being closed.
+        */
+       mutex_enter(&mntvnode_lock);
+again:
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               mutex_enter(vp->v_interlock);
+               if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+                   vp->v_type == VNON ||
+                   (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               mutex_exit(&mntvnode_lock);
+               if (vget(vp, LK_EXCLUSIVE)) {
+                       mutex_enter(&mntvnode_lock);
+                       (void)vunmark(mvp);
+                       goto again;
+               }
+               ip = VTOI(vp);
+               dq = ip->i_dquot[type];
+               ip->i_dquot[type] = NODQUOT;
+               dqrele(vp, dq);
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+#ifdef DIAGNOSTIC
+       dqflush(qvp);
+#endif
+       qvp->v_vflag &= ~VV_SYSTEM;
+       error = vn_close(qvp, FREAD|FWRITE, l->l_cred);
+       mutex_enter(&dqlock);
+       ump->um_quotas[type] = NULLVP;
+       cred = ump->um_cred[type];
+       ump->um_cred[type] = NOCRED;
+       for (i = 0; i < MAXQUOTAS; i++)
+               if (ump->um_quotas[i] != NULLVP)
+                       break;
+       ump->umq1_qflags[type] &= ~QTF_CLOSING;
+       cv_broadcast(&dqcv);
+       mutex_exit(&dqlock);
+       kauth_cred_free(cred);
+       if (i == MAXQUOTAS)
+               mp->mnt_flag &= ~MNT_QUOTA;
+       return (error);
+}
+
+int             
+quota1_handle_cmd_get(struct ufsmount *ump, int type, int id,
+    int defaultq, prop_array_t replies)
+{
+       struct dquot *dq;
+       struct quotaval qv[QUOTA_NLIMITS];
+       prop_dictionary_t dict;
+       int error;
+       uint64_t *valuesp[QUOTA_NLIMITS];
+       valuesp[QUOTA_LIMIT_BLOCK] = &qv[QUOTA_LIMIT_BLOCK].qv_hardlimit;
+       valuesp[QUOTA_LIMIT_FILE] = &qv[QUOTA_LIMIT_FILE].qv_hardlimit;
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+
+       if (defaultq) { /* we want the grace period of id 0 */
+               if ((error = dqget(NULLVP, 0, ump, type, &dq)) != 0)
+                       return error;
+
+       } else {
+               if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+                       return error;
+       }
+       dqblk_to_quotaval(&dq->dq_un.dq1_dqb, qv);
+       dqrele(NULLVP, dq);
+       if (defaultq) {
+               if (qv[QUOTA_LIMIT_BLOCK].qv_expiretime > 0)
+                       qv[QUOTA_LIMIT_BLOCK].qv_grace =
+                           qv[QUOTA_LIMIT_BLOCK].qv_expiretime;
+               else
+                       qv[QUOTA_LIMIT_BLOCK].qv_grace = MAX_DQ_TIME;
+               if (qv[QUOTA_LIMIT_FILE].qv_expiretime > 0)
+                       qv[QUOTA_LIMIT_FILE].qv_grace =
+                           qv[QUOTA_LIMIT_FILE].qv_expiretime;
+               else
+                       qv[QUOTA_LIMIT_FILE].qv_grace = MAX_DQ_TIME;
+       }
+       dict = quota64toprop(id, defaultq, valuesp,
+           ufs_quota_entry_names, UFS_QUOTA_NENTRIES,
+           ufs_quota_limit_names, QUOTA_NLIMITS);
+       if (dict == NULL)
+               return ENOMEM;
+       if (!prop_array_add_and_rel(replies, dict))
+               return ENOMEM;
+       return 0;
+}
+
+int
+quota1_handle_cmd_set(struct ufsmount *ump, int type, int id,
+    int defaultq, prop_dictionary_t data)
+{
+       struct dquot *dq;
+       struct dqblk dqb;
+       int error;
+       uint64_t bval[2];
+       uint64_t ival[2];
+       const char *val_limitsonly_grace[] = {QUOTADICT_LIMIT_GTIME};
+#define Q1_GTIME 0
+       const char *val_limitsonly_softhard[] =
+           {QUOTADICT_LIMIT_SOFT, QUOTADICT_LIMIT_HARD};
+#define Q1_SOFT 0
+#define Q1_HARD 1
+
+       uint64_t *valuesp[QUOTA_NLIMITS];
+       valuesp[QUOTA_LIMIT_BLOCK] = bval;
+       valuesp[QUOTA_LIMIT_FILE] = ival;
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+
+       if (defaultq) {
+               /* just update grace times */
+               error = proptoquota64(data, valuesp, val_limitsonly_grace, 1,
+                   ufs_quota_limit_names, QUOTA_NLIMITS);
+               if (error)
+                       return error;
+               if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+                       return error;
+               mutex_enter(&dq->dq_interlock);
+               if (bval[Q1_GTIME] > 0)
+                       ump->umq1_btime[type] = dq->dq_btime =
+                           bval[Q1_GTIME];
+               if (ival[Q1_GTIME] > 0)
+                       ump->umq1_itime[type] = dq->dq_itime =
+                           ival[Q1_GTIME];
+               mutex_exit(&dq->dq_interlock);
+               dq->dq_flags |= DQ_MOD;
+               dqrele(NULLVP, dq);
+               return 0;
+       }
+       error = proptoquota64(data, valuesp, val_limitsonly_softhard, 2,
+           ufs_quota_limit_names, QUOTA_NLIMITS);
+       if (error)
+               return error;
+
+       if ((error = dqget(NULLVP, id, ump, type, &dq)) != 0)
+               return (error);
+       mutex_enter(&dq->dq_interlock);
+       /*
+        * Copy all but the current values.
+        * Reset time limit if previously had no soft limit or were
+        * under it, but now have a soft limit and are over it.
+        */
+       dqb.dqb_curblocks = dq->dq_curblocks;
+       dqb.dqb_curinodes = dq->dq_curinodes;
+       dqb.dqb_btime = dq->dq_btime;
+       dqb.dqb_itime = dq->dq_itime;
+       dqb.dqb_bsoftlimit = (bval[Q1_SOFT] == UQUAD_MAX) ? 0 : bval[Q1_SOFT];
+       dqb.dqb_bhardlimit = (bval[Q1_HARD] == UQUAD_MAX) ? 0 : bval[Q1_HARD];
+       dqb.dqb_isoftlimit = (ival[Q1_SOFT] == UQUAD_MAX) ? 0 : ival[Q1_SOFT];
+       dqb.dqb_ihardlimit = (ival[Q1_HARD] == UQUAD_MAX) ? 0 : ival[Q1_HARD];
+       if (dq->dq_id == 0) {
+               /* also update grace time if available */
+               if (proptoquota64(data, valuesp, val_limitsonly_grace, 1,
+                   ufs_quota_limit_names, QUOTA_NLIMITS) == 0) {
+                       if (bval[Q1_GTIME] > 0)
+                               ump->umq1_btime[type] = dqb.dqb_btime =
+                                   bval[Q1_GTIME];
+                       if (ival[Q1_GTIME] > 0)
+                               ump->umq1_itime[type] = dqb.dqb_itime =
+                                   ival[Q1_GTIME];
+               }
+       }
+       if (dqb.dqb_bsoftlimit &&
+           dq->dq_curblocks >= dqb.dqb_bsoftlimit &&
+           (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
+               dqb.dqb_btime = time_second + ump->umq1_btime[type];
+       if (dqb.dqb_isoftlimit &&
+           dq->dq_curinodes >= dqb.dqb_isoftlimit &&
+           (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
+               dqb.dqb_itime = time_second + ump->umq1_itime[type];
+       dq->dq_un.dq1_dqb = dqb;
+       if (dq->dq_curblocks < dq->dq_bsoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+       if (dq->dq_curinodes < dq->dq_isoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_FILE);
+       if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+           dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+               dq->dq_flags |= DQ_FAKE;
+       else
+               dq->dq_flags &= ~DQ_FAKE;
+       dq->dq_flags |= DQ_MOD;
+       mutex_exit(&dq->dq_interlock);
+       dqrele(NULLVP, dq);
+       return (0);
+}
+
+#if 0
+/*
+ * Q_SETQUOTA - assign an entire dqblk structure.
+ */
+int
+setquota1(struct mount *mp, u_long id, int type, struct dqblk *dqb)
+{
+       struct dquot *dq;
+       struct dquot *ndq;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       int error;
+
+       if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0)
+               return (error);
+       dq = ndq;
+       mutex_enter(&dq->dq_interlock);
+       /*
+        * Copy all but the current values.
+        * Reset time limit if previously had no soft limit or were
+        * under it, but now have a soft limit and are over it.
+        */
+       dqb->dqb_curblocks = dq->dq_curblocks;
+       dqb->dqb_curinodes = dq->dq_curinodes;
+       if (dq->dq_id != 0) {
+               dqb->dqb_btime = dq->dq_btime;
+               dqb->dqb_itime = dq->dq_itime;
+       }
+       if (dqb->dqb_bsoftlimit &&
+           dq->dq_curblocks >= dqb->dqb_bsoftlimit &&
+           (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit))
+               dqb->dqb_btime = time_second + ump->umq1_btime[type];
+       if (dqb->dqb_isoftlimit &&
+           dq->dq_curinodes >= dqb->dqb_isoftlimit &&
+           (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit))
+               dqb->dqb_itime = time_second + ump->umq1_itime[type];
+       dq->dq_un.dq1_dqb = *dqb;
+       if (dq->dq_curblocks < dq->dq_bsoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+       if (dq->dq_curinodes < dq->dq_isoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_FILE);
+       if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+           dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+               dq->dq_flags |= DQ_FAKE;
+       else
+               dq->dq_flags &= ~DQ_FAKE;
+       dq->dq_flags |= DQ_MOD;
+       mutex_exit(&dq->dq_interlock);
+       dqrele(NULLVP, dq);
+       return (0);
+}
+
+/*
+ * Q_SETUSE - set current inode and block usage.
+ */
+int
+setuse(struct mount *mp, u_long id, int type, void *addr)
+{
+       struct dquot *dq;
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct dquot *ndq;
+       struct dqblk usage;
+       int error;
+
+       error = copyin(addr, (void *)&usage, sizeof (struct dqblk));
+       if (error)
+               return (error);
+       if ((error = dqget(NULLVP, id, ump, type, &ndq)) != 0)
+               return (error);
+       dq = ndq;
+       mutex_enter(&dq->dq_interlock);
+       /*
+        * Reset time limit if have a soft limit and were
+        * previously under it, but are now over it.
+        */
+       if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit &&
+           usage.dqb_curblocks >= dq->dq_bsoftlimit)
+               dq->dq_btime = time_second + ump->umq1_btime[type];
+       if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit &&
+           usage.dqb_curinodes >= dq->dq_isoftlimit)
+               dq->dq_itime = time_second + ump->umq1_itime[type];
+       dq->dq_curblocks = usage.dqb_curblocks;
+       dq->dq_curinodes = usage.dqb_curinodes;
+       if (dq->dq_curblocks < dq->dq_bsoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_BLOCK);
+       if (dq->dq_curinodes < dq->dq_isoftlimit)
+               dq->dq_flags &= ~DQ_WARN(QL_FILE);
+       dq->dq_flags |= DQ_MOD;
+       mutex_exit(&dq->dq_interlock);
+       dqrele(NULLVP, dq);
+       return (0);
+}
+#endif
+
+/*
+ * Q_SYNC - sync quota files to disk.
+ */
+int
+q1sync(struct mount *mp)
+{
+       struct ufsmount *ump = VFSTOUFS(mp);
+       struct vnode *vp, *mvp;
+       struct dquot *dq;
+       int i, error;
+
+       /*
+        * Check if the mount point has any quotas.
+        * If not, simply return.
+        */
+       for (i = 0; i < MAXQUOTAS; i++)
+               if (ump->um_quotas[i] != NULLVP)
+                       break;
+       if (i == MAXQUOTAS)
+               return (0);
+
+       /* Allocate a marker vnode. */
+       mvp = vnalloc(mp);
+
+       /*
+        * Search vnodes associated with this mount point,
+        * synchronizing any modified dquot structures.
+        */
+       mutex_enter(&mntvnode_lock);
+ again:
+       for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
+               vmark(mvp, vp);
+               mutex_enter(vp->v_interlock);
+               if (VTOI(vp) == NULL || vp->v_mount != mp || vismarker(vp) ||
+                   vp->v_type == VNON ||
+                   (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0) {
+                       mutex_exit(vp->v_interlock);
+                       continue;
+               }
+               mutex_exit(&mntvnode_lock);
+               error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
+               if (error) {
+                       mutex_enter(&mntvnode_lock);
+                       if (error == ENOENT) {
+                               (void)vunmark(mvp);
+                               goto again;
+                       }
+                       continue;
+               }
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       dq = VTOI(vp)->i_dquot[i];
+                       if (dq == NODQUOT)
+                               continue;
+                       mutex_enter(&dq->dq_interlock);
+                       if (dq->dq_flags & DQ_MOD)
+                               dq1sync(vp, dq);
+                       mutex_exit(&dq->dq_interlock);
+               }
+               vput(vp);
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+       vnfree(mvp);
+       return (0);
+}
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file
+ * reading the information from the file if necessary.
+ */
+int
+dq1get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type,
+    struct dquot *dq)
+{
+       struct iovec aiov;
+       struct uio auio;
+       int error;
+
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       aiov.iov_base = (void *)&dq->dq_un.dq1_dqb;
+       aiov.iov_len = sizeof (struct dqblk);
+       auio.uio_resid = sizeof (struct dqblk);
+       auio.uio_offset = (off_t)(id * sizeof (struct dqblk));
+       auio.uio_rw = UIO_READ;
+       UIO_SETUP_SYSSPACE(&auio);
+       error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]);
+       if (auio.uio_resid == sizeof(struct dqblk) && error == 0)
+               memset((void *)&dq->dq_un.dq1_dqb, 0, sizeof(struct dqblk));
+       VOP_UNLOCK(dqvp);
+       /*
+        * I/O error in reading quota file, release
+        * quota structure and reflect problem to caller.
+        */
+       if (error)
+               return (error);
+       /*
+        * Check for no limit to enforce.
+        * Initialize time values if necessary.
+        */
+       if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 &&
+           dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0)
+               dq->dq_flags |= DQ_FAKE;
+       if (dq->dq_id != 0) {
+               if (dq->dq_btime == 0)
+                       dq->dq_btime = time_second + ump->umq1_btime[type];
+               if (dq->dq_itime == 0)
+                       dq->dq_itime = time_second + ump->umq1_itime[type];
+       }
+       return (0);
+}
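+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the v1
+ * quota file read above is a bare array, so the entry for id N lives
+ * at byte offset N * sizeof(struct dqblk), and a short read past EOF
+ * is treated as an all-zero entry.  The struct below is a stand-in
+ * with the classic 32-byte layout; the real definition lives in
+ * quota1.h.
+ */
+#if 0
+#include <stdint.h>
+#include <stdio.h>
+
+struct dqblk_sketch {
+       uint32_t dqb_bhardlimit, dqb_bsoftlimit, dqb_curblocks;
+       uint32_t dqb_ihardlimit, dqb_isoftlimit, dqb_curinodes;
+       int32_t  dqb_btime, dqb_itime;
+};
+
+int
+main(void)
+{
+       unsigned long id;
+
+       for (id = 0; id < 3; id++)
+               printf("id %lu -> offset %zu\n", id,
+                   id * sizeof(struct dqblk_sketch));
+       return 0;
+}
+#endif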
+
+/*
+ * Update the disk quota in the quota file.
+ */
+int
+dq1sync(struct vnode *vp, struct dquot *dq)
+{
+       struct vnode *dqvp;
+       struct iovec aiov;
+       struct uio auio;
+       int error;
+
+       if (dq == NODQUOT)
+               panic("dq1sync: dquot");
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       if ((dq->dq_flags & DQ_MOD) == 0)
+               return (0);
+       if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP)
+               panic("dq1sync: file");
+       KASSERT(dqvp != vp);
+       vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY);
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       aiov.iov_base = (void *)&dq->dq_un.dq1_dqb;
+       aiov.iov_len = sizeof (struct dqblk);
+       auio.uio_resid = sizeof (struct dqblk);
+       auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk));
+       auio.uio_rw = UIO_WRITE;
+       UIO_SETUP_SYSSPACE(&auio);
+       error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]);
+       if (auio.uio_resid && error == 0)
+               error = EIO;
+       dq->dq_flags &= ~DQ_MOD;
+       VOP_UNLOCK(dqvp);
+       return (error);
+}
diff --git a/sys/ufs/ufs/ufs_quota2.c b/sys/ufs/ufs/ufs_quota2.c
new file mode 100644 (file)
index 0000000..823e398
--- /dev/null
@@ -0,0 +1,1012 @@
+/* $NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $ */
+/*-
+  * Copyright (c) 2010 Manuel Bouyer
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  * 1. Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  * 2. Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  * POSSIBILITY OF SUCH DAMAGE.
+  */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_quota2.c,v 1.4 2011/06/07 14:56:13 bouyer Exp $");
+
+#include <sys/buf.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/quota2.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_quota.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <quota/quotaprop.h>
+
+/*
+ * LOCKING:
+ * Data in the entries are protected by the associated struct dquot's
+ * dq_interlock (this means we can't read or change a quota entry without
+ * grabbing a dquot for it).
+ * The header and lists (including pointers in the data entries, and q2e_uid)
+ * are protected by the global dqlock.
+ * The locking order is dq_interlock -> dqlock.
+ */
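+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * dq_interlock -> dqlock ordering above, restated with pthreads.  With
+ * only two locks, a single global acquisition order is enough to make
+ * deadlock impossible; all names are invented.
+ */
+#if 0
+#include <pthread.h>
+
+static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t list_lock  = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+update_entry_and_lists(void)
+{
+       pthread_mutex_lock(&entry_lock);        /* per-entry lock first */
+       pthread_mutex_lock(&list_lock);         /* global lock second */
+       /* ... touch the entry and the header/lists ... */
+       pthread_mutex_unlock(&list_lock);
+       pthread_mutex_unlock(&entry_lock);
+}
+#endif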
+
+static int quota2_bwrite(struct mount *, struct buf *);
+static int getinoquota2(struct inode *, bool, bool, struct buf **,
+    struct quota2_entry **);
+static int getq2h(struct ufsmount *, int, struct buf **,
+    struct quota2_header **, int);
+static int getq2e(struct ufsmount *, int, daddr_t, int, struct buf **,
+    struct quota2_entry **, int);
+static int quota2_walk_list(struct ufsmount *, struct buf *, int,
+    uint64_t *, int, void *,
+    int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *,
+      uint64_t, void *));
+
+static int quota2_dict_update_q2e_limits(prop_dictionary_t,
+    struct quota2_entry *);
+static prop_dictionary_t q2etoprop(struct quota2_entry *, int);
+
+static const char *limnames[] = INITQLNAMES;
+
+static int
+quota2_dict_update_q2e_limits(prop_dictionary_t data,
+    struct quota2_entry *q2e)
+{
+       const char *val_limitsonly_names[] = INITQVNAMES_LIMITSONLY;
+
+       int i, error;
+       prop_dictionary_t val;
+
+       for (i = 0; i < N_QL; i++) {
+               if (!prop_dictionary_get_dict(data, limnames[i], &val))
+                       return EINVAL;
+               error = quotaprop_dict_get_uint64(val,
+                   &q2e->q2e_val[i].q2v_hardlimit,
+                   val_limitsonly_names, N_QV, true);
+               if (error)
+                       return error;
+       }
+       return 0;
+}
+
+static prop_dictionary_t
+q2etoprop(struct quota2_entry *q2e, int def)
+{
+       const char *val_names[] = INITQVNAMES_ALL;
+       prop_dictionary_t dict1 = prop_dictionary_create();
+       prop_dictionary_t dict2;
+       int i;
+
+       if (dict1 == NULL)
+               return NULL;
+
+       if (def) {
+               if (!prop_dictionary_set_cstring_nocopy(dict1, "id",
+                   "default")) {
+                       goto err;
+               }
+       } else {
+               if (!prop_dictionary_set_uint32(dict1, "id", q2e->q2e_uid)) {
+                       goto err;
+               }
+       }
+       for (i = 0; i < N_QL; i++) {
+               dict2 = limits64toprop(&q2e->q2e_val[i].q2v_hardlimit,
+                   val_names, N_QV);
+               if (dict2 == NULL)
+                       goto err;
+               if (!prop_dictionary_set_and_rel(dict1, limnames[i], dict2))
+                       goto err;
+       }
+       return dict1;
+
+err:
+       prop_object_release(dict1);
+       return NULL;
+}
+
+static int
+quota2_bwrite(struct mount *mp, struct buf *bp)
+{
+       if (mp->mnt_flag & MNT_SYNCHRONOUS)
+               return bwrite(bp);
+       else {
+               bdwrite(bp);
+               return 0;
+       }
+}
+
+static int
+getq2h(struct ufsmount *ump, int type,
+    struct buf **bpp, struct quota2_header **q2hp, int flags)
+{
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+       int error;
+       struct buf *bp;
+       struct quota2_header *q2h;
+
+       KASSERT(mutex_owned(&dqlock));
+       error = bread(ump->um_quotas[type], 0, ump->umq2_bsize,
+           ump->um_cred[type], flags, &bp);
+       if (error)
+               return error;
+       if (bp->b_resid != 0)
+               panic("getq2h: %s quota file truncated", quotatypes[type]);
+
+       q2h = (void *)bp->b_data;
+       if (ufs_rw32(q2h->q2h_magic_number, needswap) != Q2_HEAD_MAGIC ||
+           q2h->q2h_type != type)
+               panic("dq2get: corrupted %s quota header", quotatypes[type]);
+       *bpp = bp;
+       *q2hp = q2h;
+       return 0;
+}
+
+static int
+getq2e(struct ufsmount *ump, int type, daddr_t lblkno, int blkoffset,
+    struct buf **bpp, struct quota2_entry **q2ep, int flags)
+{
+       int error;
+       struct buf *bp;
+
+       if (blkoffset & (sizeof(uint64_t) - 1)) {
+               panic("dq2get: %s quota file corrupted",
+                   quotatypes[type]);
+       }
+       error = bread(ump->um_quotas[type], lblkno, ump->umq2_bsize,
+           ump->um_cred[type], flags, &bp);
+       if (error)
+               return error;
+       if (bp->b_resid != 0) {
+               panic("dq2get: %s quota file corrupted",
+                   quotatypes[type]);
+       }
+       *q2ep = (void *)((char *)bp->b_data + blkoffset);
+       *bpp = bp;
+       return 0;
+}
+
+/* walk a quota entry list, calling the callback for each entry */
+#define Q2WL_ABORT 0x10000000
+
+static int
+quota2_walk_list(struct ufsmount *ump, struct buf *hbp, int type,
+    uint64_t *offp, int flags, void *a,
+    int (*func)(struct ufsmount *, uint64_t *, struct quota2_entry *, uint64_t, void *))
+{
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+       daddr_t off = ufs_rw64(*offp, needswap);
+       struct buf *bp, *obp = hbp;
+       int ret = 0, ret2 = 0;
+       struct quota2_entry *q2e;
+       daddr_t lblkno, blkoff, olblkno = 0;
+
+       KASSERT(mutex_owned(&dqlock));
+
+       while (off != 0) {
+               lblkno = (off >> ump->um_mountp->mnt_fs_bshift);
+               blkoff = (off & ump->umq2_bmask);
+               if (lblkno == 0) {
+                       /* in the header block */
+                       bp = hbp;
+               } else if (lblkno == olblkno) {
+                       /* still in the same buf */
+                       bp = obp;
+               } else {
+                       ret = bread(ump->um_quotas[type], lblkno, 
+                           ump->umq2_bsize,
+                           ump->um_cred[type], flags, &bp);
+                       if (ret)
+                               return ret;
+                       if (bp->b_resid != 0) {
+                               panic("quota2_walk_list: %s quota file corrupted",
+                                   quotatypes[type]);
+                       }
+               }
+               q2e = (void *)((char *)(bp->b_data) + blkoff);
+               ret = (*func)(ump, offp, q2e, off, a);
+               if (off != ufs_rw64(*offp, needswap)) {
+                       /* callback changed parent's pointer, redo */
+                       off = ufs_rw64(*offp, needswap);
+                       if (bp != hbp && bp != obp)
+                               ret2 = bwrite(bp);
+               } else {
+                       /* parent is now current */
+                       if (obp != bp && obp != hbp) {
+                               if (flags & B_MODIFY)
+                                       ret2 = bwrite(obp);
+                               else
+                                       brelse(obp, 0);
+                       }
+                       obp = bp;
+                       olblkno = lblkno;
+                       offp = &(q2e->q2e_next);
+                       off = ufs_rw64(*offp, needswap);
+               }
+               if (ret)
+                       break;
+               if (ret2) {
+                       ret = ret2;
+                       break;
+               }
+       }
+       if (obp != hbp) {
+               if (flags & B_MODIFY)
+                       ret2 = bwrite(obp);
+               else
+                       brelse(obp, 0);
+       }
+       if (ret & Q2WL_ABORT)
+               return 0;
+       if (ret == 0)
+               return ret2;
+       return ret;
+}
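+
+/*
+ * Editor's illustration, not part of the imported NetBSD code: the
+ * quota2 list pointers walked above are byte offsets into the quota
+ * file, split into (logical block, offset within block) with the
+ * mount's block shift and mask.  The 8 KB block size below is only an
+ * example value standing in for ump->umq2_bsize.
+ */
+#if 0
+#include <stdint.h>
+
+#define Q2_BSIZE  8192
+#define Q2_BSHIFT 13                   /* log2(Q2_BSIZE) */
+#define Q2_BMASK  (Q2_BSIZE - 1)
+
+static void
+split_offset(uint64_t off, uint64_t *lblkno, uint64_t *blkoff)
+{
+       *lblkno = off >> Q2_BSHIFT;     /* which block to bread() */
+       *blkoff = off & Q2_BMASK;       /* where the entry sits in it */
+}
+#endif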
+
+int
+quota2_umount(struct mount *mp, int flags)
+{
+       int i, error;
+       struct ufsmount *ump = VFSTOUFS(mp);
+
+       if ((ump->um_flags & UFS_QUOTA2) == 0)
+               return 0;
+
+       for (i = 0; i < MAXQUOTAS; i++) {
+               if (ump->um_quotas[i] != NULLVP) {
+                       error = vn_close(ump->um_quotas[i], FREAD|FWRITE,
+                           ump->um_cred[i]);
+                       if (error) {
+                               printf("quota2_umount failed: close(%p) %d\n",
+                                   ump->um_quotas[i], error);
+                               return error;
+                       }
+               }
+               ump->um_quotas[i] = NULLVP;
+       }
+       return 0;
+}
+
+static int 
+quota2_q2ealloc(struct ufsmount *ump, int type, uid_t uid, struct dquot *dq,
+    struct buf **bpp, struct quota2_entry **q2ep)
+{
+       int error, error2;
+       struct buf *hbp, *bp;
+       struct quota2_header *q2h;
+       struct quota2_entry *q2e;
+       daddr_t offset;
+       u_long hash_mask;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       KASSERT(mutex_owned(&dqlock));
+       error = getq2h(ump, type, &hbp, &q2h, B_MODIFY);
+       if (error)
+               return error;
+       offset = ufs_rw64(q2h->q2h_free, needswap);
+       if (offset == 0) {
+               struct vnode *vp = ump->um_quotas[type];
+               struct inode *ip = VTOI(vp);
+               uint64_t size = ip->i_size;
+               /* need to allocate a new disk block */
+               error = UFS_BALLOC(vp, size, ump->umq2_bsize,
+                   ump->um_cred[type], B_CLRBUF | B_SYNC, &bp);
+               if (error) {
+                       brelse(hbp, 0);
+                       return error;
+               }
+               KASSERT((ip->i_size % ump->umq2_bsize) == 0);
+               ip->i_size += ump->umq2_bsize;
+               DIP_ASSIGN(ip, size, ip->i_size);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               uvm_vnp_setsize(vp, ip->i_size);
+               quota2_addfreeq2e(q2h, bp->b_data, size, ump->umq2_bsize,
+                   needswap);
+               error = bwrite(bp);
+               error2 = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+               if (error || error2) {
+                       brelse(hbp, 0);
+                       if (error)
+                               return error;
+                       return error2;
+               }
+               offset = ufs_rw64(q2h->q2h_free, needswap);
+               KASSERT(offset != 0);
+       }
+       dq->dq2_lblkno = (offset >> ump->um_mountp->mnt_fs_bshift);
+       dq->dq2_blkoff = (offset & ump->umq2_bmask);
+       if (dq->dq2_lblkno == 0) {
+               bp = hbp;
+               q2e = (void *)((char *)bp->b_data + dq->dq2_blkoff);
+       } else {
+               error = getq2e(ump, type, dq->dq2_lblkno,
+                   dq->dq2_blkoff, &bp, &q2e, B_MODIFY);
+               if (error) {
+                       brelse(hbp, 0);
+                       return error;
+               }
+       }
+       hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+       /* remove from free list */
+       q2h->q2h_free = q2e->q2e_next;
+
+       memcpy(q2e, &q2h->q2h_defentry, sizeof(*q2e));
+       q2e->q2e_uid = ufs_rw32(uid, needswap);
+       /* insert in hash list */ 
+       q2e->q2e_next = q2h->q2h_entries[uid & hash_mask];
+       q2h->q2h_entries[uid & hash_mask] = ufs_rw64(offset, needswap);
+       if (hbp != bp) {
+               bwrite(hbp);
+       }
+       *q2ep = q2e;
+       *bpp = bp;
+       return 0;
+}
+
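+/*
+ * Get the on-disk quota entries for this inode's ids, optionally
+ * allocating them if they don't exist yet.  On success the
+ * dq_interlock of each valid dquot is held, and bpp[]/q2ep[] point
+ * to the entries' buffers and data (NULL where no quota applies).
+ */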
+static int
+getinoquota2(struct inode *ip, bool alloc, bool modify, struct buf **bpp,
+    struct quota2_entry **q2ep)
+{
+       int error;
+       int i;
+       struct dquot *dq;
+       struct ufsmount *ump = ip->i_ump;
+       u_int32_t ino_ids[MAXQUOTAS];
+
+       error = getinoquota(ip);
+       if (error)
+               return error;
+
+       if (alloc) {
+               UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
+       }
+       ino_ids[USRQUOTA] = ip->i_uid;
+       ino_ids[GRPQUOTA] = ip->i_gid;
+       /* first get the interlock for all dquot */
+       for (i = 0; i < MAXQUOTAS; i++) {
+               dq = ip->i_dquot[i];
+               if (dq == NODQUOT)
+                       continue;
+               mutex_enter(&dq->dq_interlock);
+       }
+       /* now get the corresponding quota entry */
+       for (i = 0; i < MAXQUOTAS; i++) {
+               bpp[i] = NULL;
+               q2ep[i] = NULL;
+               dq = ip->i_dquot[i];
+               if (dq == NODQUOT)
+                       continue;
+               if (__predict_false(ump->um_quotas[i] == NULL)) {
+                       /*
+                        * quotas have been turned off. This can happen
+                        * at umount time.
+                        */
+                       mutex_exit(&dq->dq_interlock);
+                       dqrele(NULLVP, dq);
+                       ip->i_dquot[i] = NULL;
+                       continue;
+               }
+
+               if ((dq->dq2_lblkno | dq->dq2_blkoff) == 0) {
+                       if (!alloc) {
+                               continue;
+                       }
+                       /* need to allocate a new on-disk quota entry */
+                       mutex_enter(&dqlock);
+                       error = quota2_q2ealloc(ump, i, ino_ids[i], dq,
+                           &bpp[i], &q2ep[i]);
+                       mutex_exit(&dqlock);
+                       if (error)
+                               return error;
+               } else {
+                       error = getq2e(ump, i, dq->dq2_lblkno,
+                           dq->dq2_blkoff, &bpp[i], &q2ep[i],
+                           modify ? B_MODIFY : 0);
+                       if (error)
+                               return error;
+               }
+       }
+       return 0;
+}
+
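+/*
+ * Check whether a change of 'change' blocks or files is allowed for
+ * this inode's ids and, if so, apply it.  A negative change always
+ * succeeds; a positive change is checked against the soft and hard
+ * limits unless the caller is allowed to exceed them.
+ */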
+static int
+quota2_check(struct inode *ip, int vtype, int64_t change, kauth_cred_t cred,
+    int flags)
+{
+       int error;
+       struct buf *bp[MAXQUOTAS];
+       struct quota2_entry *q2e[MAXQUOTAS];
+       struct quota2_val *q2vp;
+       struct dquot *dq;
+       uint64_t ncurblks;
+       struct ufsmount *ump = ip->i_ump;
+       struct mount *mp = ump->um_mountp;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       int i;
+
+       if ((error = getinoquota2(ip, change > 0, change != 0, bp, q2e)) != 0)
+               return error;
+       if (change == 0) {
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       dq = ip->i_dquot[i];
+                       if (dq == NODQUOT)
+                               continue;
+                       if (bp[i])
+                               brelse(bp[i], 0);
+                       mutex_exit(&dq->dq_interlock);
+               }
+               return 0;
+       }
+       if (change < 0) {
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       dq = ip->i_dquot[i];
+                       if (dq == NODQUOT)
+                               continue;
+                       if (q2e[i] == NULL) {
+                               mutex_exit(&dq->dq_interlock);
+                               continue;
+                       }
+                       q2vp = &q2e[i]->q2e_val[vtype];
+                       ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
+                       if (ncurblks < -change)
+                               ncurblks = 0;
+                       else
+                               ncurblks += change;
+                       q2vp->q2v_cur = ufs_rw64(ncurblks, needswap);
+                       quota2_bwrite(mp, bp[i]);
+                       mutex_exit(&dq->dq_interlock);
+               }
+               return 0;
+       }
+       /* see if the allocation is allowed */
+       for (i = 0; i < MAXQUOTAS; i++) {
+               struct quota2_val q2v;
+               int ql_stat;
+               dq = ip->i_dquot[i];
+               if (dq == NODQUOT)
+                       continue;
+               KASSERT(q2e[i] != NULL);
+               quota2_ufs_rwq2v(&q2e[i]->q2e_val[vtype], &q2v, needswap);
+               ql_stat = quota2_check_limit(&q2v, change, time_second);
+
+               if ((flags & FORCE) == 0 &&
+                   kauth_authorize_system(cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT,
+                   KAUTH_ARG(i), KAUTH_ARG(vtype), NULL) != 0) {
+                       /* enforce this limit */
+                       switch(QL_STATUS(ql_stat)) {
+                       case QL_S_DENY_HARD:
+                               if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+                                       uprintf("\n%s: write failed, %s %s "
+                                           "limit reached\n",
+                                           mp->mnt_stat.f_mntonname,
+                                           quotatypes[i], limnames[vtype]);
+                                       dq->dq_flags |= DQ_WARN(vtype);
+                               }
+                               error = EDQUOT;
+                               break;
+                       case QL_S_DENY_GRACE:
+                               if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+                                       uprintf("\n%s: write failed, %s %s "
+                                           "limit reached\n",
+                                           mp->mnt_stat.f_mntonname,
+                                           quotatypes[i], limnames[vtype]);
+                                       dq->dq_flags |= DQ_WARN(vtype);
+                               }
+                               error = EDQUOT;
+                               break;
+                       case QL_S_ALLOW_SOFT:
+                               if ((dq->dq_flags & DQ_WARN(vtype)) == 0) {
+                                       uprintf("\n%s: warning, %s %s "
+                                           "quota exceeded\n",
+                                           mp->mnt_stat.f_mntonname,
+                                           quotatypes[i], limnames[vtype]);
+                                       dq->dq_flags |= DQ_WARN(vtype);
+                               }
+                               break;
+                       }
+               }
+               /*
+                * Always do this; we don't know if the allocation will
+                * succeed or not in the end.  If we don't do the
+                * allocation, q2v_time will be ignored anyway.
+                */
+               if (ql_stat & QL_F_CROSS) {
+                       q2v.q2v_time = time_second + q2v.q2v_grace;
+                       quota2_ufs_rwq2v(&q2v, &q2e[i]->q2e_val[vtype],
+                           needswap);
+               }
+       }
+
+       /* now do the allocation if allowed */
+       for (i = 0; i < MAXQUOTAS; i++) {
+               dq = ip->i_dquot[i];
+               if (dq == NODQUOT)
+                       continue;
+               KASSERT(q2e[i] != NULL);
+               if (error == 0) {
+                       q2vp = &q2e[i]->q2e_val[vtype];
+                       ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
+                       q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap);
+                       quota2_bwrite(mp, bp[i]);
+               } else
+                       brelse(bp[i], 0);
+               mutex_exit(&dq->dq_interlock);
+       }
+       return error;
+}
+
+int
+chkdq2(struct inode *ip, int64_t change, kauth_cred_t cred, int flags)
+{
+       return quota2_check(ip, QL_BLOCK, change, cred, flags);
+}
+
+int
+chkiq2(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
+{
+       return quota2_check(ip, QL_FILE, change, cred, flags);
+}
+
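+/*
+ * Set the limits of an id's quota entry (or of the default entry)
+ * from the given proplib dictionary, allocating an on-disk entry if
+ * the id doesn't have one yet.
+ */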
+int
+quota2_handle_cmd_set(struct ufsmount *ump, int type, int id,
+    int defaultq, prop_dictionary_t data)
+{
+       int error;
+       struct dquot *dq;
+       struct quota2_header *q2h;
+       struct quota2_entry q2e, *q2ep;
+       struct buf *bp;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+       error = UFS_WAPBL_BEGIN(ump->um_mountp);
+       if (error)
+               return error;
+       
+       if (defaultq) {
+               mutex_enter(&dqlock);
+               error = getq2h(ump, type, &bp, &q2h, B_MODIFY);
+               if (error) {
+                       mutex_exit(&dqlock);
+                       goto out_wapbl;
+               }
+               quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+               error = quota2_dict_update_q2e_limits(data, &q2e);
+               if (error) {
+                       mutex_exit(&dqlock);
+                       brelse(bp, 0);
+                       goto out_wapbl;
+               }
+               quota2_ufs_rwq2e(&q2e, &q2h->q2h_defentry, needswap);
+               mutex_exit(&dqlock);
+               quota2_bwrite(ump->um_mountp, bp);
+               goto out_wapbl;
+       }
+
+       error = dqget(NULLVP, id, ump, type, &dq);
+       if (error)
+               goto out_wapbl;
+
+       mutex_enter(&dq->dq_interlock);
+       if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+               /* need to allocate a new on-disk quota entry */
+               mutex_enter(&dqlock);
+               error = quota2_q2ealloc(ump, type, id, dq, &bp, &q2ep);
+               mutex_exit(&dqlock);
+       } else {
+               error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+                   &bp, &q2ep, B_MODIFY);
+       }
+       if (error)
+               goto out_il;
+       
+       quota2_ufs_rwq2e(q2ep, &q2e, needswap);
+       error = quota2_dict_update_q2e_limits(data, &q2e);
+       if (error) {
+               brelse(bp, 0);
+               goto out_il;
+       }
+       quota2_ufs_rwq2e(&q2e, q2ep, needswap);
+       quota2_bwrite(ump->um_mountp, bp);
+
+out_il:
+       mutex_exit(&dq->dq_interlock);
+       dqrele(NULLVP, dq);
+out_wapbl:
+       UFS_WAPBL_END(ump->um_mountp);
+       return error;
+}
+
+struct dq2clear_callback {
+       uid_t id;
+       struct dquot *dq;
+       struct quota2_header *q2h;
+};
+
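+/*
+ * quota2_walk_list() callback for quota2_handle_cmd_clear(): when the
+ * entry for the id being cleared is found, clear its location in the
+ * dquot, unlink it from its hash chain, put it back on the free list
+ * and stop the walk.
+ */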
+static int
+dq2clear_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+    uint64_t off, void *v)
+{
+       struct dq2clear_callback *c = v;
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+       uint64_t myoff;
+
+       if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) {
+               KASSERT(mutex_owned(&c->dq->dq_interlock));
+               c->dq->dq2_lblkno = 0;
+               c->dq->dq2_blkoff = 0;
+               myoff = *offp;
+               /* remove from hash list */
+               *offp = q2e->q2e_next;
+               /* add to free list */
+               q2e->q2e_next = c->q2h->q2h_free;
+               c->q2h->q2h_free = myoff;
+               return Q2WL_ABORT;
+       }
+       return 0;
+}
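+
+/*
+ * Clear an id's quota entry.  If the entry still records some usage,
+ * revert it to the default limits instead; otherwise unlink it from
+ * its hash chain and return it to the free list.
+ */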
+int
+quota2_handle_cmd_clear(struct ufsmount *ump, int type, int id,
+    int defaultq, prop_dictionary_t data)
+{
+       int error, i;
+       struct dquot *dq;
+       struct quota2_header *q2h;
+       struct quota2_entry q2e, *q2ep;
+       struct buf *hbp, *bp;
+       u_long hash_mask;
+       struct dq2clear_callback c;
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+       if (defaultq)
+               return EOPNOTSUPP;
+
+       /* get the default entry before locking the entry's buffer */
+       mutex_enter(&dqlock);
+       error = getq2h(ump, type, &hbp, &q2h, 0);
+       if (error) {
+               mutex_exit(&dqlock);
+               return error;
+       }
+       /* we'll copy to another disk entry, so no need to swap */
+       memcpy(&q2e, &q2h->q2h_defentry, sizeof(q2e));
+       mutex_exit(&dqlock);
+       brelse(hbp, 0);
+
+       error = dqget(NULLVP, id, ump, type, &dq);
+       if (error)
+               return error;
+
+       mutex_enter(&dq->dq_interlock);
+       if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+               /* already clear, nothing to do */
+               error = ENOENT;
+               goto out_il;
+       }
+       error = UFS_WAPBL_BEGIN(ump->um_mountp);
+       if (error)
+               goto out_dq;
+       
+       error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+           &bp, &q2ep, B_MODIFY);
+       if (error)
+               goto out_wapbl;
+
+       if (q2ep->q2e_val[QL_BLOCK].q2v_cur != 0 ||
+           q2ep->q2e_val[QL_FILE].q2v_cur != 0) {
+               /* can't free this entry; revert to default */
+               for (i = 0; i < N_QL; i++) {
+                       q2ep->q2e_val[i].q2v_softlimit =
+                           q2e.q2e_val[i].q2v_softlimit;
+                       q2ep->q2e_val[i].q2v_hardlimit =
+                           q2e.q2e_val[i].q2v_hardlimit;
+                       q2ep->q2e_val[i].q2v_grace =
+                           q2e.q2e_val[i].q2v_grace;
+                       q2ep->q2e_val[i].q2v_time = 0;
+               }
+               quota2_bwrite(ump->um_mountp, bp);
+               goto out_wapbl;
+       }
+       /* we can free it. release bp so we can walk the list */
+       brelse(bp, 0);
+       mutex_enter(&dqlock);
+       error = getq2h(ump, type, &hbp, &q2h, 0);
+       if (error)
+               goto out_dqlock;
+
+       hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+       c.dq = dq;
+       c.id = id;
+       c.q2h = q2h;
+       error = quota2_walk_list(ump, hbp, type,
+           &q2h->q2h_entries[id & hash_mask], B_MODIFY, &c,
+           dq2clear_callback);
+
+       bwrite(hbp);
+
+out_dqlock:
+       mutex_exit(&dqlock);
+out_wapbl:
+       UFS_WAPBL_END(ump->um_mountp);
+out_il:
+       mutex_exit(&dq->dq_interlock);
+out_dq:
+       dqrele(NULLVP, dq);
+       return error;
+}
+
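+/*
+ * Read the quota entry of the given id and append it to the replies
+ * array as a proplib dictionary.
+ */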
+static int
+quota2_array_add_q2e(struct ufsmount *ump, int type,
+    int id, prop_array_t replies)
+{
+       struct dquot *dq;
+       int error;
+       struct quota2_entry *q2ep, q2e;
+       struct buf  *bp;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       prop_dictionary_t dict;
+
+       error = dqget(NULLVP, id, ump, type, &dq);
+       if (error)
+               return error;
+
+       mutex_enter(&dq->dq_interlock);
+       if (dq->dq2_lblkno == 0 && dq->dq2_blkoff == 0) {
+               mutex_exit(&dq->dq_interlock);
+               dqrele(NULLVP, dq);
+               return ENOENT;
+       }
+       error = getq2e(ump, type, dq->dq2_lblkno, dq->dq2_blkoff,
+           &bp, &q2ep, 0);
+       if (error) {
+               mutex_exit(&dq->dq_interlock);
+               dqrele(NULLVP, dq);
+               return error;
+       }
+       quota2_ufs_rwq2e(q2ep, &q2e, needswap);
+       brelse(bp, 0);
+       mutex_exit(&dq->dq_interlock);
+       dqrele(NULLVP, dq);
+       dict = q2etoprop(&q2e, 0);
+       if (dict == NULL)
+               return ENOMEM;
+       if (!prop_array_add_and_rel(replies, dict))
+               return ENOMEM;
+       return 0;
+}
+
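+/*
+ * Return an id's quota entry (or the default entry) in the replies
+ * array as a proplib dictionary.
+ */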
+int
+quota2_handle_cmd_get(struct ufsmount *ump, int type, int id,
+    int defaultq, prop_array_t replies)
+{
+       int error;
+       struct quota2_header *q2h;
+       struct quota2_entry q2e;
+       struct buf *bp;
+       prop_dictionary_t dict;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+       if (defaultq) {
+               mutex_enter(&dqlock);
+               error = getq2h(ump, type, &bp, &q2h, 0);
+               if (error) {
+                       mutex_exit(&dqlock);
+                       return error;
+               }
+               quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+               mutex_exit(&dqlock);
+               brelse(bp, 0);
+               dict = q2etoprop(&q2e, defaultq);
+               if (dict == NULL)
+                       return ENOMEM;
+               if (!prop_array_add_and_rel(replies, dict))
+                       return ENOMEM;
+       } else
+               error = quota2_array_add_q2e(ump, type, id, replies);
+       
+       return error;
+}
+
+struct getuids {
+       long nuids; /* number of uids in array */
+       long size;  /* size of array */
+       uid_t *uids; /* array of uids, dynamically allocated */
+};
+
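+/*
+ * quota2_walk_list() callback for quota2_handle_cmd_getall(): record
+ * the uid of each entry in a dynamically grown array.
+ */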
+static int
+quota2_getuids_callback(struct ufsmount *ump, uint64_t *offp,
+    struct quota2_entry *q2ep, uint64_t off, void *v)
+{
+       struct getuids *gu = v;
+       uid_t *newuids;
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+
+       if (gu->nuids == gu->size) {
+               newuids = realloc(gu->uids, gu->size + PAGE_SIZE, M_TEMP,
+                   M_WAITOK);
+               if (newuids == NULL) {
+                       free(gu->uids, M_TEMP);
+                       return ENOMEM;
+               }
+               gu->uids = newuids;
+               gu->size += (PAGE_SIZE / sizeof(uid_t));
+       }
+       gu->uids[gu->nuids] = ufs_rw32(q2ep->q2e_uid, needswap);
+       gu->nuids++;
+       return 0;
+}
+
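+/*
+ * Return the default entry and every allocated entry of the given
+ * quota type in the replies array.
+ */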
+int
+quota2_handle_cmd_getall(struct ufsmount *ump, int type, prop_array_t replies)
+{
+       int error;
+       struct quota2_header *q2h;
+       struct quota2_entry  q2e;
+       struct buf *hbp;
+       prop_dictionary_t dict;
+       uint64_t offset;
+       int i, j;
+       int quota2_hash_size;
+       const int needswap = UFS_MPNEEDSWAP(ump);
+       struct getuids gu;
+
+       if (ump->um_quotas[type] == NULLVP)
+               return ENODEV;
+       mutex_enter(&dqlock);
+       error = getq2h(ump, type, &hbp, &q2h, 0);
+       if (error) {
+               mutex_exit(&dqlock);
+               return error;
+       }
+       quota2_ufs_rwq2e(&q2h->q2h_defentry, &q2e, needswap);
+       dict = q2etoprop(&q2e, 1);
+       if (!prop_array_add_and_rel(replies, dict)) {
+               error = ENOMEM;
+               goto error_bp;
+       }
+       /*
+        * We can't directly get the entries here, as we can't walk the
+        * list with dqlock held and grab dq_interlock to read the entries
+        * at the same time.  So just walk the hash chains to build a list
+        * of uids, and then read the entries for these uids.
+        */
+       memset(&gu, 0, sizeof(gu));
+       quota2_hash_size = ufs_rw16(q2h->q2h_hash_size, needswap);
+       for (i = 0; i < quota2_hash_size ; i++) {
+               offset = q2h->q2h_entries[i];
+               error = quota2_walk_list(ump, hbp, type, &offset, 0, &gu,
+                   quota2_getuids_callback);
+               if (error) {
+                       if (gu.uids != NULL)
+                               free(gu.uids, M_TEMP);
+                       break;
+               }
+       }
+error_bp:
+       mutex_exit(&dqlock);
+       brelse(hbp, 0);
+       if (error)
+               return error;
+       for (j = 0; j < gu.nuids; j++) {
+               error = quota2_array_add_q2e(ump, type,
+                   gu.uids[j], replies);
+               if (error && error != ENOENT)
+                       break;
+       }
+       free(gu.uids, M_TEMP);
+       return error;
+}
+
+int
+q2sync(struct mount *mp)
+{
+       return 0;
+}
+
+struct dq2get_callback {
+       uid_t id;
+       struct dquot *dq;
+};
+
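+/*
+ * quota2_walk_list() callback for dq2get(): when the entry for the
+ * id being looked up is found, record its on-disk location in the
+ * dquot and stop the walk.
+ */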
+static int
+dq2get_callback(struct ufsmount *ump, uint64_t *offp, struct quota2_entry *q2e,
+    uint64_t off, void *v)
+{
+       struct dq2get_callback *c = v;
+       daddr_t lblkno;
+       int blkoff;
+#ifdef FFS_EI
+       const int needswap = UFS_MPNEEDSWAP(ump);
+#endif
+
+       if (ufs_rw32(q2e->q2e_uid, needswap) == c->id) {
+               KASSERT(mutex_owned(&c->dq->dq_interlock));
+               lblkno = (off >> ump->um_mountp->mnt_fs_bshift);
+               blkoff = (off & ump->umq2_bmask);
+               c->dq->dq2_lblkno = lblkno;
+               c->dq->dq2_blkoff = blkoff;
+               return Q2WL_ABORT;
+       }
+       return 0;
+}
+
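+/*
+ * Find the on-disk quota entry for the given id by walking its hash
+ * chain, and record the entry's location in the dquot.  Called with
+ * the dquot's dq_interlock held.
+ */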
+int
+dq2get(struct vnode *dqvp, u_long id, struct ufsmount *ump, int type,
+    struct dquot *dq)
+{
+       struct buf *bp;
+       struct quota2_header *q2h;
+       int error;
+       daddr_t offset;
+       u_long hash_mask;
+       struct dq2get_callback c = {
+               .id = id,
+               .dq = dq
+       };
+
+       KASSERT(mutex_owned(&dq->dq_interlock));
+       mutex_enter(&dqlock);
+       error = getq2h(ump, type, &bp, &q2h, 0);
+       if (error)
+               goto out_mutex;
+       /* look for our entry */
+       hash_mask = ((1 << q2h->q2h_hash_shift) - 1);
+       offset = q2h->q2h_entries[id & hash_mask];
+       error = quota2_walk_list(ump, bp, type, &offset, 0, (void *)&c,
+           dq2get_callback);
+       brelse(bp, 0);
+out_mutex:
+       mutex_exit(&dqlock);
+       return error;
+}
+
+int
+dq2sync(struct vnode *vp, struct dquot *dq)
+{
+       return 0;
+}
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
new file mode 100644 (file)
index 0000000..4ab40c8
--- /dev/null
@@ -0,0 +1,533 @@
+/*     $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $     */
+
+/*-
+ * Copyright (c) 1993
+ *     The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_readwrite.c     8.11 (Berkeley) 5/8/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $");
+
+#ifdef LFS_READWRITE
+#define        FS                      struct lfs
+#define        I_FS                    i_lfs
+#define        READ                    lfs_read
+#define        READ_S                  "lfs_read"
+#define        WRITE                   lfs_write
+#define        WRITE_S                 "lfs_write"
+#define        fs_bsize                lfs_bsize
+#define        fs_bmask                lfs_bmask
+#define        UFS_WAPBL_BEGIN(mp)     0
+#define        UFS_WAPBL_END(mp)       do { } while (0)
+#define        UFS_WAPBL_UPDATE(vp, access, modify, flags)     do { } while (0)
+#else
+#define        FS                      struct fs
+#define        I_FS                    i_fs
+#define        READ                    ffs_read
+#define        READ_S                  "ffs_read"
+#define        WRITE                   ffs_write
+#define        WRITE_S                 "ffs_write"
+#endif
+
+/*
+ * Vnode op for reading.
+ */
+/* ARGSUSED */
+int
+READ(void *v)
+{
+       struct vop_read_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               int a_ioflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct inode *ip;
+       struct uio *uio;
+       struct ufsmount *ump;
+       struct buf *bp;
+       FS *fs;
+       vsize_t bytelen;
+       daddr_t lbn, nextlbn;
+       off_t bytesinfile;
+       long size, xfersize, blkoffset;
+       int error, ioflag;
+       bool usepc = false;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+       uio = ap->a_uio;
+       ioflag = ap->a_ioflag;
+       error = 0;
+
+#ifdef DIAGNOSTIC
+       if (uio->uio_rw != UIO_READ)
+               panic("%s: mode", READ_S);
+
+       if (vp->v_type == VLNK) {
+               if (ip->i_size < ump->um_maxsymlinklen ||
+                   (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0))
+                       panic("%s: short symlink", READ_S);
+       } else if (vp->v_type != VREG && vp->v_type != VDIR)
+               panic("%s: type %d", READ_S, vp->v_type);
+#endif
+       fs = ip->I_FS;
+       if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
+               return (EFBIG);
+       if (uio->uio_resid == 0)
+               return (0);
+
+#ifndef LFS_READWRITE
+       if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
+               return ffs_snapshot_read(vp, uio, ioflag);
+#endif /* !LFS_READWRITE */
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+       if (uio->uio_offset >= ip->i_size)
+               goto out;
+
+#ifdef LFS_READWRITE
+       usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
+#else /* !LFS_READWRITE */
+       usepc = vp->v_type == VREG;
+#endif /* !LFS_READWRITE */
+       if (usepc) {
+               const int advice = IO_ADV_DECODE(ap->a_ioflag);
+
+               while (uio->uio_resid > 0) {
+                       if (ioflag & IO_DIRECT) {
+                               genfs_directio(vp, uio, ioflag);
+                       }
+                       bytelen = MIN(ip->i_size - uio->uio_offset,
+                           uio->uio_resid);
+                       if (bytelen == 0)
+                               break;
+                       error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
+                           UBC_READ | UBC_PARTIALOK | UBC_UNMAP_FLAG(vp));
+                       if (error)
+                               break;
+               }
+               goto out;
+       }
+
+       for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
+               bytesinfile = ip->i_size - uio->uio_offset;
+               if (bytesinfile <= 0)
+                       break;
+               lbn = lblkno(fs, uio->uio_offset);
+               nextlbn = lbn + 1;
+               size = blksize(fs, ip, lbn);
+               blkoffset = blkoff(fs, uio->uio_offset);
+               xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
+                   bytesinfile);
+
+               if (lblktosize(fs, nextlbn) >= ip->i_size)
+                       error = bread(vp, lbn, size, NOCRED, 0, &bp);
+               else {
+                       int nextsize = blksize(fs, ip, nextlbn);
+                       error = breadn(vp, lbn,
+                           size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
+               }
+               if (error)
+                       break;
+
+               /*
+                * We should only get non-zero b_resid when an I/O error
+                * has occurred, which should cause us to break above.
+                * However, if the short read did not cause an error,
+                * then we want to ensure that we do not uiomove bad
+                * or uninitialized data.
+                */
+               size -= bp->b_resid;
+               if (size < xfersize) {
+                       if (size == 0)
+                               break;
+                       xfersize = size;
+               }
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+               if (error)
+                       break;
+               brelse(bp, 0);
+       }
+       if (bp != NULL)
+               brelse(bp, 0);
+
+ out:
+       if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
+               ip->i_flag |= IN_ACCESS;
+               if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (error) {
+                               fstrans_done(vp->v_mount);
+                               return error;
+                       }
+                       error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+                       UFS_WAPBL_END(vp->v_mount);
+               }
+       }
+
+       fstrans_done(vp->v_mount);
+       return (error);
+}
+
+/*
+ * Vnode op for writing.
+ */
+int
+WRITE(void *v)
+{
+       struct vop_write_args /* {
+               struct vnode *a_vp;
+               struct uio *a_uio;
+               int a_ioflag;
+               kauth_cred_t a_cred;
+       } */ *ap = v;
+       struct vnode *vp;
+       struct uio *uio;
+       struct inode *ip;
+       FS *fs;
+       struct buf *bp;
+       kauth_cred_t cred;
+       daddr_t lbn;
+       off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
+       int blkoffset, error, flags, ioflag, resid, size, xfersize;
+       int aflag;
+       int extended=0;
+       vsize_t bytelen;
+       bool async;
+       bool usepc = false;
+#ifdef LFS_READWRITE
+       bool need_unreserve = false;
+#endif
+       struct ufsmount *ump;
+
+       cred = ap->a_cred;
+       ioflag = ap->a_ioflag;
+       uio = ap->a_uio;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       ump = ip->i_ump;
+
+       KASSERT(vp->v_size == ip->i_size);
+#ifdef DIAGNOSTIC
+       if (uio->uio_rw != UIO_WRITE)
+               panic("%s: mode", WRITE_S);
+#endif
+
+       switch (vp->v_type) {
+       case VREG:
+               if (ioflag & IO_APPEND)
+                       uio->uio_offset = ip->i_size;
+               if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
+                       return (EPERM);
+               /* FALLTHROUGH */
+       case VLNK:
+               break;
+       case VDIR:
+               if ((ioflag & IO_SYNC) == 0)
+                       panic("%s: nonsync dir write", WRITE_S);
+               break;
+       default:
+               panic("%s: type", WRITE_S);
+       }
+
+       fs = ip->I_FS;
+       if (uio->uio_offset < 0 ||
+           (u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
+               return (EFBIG);
+#ifdef LFS_READWRITE
+       /* Disallow writes to the Ifile, even if noschg flag is removed */
+       /* XXX can this go away when the Ifile is no longer in the namespace? */
+       if (vp == fs->lfs_ivnode)
+               return (EPERM);
+#endif
+       if (uio->uio_resid == 0)
+               return (0);
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+       flags = ioflag & IO_SYNC ? B_SYNC : 0;
+       async = vp->v_mount->mnt_flag & MNT_ASYNC;
+       origoff = uio->uio_offset;
+       resid = uio->uio_resid;
+       osize = ip->i_size;
+       error = 0;
+
+       usepc = vp->v_type == VREG;
+
+       if ((ioflag & IO_JOURNALLOCKED) == 0) {
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error) {
+                       fstrans_done(vp->v_mount);
+                       return error;
+               }
+       }
+
+#ifdef LFS_READWRITE
+       async = true;
+       lfs_check(vp, LFS_UNUSED_LBN, 0);
+#endif /* LFS_READWRITE */
+       if (!usepc)
+               goto bcache;
+
+       preallocoff = round_page(blkroundup(fs, MAX(osize, uio->uio_offset)));
+       aflag = ioflag & IO_SYNC ? B_SYNC : 0;
+       nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
+       endallocoff = nsize - blkoff(fs, nsize);
+
+       /*
+        * if we're increasing the file size, deal with expanding
+        * the fragment if there is one.
+        */
+
+       if (nsize > osize && lblkno(fs, osize) < NDADDR &&
+           lblkno(fs, osize) != lblkno(fs, nsize) &&
+           blkroundup(fs, osize) != osize) {
+               off_t eob;
+
+               eob = blkroundup(fs, osize);
+               uvm_vnp_setwritesize(vp, eob);
+               error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
+               if (error)
+                       goto out;
+               if (flags & B_SYNC) {
+                       mutex_enter(vp->v_interlock);
+                       VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
+                           round_page(eob),
+                           PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+               }
+       }
+
+       while (uio->uio_resid > 0) {
+               int ubc_flags = UBC_WRITE;
+               bool overwrite; /* if we're overwriting a whole block */
+               off_t newoff;
+
+               if (ioflag & IO_DIRECT) {
+                       genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
+               }
+
+               oldoff = uio->uio_offset;
+               blkoffset = blkoff(fs, uio->uio_offset);
+               bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
+               if (bytelen == 0) {
+                       break;
+               }
+
+               /*
+                * if we're filling in a hole, allocate the blocks now and
+                * initialize the pages first.  if we're extending the file,
+                * we can safely allocate blocks without initializing pages
+                * since the new blocks will be inaccessible until the write
+                * is complete.
+                */
+               overwrite = uio->uio_offset >= preallocoff &&
+                   uio->uio_offset < endallocoff;
+               if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
+                   blkoff(fs, uio->uio_offset) == 0 &&
+                   (uio->uio_offset & PAGE_MASK) == 0) {
+                       vsize_t len;
+
+                       len = trunc_page(bytelen);
+                       len -= blkoff(fs, len);
+                       if (len > 0) {
+                               overwrite = true;
+                               bytelen = len;
+                       }
+               }
+
+               newoff = oldoff + bytelen;
+               if (vp->v_size < newoff) {
+                       uvm_vnp_setwritesize(vp, newoff);
+               }
+
+               if (!overwrite) {
+                       error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
+                           cred, aflag);
+                       if (error)
+                               break;
+               } else {
+                       genfs_node_wrlock(vp);
+                       error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
+                           aflag, cred);
+                       genfs_node_unlock(vp);
+                       if (error)
+                               break;
+                       ubc_flags |= UBC_FAULTBUSY;
+               }
+
+               /*
+                * copy the data.
+                */
+
+               error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
+                   IO_ADV_DECODE(ioflag), ubc_flags | UBC_UNMAP_FLAG(vp));
+
+               /*
+                * update UVM's notion of the size now that we've
+                * copied the data into the vnode's pages.
+                *
+                * we should update the size even when uiomove failed.
+                */
+
+               if (vp->v_size < newoff) {
+                       uvm_vnp_setsize(vp, newoff);
+                       extended = 1;
+               }
+
+               if (error)
+                       break;
+
+               /*
+                * flush what we just wrote if necessary.
+                * XXXUBC simplistic async flushing.
+                */
+
+#ifndef LFS_READWRITE
+               if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
+                       mutex_enter(vp->v_interlock);
+                       error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
+                           (uio->uio_offset >> 16) << 16,
+                           PGO_CLEANIT | PGO_JOURNALLOCKED);
+                       if (error)
+                               break;
+               }
+#endif
+       }
+       if (error == 0 && ioflag & IO_SYNC) {
+               mutex_enter(vp->v_interlock);
+               error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
+                   round_page(blkroundup(fs, uio->uio_offset)),
+                   PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
+       }
+       goto out;
+
+ bcache:
+       mutex_enter(vp->v_interlock);
+       VOP_PUTPAGES(vp, trunc_page(origoff), round_page(origoff + resid),
+           PGO_CLEANIT | PGO_FREE | PGO_SYNCIO | PGO_JOURNALLOCKED);
+       while (uio->uio_resid > 0) {
+               lbn = lblkno(fs, uio->uio_offset);
+               blkoffset = blkoff(fs, uio->uio_offset);
+               xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
+               if (fs->fs_bsize > xfersize)
+                       flags |= B_CLRBUF;
+               else
+                       flags &= ~B_CLRBUF;
+
+#ifdef LFS_READWRITE
+               error = lfs_reserve(fs, vp, NULL,
+                   btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+               if (error)
+                       break;
+               need_unreserve = true;
+#endif
+               error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
+                   ap->a_cred, flags, &bp);
+
+               if (error)
+                       break;
+               if (uio->uio_offset + xfersize > ip->i_size) {
+                       ip->i_size = uio->uio_offset + xfersize;
+                       DIP_ASSIGN(ip, size, ip->i_size);
+                       uvm_vnp_setsize(vp, ip->i_size);
+                       extended = 1;
+               }
+               size = blksize(fs, ip, lbn) - bp->b_resid;
+               if (xfersize > size)
+                       xfersize = size;
+
+               error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
+
+               /*
+                * if we didn't clear the block and the uiomove failed,
+                * the buf will now contain part of some other file,
+                * so we need to invalidate it.
+                */
+               if (error && (flags & B_CLRBUF) == 0) {
+                       brelse(bp, BC_INVAL);
+                       break;
+               }
+#ifdef LFS_READWRITE
+               (void)VOP_BWRITE(bp->b_vp, bp);
+               lfs_reserve(fs, vp, NULL,
+                   -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+               need_unreserve = false;
+#else
+               if (ioflag & IO_SYNC)
+                       (void)bwrite(bp);
+               else if (xfersize + blkoffset == fs->fs_bsize)
+                       bawrite(bp);
+               else
+                       bdwrite(bp);
+#endif
+               if (error || xfersize == 0)
+                       break;
+       }
+#ifdef LFS_READWRITE
+       if (need_unreserve) {
+               lfs_reserve(fs, vp, NULL,
+                   -btofsb(fs, (NIADDR + 1) << fs->lfs_bshift));
+       }
+#endif
+
+       /*
+        * If we successfully wrote any data and we are not the superuser,
+        * we clear the setuid and setgid bits as a precaution against
+        * tampering.
+        */
+out:
+       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+       if (vp->v_mount->mnt_flag & MNT_RELATIME)
+               ip->i_flag |= IN_ACCESS;
+       if (resid > uio->uio_resid && ap->a_cred &&
+           kauth_authorize_generic(ap->a_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+               ip->i_mode &= ~(ISUID | ISGID);
+               DIP_ASSIGN(ip, mode, ip->i_mode);
+       }
+       if (resid > uio->uio_resid)
+               VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+       if (error) {
+               (void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred);
+               uio->uio_offset -= resid - uio->uio_resid;
+               uio->uio_resid = resid;
+       } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
+               error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+       else
+               UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+       KASSERT(vp->v_size == ip->i_size);
+       if ((ioflag & IO_JOURNALLOCKED) == 0)
+               UFS_WAPBL_END(vp->v_mount);
+       fstrans_done(vp->v_mount);
+
+       return (error);
+}
diff --git a/sys/ufs/ufs/ufs_vfsops.c b/sys/ufs/ufs/ufs_vfsops.c
new file mode 100644 (file)
index 0000000..ac7230b
--- /dev/null
@@ -0,0 +1,308 @@
+/*     $NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $   */
+
+/*
+ * Copyright (c) 1991, 1993, 1994
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_vfsops.c        8.8 (Berkeley) 5/20/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.42 2011/03/24 17:05:46 bouyer Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/kauth.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_extern.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <quota/quotaprop.h>
+
+/* how many times ufs_init() was called */
+static int ufs_initcount = 0;
+
+pool_cache_t ufs_direct_cache;
+
+/*
+ * Make a filesystem operational.
+ * Nothing to do at the moment.
+ */
+/* ARGSUSED */
+int
+ufs_start(struct mount *mp, int flags)
+{
+
+       return (0);
+}
+
+/*
+ * Return the root of a filesystem.
+ */
+int
+ufs_root(struct mount *mp, struct vnode **vpp)
+{
+       struct vnode *nvp;
+       int error;
+
+       if ((error = VFS_VGET(mp, (ino_t)ROOTINO, &nvp)) != 0)
+               return (error);
+       *vpp = nvp;
+       return (0);
+}
+
+/*
+ * Do operations associated with quotas
+ */
+int
+ufs_quotactl(struct mount *mp, prop_dictionary_t dict)
+{
+       struct lwp *l = curlwp;
+
+#if !defined(QUOTA) && !defined(QUOTA2)
+       (void) mp;
+       (void) dict;
+       (void) l;
+       return (EOPNOTSUPP);
+#else
+       int  error;
+       prop_dictionary_t cmddict;
+       prop_array_t commands;
+       prop_object_iterator_t iter;
+
+       /* Mark the mount busy, as we're passing it to kauth(9). */
+       error = vfs_busy(mp, NULL);
+       if (error)
+               return (error);
+
+       error = quota_get_cmds(dict, &commands);
+       if (error)
+               goto out_vfs;
+       iter = prop_array_iterator(commands);
+       if (iter == NULL) {
+               error = ENOMEM;
+               goto out_vfs;
+       }
+
+       mutex_enter(&mp->mnt_updating);
+       while ((cmddict = prop_object_iterator_next(iter)) != NULL) {
+               if (prop_object_type(cmddict) != PROP_TYPE_DICTIONARY)
+                       continue;
+               error = quota_handle_cmd(mp, l, cmddict);
+               if (error)
+                       break;
+       }
+       prop_object_iterator_release(iter);
+       mutex_exit(&mp->mnt_updating);
+out_vfs:
+       vfs_unbusy(mp, false, NULL);
+       return (error);
+#endif
+}
+       
+#if 0
+       switch (cmd) {
+       case Q_SYNC:
+               break;
+
+       case Q_GETQUOTA:
+               /* The user can always query about his own quota. */
+               if (uid == kauth_cred_getuid(l->l_cred))
+                       break;
+
+               error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL);
+
+               break;
+
+       case Q_QUOTAON:
+       case Q_QUOTAOFF:
+               error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
+
+               break;
+
+       case Q_SETQUOTA:
+       case Q_SETUSE:
+               error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
+                   KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL);
+
+               break;
+
+       default:
+               error = EINVAL;
+               break;
+       }
+
+       type = cmds & SUBCMDMASK;
+       if (!error) {
+               /* Only check if there was no error above. */
+               if ((u_int)type >= MAXQUOTAS)
+                       error = EINVAL;
+       }
+
+       if (error) {
+               vfs_unbusy(mp, false, NULL);
+               return (error);
+       }
+
+       mutex_enter(&mp->mnt_updating);
+       switch (cmd) {
+
+       case Q_QUOTAON:
+               error = quotaon(l, mp, type, arg);
+               break;
+
+       case Q_QUOTAOFF:
+               error = quotaoff(l, mp, type);
+               break;
+
+       case Q_SETQUOTA:
+               error = setquota(mp, uid, type, arg);
+               break;
+
+       case Q_SETUSE:
+               error = setuse(mp, uid, type, arg);
+               break;
+
+       case Q_GETQUOTA:
+               error = getquota(mp, uid, type, arg);
+               break;
+
+       case Q_SYNC:
+               error = qsync(mp);
+               break;
+
+       default:
+               error = EINVAL;
+       }
+       mutex_exit(&mp->mnt_updating);
+       vfs_unbusy(mp, false, NULL);
+       return (error);
+#endif
+
+/*
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ */
+int
+ufs_fhtovp(struct mount *mp, struct ufid *ufhp, struct vnode **vpp)
+{
+       struct vnode *nvp;
+       struct inode *ip;
+       int error;
+
+       if ((error = VFS_VGET(mp, ufhp->ufid_ino, &nvp)) != 0) {
+               *vpp = NULLVP;
+               return (error);
+       }
+       ip = VTOI(nvp);
+       if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen) {
+               vput(nvp);
+               *vpp = NULLVP;
+               return (ESTALE);
+       }
+       *vpp = nvp;
+       return (0);
+}
+
+/*
+ * Initialize UFS filesystems, done only once.
+ */
+void
+ufs_init(void)
+{
+       if (ufs_initcount++ > 0)
+               return;
+
+       ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0,
+           "ufsdir", NULL, IPL_NONE, NULL, NULL, NULL);
+
+       ufs_ihashinit();
+#if defined(QUOTA) || defined(QUOTA2)
+       dqinit();
+#endif
+#ifdef UFS_DIRHASH
+       ufsdirhash_init();
+#endif
+#ifdef UFS_EXTATTR
+       ufs_extattr_init();
+#endif
+}
+
+void
+ufs_reinit(void)
+{
+       ufs_ihashreinit();
+#if defined(QUOTA) || defined(QUOTA2)
+       dqreinit();
+#endif
+}
+
+/*
+ * Free UFS filesystem resources, done only once.
+ */
+void
+ufs_done(void)
+{
+       if (--ufs_initcount > 0)
+               return;
+
+       ufs_ihashdone();
+#if defined(QUOTA) || defined(QUOTA2)
+       dqdone();
+#endif
+       pool_cache_destroy(ufs_direct_cache);
+#ifdef UFS_DIRHASH
+       ufsdirhash_done();
+#endif
+#ifdef UFS_EXTATTR
+       ufs_extattr_done();
+#endif
+}
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
new file mode 100644 (file)
index 0000000..634f966
--- /dev/null
@@ -0,0 +1,2989 @@
+/*     $NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $ */
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.206 2011/11/18 21:18:52 christos Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_quota.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+#include <miscfs/genfs/genfs.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#ifdef UFS_DIRHASH
+#include <ufs/ufs/dirhash.h>
+#endif
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/ext2fs/ext2fs_dir.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/lfs/lfs_extern.h>
+#include <ufs/lfs/lfs.h>
+
+#include <uvm/uvm.h>
+
+__CTASSERT(EXT2FS_MAXNAMLEN == FFS_MAXNAMLEN);
+__CTASSERT(LFS_MAXNAMLEN == FFS_MAXNAMLEN);
+
+static int ufs_chmod(struct vnode *, int, kauth_cred_t, struct lwp *);
+static int ufs_chown(struct vnode *, uid_t, gid_t, kauth_cred_t,
+    struct lwp *);
+
+/*
+ * A virgin directory (no blushing please).
+ */
+static const struct dirtemplate mastertemplate = {
+       0,      12,             DT_DIR, 1,      ".",
+       0,      DIRBLKSIZ - 12, DT_DIR, 2,      ".."
+};
+
+/*
+ * Create a regular file
+ */
+int
+ufs_create(void *v)
+{
+       struct vop_create_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            **a_vpp;
+               struct componentname    *a_cnp;
+               struct vattr            *a_vap;
+       } */ *ap = v;
+       int     error;
+       struct vnode *dvp = ap->a_dvp;
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       /*
+        * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+        * ufs_makeinode
+        */
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+       error =
+           ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
+                         dvp, ulr, ap->a_vpp, ap->a_cnp);
+       if (error) {
+               fstrans_done(dvp->v_mount);
+               return (error);
+       }
+       UFS_WAPBL_END1(dvp->v_mount, dvp);
+       fstrans_done(dvp->v_mount);
+       VN_KNOTE(dvp, NOTE_WRITE);
+       return (0);
+}
+
+/*
+ * Mknod vnode call
+ */
+/* ARGSUSED */
+int
+ufs_mknod(void *v)
+{
+       struct vop_mknod_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            **a_vpp;
+               struct componentname    *a_cnp;
+               struct vattr            *a_vap;
+       } */ *ap = v;
+       struct vattr    *vap;
+       struct vnode    **vpp;
+       struct inode    *ip;
+       int             error;
+       struct mount    *mp;
+       ino_t           ino;
+       struct ufs_lookup_results *ulr;
+
+       vap = ap->a_vap;
+       vpp = ap->a_vpp;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+       /*
+        * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+        * ufs_makeinode
+        */
+       fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
+       if ((error =
+           ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
+           ap->a_dvp, ulr, vpp, ap->a_cnp)) != 0)
+               goto out;
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       ip = VTOI(*vpp);
+       mp  = (*vpp)->v_mount;
+       ino = ip->i_number;
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       if (vap->va_rdev != VNOVAL) {
+               struct ufsmount *ump = ip->i_ump;
+               /*
+                * Want to be able to use this to make badblock
+                * inodes, so don't truncate the dev number.
+                */
+               if (ump->um_fstype == UFS1)
+                       ip->i_ffs1_rdev = ufs_rw32(vap->va_rdev,
+                           UFS_MPNEEDSWAP(ump));
+               else
+                       ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev,
+                           UFS_MPNEEDSWAP(ump));
+       }
+       UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0);
+       UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
+       /*
+        * Remove inode so that it will be reloaded by VFS_VGET and
+        * checked to see if it is an alias of an existing entry in
+        * the inode cache.
+        */
+       (*vpp)->v_type = VNON;
+       VOP_UNLOCK(*vpp);
+       vgone(*vpp);
+       error = VFS_VGET(mp, ino, vpp);
+out:
+       fstrans_done(ap->a_dvp->v_mount);
+       if (error != 0) {
+               *vpp = NULL;
+               return (error);
+       }
+       return (0);
+}
+
+/*
+ * Open called.
+ *
+ * Nothing to do.
+ */
+/* ARGSUSED */
+int
+ufs_open(void *v)
+{
+       struct vop_open_args /* {
+               struct vnode    *a_vp;
+               int             a_mode;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+
+       /*
+        * Files marked append-only must be opened for appending.
+        */
+       if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
+           (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+               return (EPERM);
+       return (0);
+}
+
+/*
+ * Close called.
+ *
+ * Update the times on the inode.
+ */
+/* ARGSUSED */
+int
+ufs_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode    *a_vp;
+               int             a_fflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
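+	/*
+	 * Update the inode times only while other users still hold the
+	 * vnode; the final reference is handled on the inactive path.
+	 */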
+       if (vp->v_usecount > 1)
+               UFS_ITIMES(vp, NULL, NULL, NULL);
+       fstrans_done(vp->v_mount);
+       return (0);
+}
+
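+/*
+ * State-based checks (read-only mount, quota, snapshot and immutable
+ * flags), as distinct from the permission check in ufs_check_permitted.
+ */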
+static int
+ufs_check_possible(struct vnode *vp, struct inode *ip, mode_t mode,
+    kauth_cred_t cred)
+{
+#if defined(QUOTA) || defined(QUOTA2)
+       int error;
+#endif
+
+       /*
+        * Disallow write attempts on read-only file systems;
+        * unless the file is a socket, fifo, or a block or
+        * character device resident on the file system.
+        */
+       if (mode & VWRITE) {
+               switch (vp->v_type) {
+               case VDIR:
+               case VLNK:
+               case VREG:
+                       if (vp->v_mount->mnt_flag & MNT_RDONLY)
+                               return (EROFS);
+#if defined(QUOTA) || defined(QUOTA2)
+                       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+                       error = chkdq(ip, 0, cred, 0);
+                       fstrans_done(vp->v_mount);
+                       if (error != 0)
+                               return error;
+#endif
+                       break;
+               case VBAD:
+               case VBLK:
+               case VCHR:
+               case VSOCK:
+               case VFIFO:
+               case VNON:
+               default:
+                       break;
+               }
+       }
+
+       /* If it is a snapshot, nobody gets access to it. */
+       if ((ip->i_flags & SF_SNAPSHOT))
+               return (EPERM);
+       /* If immutable bit set, nobody gets to write it. */
+       if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE))
+               return (EPERM);
+
+       return 0;
+}
+
+static int
+ufs_check_permitted(struct vnode *vp, struct inode *ip, mode_t mode,
+    kauth_cred_t cred)
+{
+
+       return genfs_can_access(vp->v_type, ip->i_mode & ALLPERMS, ip->i_uid,
+           ip->i_gid, mode, cred);
+}
+
+int
+ufs_access(void *v)
+{
+       struct vop_access_args /* {
+               struct vnode    *a_vp;
+               int             a_mode;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+       mode_t          mode;
+       int             error;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       mode = ap->a_mode;
+
+       error = ufs_check_possible(vp, ip, mode, ap->a_cred);
+       if (error)
+               return error;
+
+       error = ufs_check_permitted(vp, ip, mode, ap->a_cred);
+
+       return error;
+}
+
+/* ARGSUSED */
+int
+ufs_getattr(void *v)
+{
+       struct vop_getattr_args /* {
+               struct vnode    *a_vp;
+               struct vattr    *a_vap;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+       struct vattr    *vap;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       vap = ap->a_vap;
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+       UFS_ITIMES(vp, NULL, NULL, NULL);
+
+       /*
+        * Copy from inode table
+        */
+       vap->va_fsid = ip->i_dev;
+       vap->va_fileid = ip->i_number;
+       vap->va_mode = ip->i_mode & ALLPERMS;
+       vap->va_nlink = ip->i_nlink;
+       vap->va_uid = ip->i_uid;
+       vap->va_gid = ip->i_gid;
+       vap->va_size = vp->v_size;
+       if (ip->i_ump->um_fstype == UFS1) {
+               vap->va_rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev,
+                   UFS_MPNEEDSWAP(ip->i_ump));
+               vap->va_atime.tv_sec = ip->i_ffs1_atime;
+               vap->va_atime.tv_nsec = ip->i_ffs1_atimensec;
+               vap->va_mtime.tv_sec = ip->i_ffs1_mtime;
+               vap->va_mtime.tv_nsec = ip->i_ffs1_mtimensec;
+               vap->va_ctime.tv_sec = ip->i_ffs1_ctime;
+               vap->va_ctime.tv_nsec = ip->i_ffs1_ctimensec;
+               vap->va_birthtime.tv_sec = 0;
+               vap->va_birthtime.tv_nsec = 0;
+               vap->va_bytes = dbtob((u_quad_t)ip->i_ffs1_blocks);
+       } else {
+               vap->va_rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev,
+                   UFS_MPNEEDSWAP(ip->i_ump));
+               vap->va_atime.tv_sec = ip->i_ffs2_atime;
+               vap->va_atime.tv_nsec = ip->i_ffs2_atimensec;
+               vap->va_mtime.tv_sec = ip->i_ffs2_mtime;
+               vap->va_mtime.tv_nsec = ip->i_ffs2_mtimensec;
+               vap->va_ctime.tv_sec = ip->i_ffs2_ctime;
+               vap->va_ctime.tv_nsec = ip->i_ffs2_ctimensec;
+               vap->va_birthtime.tv_sec = ip->i_ffs2_birthtime;
+               vap->va_birthtime.tv_nsec = ip->i_ffs2_birthnsec;
+               vap->va_bytes = dbtob(ip->i_ffs2_blocks);
+       }
+       vap->va_gen = ip->i_gen;
+       vap->va_flags = ip->i_flags;
+
+       /* this doesn't belong here */
+       if (vp->v_type == VBLK)
+               vap->va_blocksize = BLKDEV_IOSIZE;
+       else if (vp->v_type == VCHR)
+               vap->va_blocksize = MAXBSIZE;
+       else
+               vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+       vap->va_type = vp->v_type;
+       vap->va_filerev = ip->i_modrev;
+       fstrans_done(vp->v_mount);
+       return (0);
+}
+
+/*
+ * Set attribute vnode op. called from several syscalls
+ */
+int
+ufs_setattr(void *v)
+{
+       struct vop_setattr_args /* {
+               struct vnode    *a_vp;
+               struct vattr    *a_vap;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vattr    *vap;
+       struct vnode    *vp;
+       struct inode    *ip;
+       kauth_cred_t    cred;
+       struct lwp      *l;
+       int             error;
+
+       vap = ap->a_vap;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       cred = ap->a_cred;
+       l = curlwp;
+
+       /*
+        * Check for unsettable attributes.
+        */
+       if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+           (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+           (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+           ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+               return (EINVAL);
+       }
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+
+       if (vap->va_flags != VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+                       error = EROFS;
+                       goto out;
+               }
+               if (kauth_cred_geteuid(cred) != ip->i_uid &&
+                   (error = kauth_authorize_generic(cred,
+                   KAUTH_GENERIC_ISSUSER, NULL)))
+                       goto out;
+               if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
+                   NULL) == 0) {
+                       if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) &&
+                           kauth_authorize_system(l->l_cred,
+                            KAUTH_SYSTEM_CHSYSFLAGS, 0, NULL, NULL, NULL)) {
+                               error = EPERM;
+                               goto out;
+                       }
+                       /* Snapshot flag cannot be set or cleared */
+                       if ((vap->va_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
+                           (ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))) {
+                               error = EPERM;
+                               goto out;
+                       }
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (error)
+                               goto out;
+                       ip->i_flags = vap->va_flags;
+                       DIP_ASSIGN(ip, flags, ip->i_flags);
+               } else {
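+			/* Unprivileged: only the user flags may change. */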
+                       if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) ||
+                           (vap->va_flags & UF_SETTABLE) != vap->va_flags) {
+                               error = EPERM;
+                               goto out;
+                       }
+                       if ((ip->i_flags & SF_SETTABLE) !=
+                           (vap->va_flags & SF_SETTABLE)) {
+                               error = EPERM;
+                               goto out;
+                       }
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (error)
+                               goto out;
+                       ip->i_flags &= SF_SETTABLE;
+                       ip->i_flags |= (vap->va_flags & UF_SETTABLE);
+                       DIP_ASSIGN(ip, flags, ip->i_flags);
+               }
+               ip->i_flag |= IN_CHANGE;
+               UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+               UFS_WAPBL_END(vp->v_mount);
+               if (vap->va_flags & (IMMUTABLE | APPEND)) {
+                       error = 0;
+                       goto out;
+               }
+       }
+       if (ip->i_flags & (IMMUTABLE | APPEND)) {
+               error = EPERM;
+               goto out;
+       }
+       /*
+        * Go through the fields and update iff not VNOVAL.
+        */
+       if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+                       error = EROFS;
+                       goto out;
+               }
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error)
+                       goto out;
+               error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l);
+               UFS_WAPBL_END(vp->v_mount);
+               if (error)
+                       goto out;
+       }
+       if (vap->va_size != VNOVAL) {
+               /*
+                * Disallow write attempts on read-only file systems;
+                * unless the file is a socket, fifo, or a block or
+                * character device resident on the file system.
+                */
+               switch (vp->v_type) {
+               case VDIR:
+                       error = EISDIR;
+                       goto out;
+               case VCHR:
+               case VBLK:
+               case VFIFO:
+                       break;
+               case VREG:
+                       if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+                               error = EROFS;
+                               goto out;
+                       }
+                       if ((ip->i_flags & SF_SNAPSHOT) != 0) {
+                               error = EPERM;
+                               goto out;
+                       }
+                       error = UFS_WAPBL_BEGIN(vp->v_mount);
+                       if (error)
+                               goto out;
+                       /*
+                        * When journaling, only truncate one indirect block
+                        * at a time.
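+			 * Each pass frees at most the span mapped by one
+			 * indirect block (`incr' bytes), ending and
+			 * restarting the journal transaction in between so
+			 * that no single transaction grows unbounded.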
+                        */
+                       if (vp->v_mount->mnt_wapbl) {
+                               uint64_t incr = MNINDIR(ip->i_ump) <<
+                                   vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+                               uint64_t base = NDADDR <<
+                                   vp->v_mount->mnt_fs_bshift;
+                               while (!error && ip->i_size > base + incr &&
+                                   ip->i_size > vap->va_size + incr) {
+                                       /*
+                                        * round down to next full indirect
+                                        * block boundary.
+                                        */
+                                       uint64_t nsize = base +
+                                           ((ip->i_size - base - 1) &
+                                           ~(incr - 1));
+                                       error = UFS_TRUNCATE(vp, nsize, 0,
+                                           cred);
+                                       if (error == 0) {
+                                               UFS_WAPBL_END(vp->v_mount);
+                                               error =
+                                                  UFS_WAPBL_BEGIN(vp->v_mount);
+                                       }
+                               }
+                       }
+                       if (!error)
+                               error = UFS_TRUNCATE(vp, vap->va_size, 0, cred);
+                       UFS_WAPBL_END(vp->v_mount);
+                       if (error)
+                               goto out;
+                       break;
+               default:
+                       error = EOPNOTSUPP;
+                       goto out;
+               }
+       }
+       ip = VTOI(vp);
+       if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
+           vap->va_birthtime.tv_sec != VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+                       error = EROFS;
+                       goto out;
+               }
+               if ((ip->i_flags & SF_SNAPSHOT) != 0) {
+                       error = EPERM;
+                       goto out;
+               }
+               error = genfs_can_chtimes(vp, vap->va_vaflags, ip->i_uid, cred);
+               if (error)
+                       goto out;
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error)
+                       goto out;
+               if (vap->va_atime.tv_sec != VNOVAL)
+                       if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
+                               ip->i_flag |= IN_ACCESS;
+               if (vap->va_mtime.tv_sec != VNOVAL) {
+                       ip->i_flag |= IN_CHANGE | IN_UPDATE;
+                       if (vp->v_mount->mnt_flag & MNT_RELATIME)
+                               ip->i_flag |= IN_ACCESS;
+               }
+               if (vap->va_birthtime.tv_sec != VNOVAL &&
+                   ip->i_ump->um_fstype == UFS2) {
+                       ip->i_ffs2_birthtime = vap->va_birthtime.tv_sec;
+                       ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec;
+               }
+               error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0);
+               UFS_WAPBL_END(vp->v_mount);
+               if (error)
+                       goto out;
+       }
+       error = 0;
+       if (vap->va_mode != (mode_t)VNOVAL) {
+               if (vp->v_mount->mnt_flag & MNT_RDONLY) {
+                       error = EROFS;
+                       goto out;
+               }
+               if ((ip->i_flags & SF_SNAPSHOT) != 0 &&
+                   (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP |
+                    S_IXOTH | S_IWOTH))) {
+                       error = EPERM;
+                       goto out;
+               }
+               error = UFS_WAPBL_BEGIN(vp->v_mount);
+               if (error)
+                       goto out;
+               error = ufs_chmod(vp, (int)vap->va_mode, cred, l);
+               UFS_WAPBL_END(vp->v_mount);
+       }
+       VN_KNOTE(vp, NOTE_ATTRIB);
+out:
+       fstrans_done(vp->v_mount);
+       return (error);
+}
+
+/*
+ * Change the mode on a file.
+ * Inode must be locked before calling.
+ */
+static int
+ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
+{
+       struct inode    *ip;
+       int             error;
+
+       UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
+
+       ip = VTOI(vp);
+
+       error = genfs_can_chmod(vp, cred, ip->i_uid, ip->i_gid, mode);
+       if (error)
+               return (error);
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
+       ip->i_mode &= ~ALLPERMS;
+       ip->i_mode |= (mode & ALLPERMS);
+       ip->i_flag |= IN_CHANGE;
+       DIP_ASSIGN(ip, mode, ip->i_mode);
+       UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+       fstrans_done(vp->v_mount);
+       return (0);
+}
+
+/*
+ * Perform chown operation on inode ip;
+ * inode must be locked prior to call.
+ */
+static int
+ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
+       struct lwp *l)
+{
+       struct inode    *ip;
+       int             error = 0;
+#if defined(QUOTA) || defined(QUOTA2)
+       uid_t           ouid;
+       gid_t           ogid;
+       int64_t         change;
+#endif
+       ip = VTOI(vp);
+       error = 0;
+
+       if (uid == (uid_t)VNOVAL)
+               uid = ip->i_uid;
+       if (gid == (gid_t)VNOVAL)
+               gid = ip->i_gid;
+
+       error = genfs_can_chown(vp, cred, ip->i_uid, ip->i_gid, uid, gid);
+       if (error)
+               return (error);
+
+       fstrans_start(vp->v_mount, FSTRANS_SHARED);
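+	/*
+	 * Move the quota charges: release the blocks and inode from the
+	 * old IDs, install the new IDs, then try to charge them; if that
+	 * fails, restore the old IDs and re-charge with FORCE.
+	 */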
+#if defined(QUOTA) || defined(QUOTA2)
+       ogid = ip->i_gid;
+       ouid = ip->i_uid;
+       change = DIP(ip, blocks);
+       (void) chkdq(ip, -change, cred, 0);
+       (void) chkiq(ip, -1, cred, 0);
+#endif
+       ip->i_gid = gid;
+       DIP_ASSIGN(ip, gid, gid);
+       ip->i_uid = uid;
+       DIP_ASSIGN(ip, uid, uid);
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkdq(ip, change, cred, 0)) == 0) {
+               if ((error = chkiq(ip, 1, cred, 0)) == 0)
+                       goto good;
+               else
+                       (void) chkdq(ip, -change, cred, FORCE);
+       }
+       ip->i_gid = ogid;
+       DIP_ASSIGN(ip, gid, ogid);
+       ip->i_uid = ouid;
+       DIP_ASSIGN(ip, uid, ouid);
+       (void) chkdq(ip, change, cred, FORCE);
+       (void) chkiq(ip, 1, cred, FORCE);
+       fstrans_done(vp->v_mount);
+       return (error);
+ good:
+#endif /* QUOTA || QUOTA2 */
+       ip->i_flag |= IN_CHANGE;
+       UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+       fstrans_done(vp->v_mount);
+       return (0);
+}
+
+int
+ufs_remove(void *v)
+{
+       struct vop_remove_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            *a_vp;
+               struct componentname    *a_cnp;
+       } */ *ap = v;
+       struct vnode    *vp, *dvp;
+       struct inode    *ip;
+       int             error;
+       struct ufs_lookup_results *ulr;
+
+       vp = ap->a_vp;
+       dvp = ap->a_dvp;
+       ip = VTOI(vp);
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+       if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) ||
+           (VTOI(dvp)->i_flags & APPEND))
+               error = EPERM;
+       else {
+               error = UFS_WAPBL_BEGIN(dvp->v_mount);
+               if (error == 0) {
+                       error = ufs_dirremove(dvp, ulr,
+                                             ip, ap->a_cnp->cn_flags, 0);
+                       UFS_WAPBL_END(dvp->v_mount);
+               }
+       }
+       VN_KNOTE(vp, NOTE_DELETE);
+       VN_KNOTE(dvp, NOTE_WRITE);
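+	/* If dvp == vp they share a lock; drop only the extra reference. */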
+       if (dvp == vp)
+               vrele(vp);
+       else
+               vput(vp);
+       vput(dvp);
+       fstrans_done(dvp->v_mount);
+       return (error);
+}
+
+/*
+ * ufs_link: create hard link.
+ */
+int
+ufs_link(void *v)
+{
+       struct vop_link_args /* {
+               struct vnode *a_dvp;
+               struct vnode *a_vp;
+               struct componentname *a_cnp;
+       } */ *ap = v;
+       struct vnode *dvp = ap->a_dvp;
+       struct vnode *vp = ap->a_vp;
+       struct componentname *cnp = ap->a_cnp;
+       struct inode *ip;
+       struct direct *newdir;
+       int error;
+       struct ufs_lookup_results *ulr;
+
+       KASSERT(dvp != vp);
+       KASSERT(vp->v_type != VDIR);
+       KASSERT(dvp->v_mount == vp->v_mount);
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+       error = vn_lock(vp, LK_EXCLUSIVE);
+       if (error) {
+               VOP_ABORTOP(dvp, cnp);
+               goto out2;
+       }
+       ip = VTOI(vp);
+       if ((nlink_t)ip->i_nlink >= LINK_MAX) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EMLINK;
+               goto out1;
+       }
+       if (ip->i_flags & (IMMUTABLE | APPEND)) {
+               VOP_ABORTOP(dvp, cnp);
+               error = EPERM;
+               goto out1;
+       }
+       error = UFS_WAPBL_BEGIN(vp->v_mount);
+       if (error) {
+               VOP_ABORTOP(dvp, cnp);
+               goto out1;
+       }
+       ip->i_nlink++;
+       DIP_ASSIGN(ip, nlink, ip->i_nlink);
+       ip->i_flag |= IN_CHANGE;
+       error = UFS_UPDATE(vp, NULL, NULL, UPDATE_DIROP);
+       if (!error) {
+               newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+               ufs_makedirentry(ip, cnp, newdir);
+               error = ufs_direnter(dvp, ulr, vp, newdir, cnp, NULL);
+               pool_cache_put(ufs_direct_cache, newdir);
+       }
+       if (error) {
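+		/* direnter failed; undo the provisional link count bump. */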
+               ip->i_nlink--;
+               DIP_ASSIGN(ip, nlink, ip->i_nlink);
+               ip->i_flag |= IN_CHANGE;
+               UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP);
+       }
+       UFS_WAPBL_END(vp->v_mount);
+ out1:
+       VOP_UNLOCK(vp);
+ out2:
+       VN_KNOTE(vp, NOTE_LINK);
+       VN_KNOTE(dvp, NOTE_WRITE);
+       vput(dvp);
+       fstrans_done(dvp->v_mount);
+       return (error);
+}
+
+/*
+ * whiteout vnode call
+ */
+int
+ufs_whiteout(void *v)
+{
+       struct vop_whiteout_args /* {
+               struct vnode            *a_dvp;
+               struct componentname    *a_cnp;
+               int                     a_flags;
+       } */ *ap = v;
+       struct vnode            *dvp = ap->a_dvp;
+       struct componentname    *cnp = ap->a_cnp;
+       struct direct           *newdir;
+       int                     error;
+       struct ufsmount         *ump = VFSTOUFS(dvp->v_mount);
+       struct ufs_lookup_results *ulr;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+
+       error = 0;
+       switch (ap->a_flags) {
+       case LOOKUP:
+               /* 4.4 format directories support whiteout operations */
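+		/* (um_maxsymlinklen > 0 identifies the 4.4 format) */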
+               if (ump->um_maxsymlinklen > 0)
+                       return (0);
+               return (EOPNOTSUPP);
+
+       case CREATE:
+               /* create a new directory whiteout */
+               fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+               error = UFS_WAPBL_BEGIN(dvp->v_mount);
+               if (error)
+                       break;
+#ifdef DIAGNOSTIC
+               if (ump->um_maxsymlinklen <= 0)
+                       panic("ufs_whiteout: old format filesystem");
+#endif
+
+               newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+               newdir->d_ino = WINO;
+               newdir->d_namlen = cnp->cn_namelen;
+               memcpy(newdir->d_name, cnp->cn_nameptr,
+                   (size_t)cnp->cn_namelen);
+               newdir->d_name[cnp->cn_namelen] = '\0';
+               newdir->d_type = DT_WHT;
+               error = ufs_direnter(dvp, ulr, NULL, newdir, cnp, NULL);
+               pool_cache_put(ufs_direct_cache, newdir);
+               break;
+
+       case DELETE:
+               /* remove an existing directory whiteout */
+               fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+               error = UFS_WAPBL_BEGIN(dvp->v_mount);
+               if (error)
+                       break;
+#ifdef DIAGNOSTIC
+               if (ump->um_maxsymlinklen <= 0)
+                       panic("ufs_whiteout: old format filesystem");
+#endif
+
+               cnp->cn_flags &= ~DOWHITEOUT;
+               error = ufs_dirremove(dvp, ulr, NULL, cnp->cn_flags, 0);
+               break;
+       default:
+               panic("ufs_whiteout: unknown op");
+               /* NOTREACHED */
+       }
+       UFS_WAPBL_END(dvp->v_mount);
+       fstrans_done(dvp->v_mount);
+       return (error);
+}
+
+/*
+ * Rename vnode operation
+ *     rename("foo", "bar");
+ * is essentially
+ *     unlink("bar");
+ *     link("foo", "bar");
+ *     unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensures the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ */
+
+/*
+ * Notes on rename locking:
+ *
+ * We lock parent vnodes before child vnodes. This means in particular
+ * that if A is above B in the directory tree then A must be locked
+ * before B. (This is true regardless of how many steps appear in
+ * between, because an arbitrary number of other processes could lock
+ * parent/child in between and establish a lock cycle and deadlock.)
+ *
+ * Therefore, if tdvp is above fdvp we must lock tdvp first; if fdvp
+ * is above tdvp we must lock fdvp first; and if they're
+ * incommensurate it doesn't matter. (But, we rely on the fact that
+ * there's a whole-volume rename lock to prevent deadlock among groups
+ * of renames upon overlapping sets of incommensurate vnodes.)
+ *
+ * In addition to establishing lock ordering the parent check also
+ * serves to rule out cases where someone tries to move a directory
+ * underneath itself, e.g. rename("a/b", "a/b/c"). If allowed to
+ * proceed such renames would detach portions of the directory tree
+ * and make fsck very unhappy.
+ *
+ * Note that it is an error for *fvp* to be above tdvp; however,
+ * *fdvp* can be above tdvp, as in rename("a/b", "a/c/d").
+ *
+ * The parent check searches up the tree from tdvp until it either
+ * finds fdvp or the root of the volume. It also returns the vnode it
+ * saw immediately before fdvp, if any. Later on (after looking up
+ * fvp) we will check to see if this *is* fvp and if so fail.
+ *
+ * If the parent check finds fdvp, it means fdvp is above tdvp, so we
+ * lock fdvp first and then tdvp. Otherwise, either tdvp is above fdvp
+ * or they're incommensurate and we lock tdvp first.
+ *
+ * In either case each of the child vnodes has to be looked up and
+ * locked immediately after its parent. The cases
+ *
+ *       fdvp/fvp/[.../]tdvp/tvp
+ *       tdvp/tvp/[.../]fdvp/fvp
+ *
+ * can cause deadlock otherwise. Note that both of these are error
+ * cases; the first fails the parent check and the second fails
+ * because tvp isn't empty. The parent check case is handled before
+ * we start locking; however, the nonempty case requires locking tvp
+ * to find out safely that it's nonempty.
+ *
+ * Therefore the procedure is either
+ *
+ *   lock fdvp
+ *   lookup fvp
+ *   lock fvp
+ *   lock tdvp
+ *   lookup tvp
+ *   lock tvp
+ *
+ * or
+ *
+ *   lock tdvp
+ *   lookup tvp
+ *   lock tvp
+ *   lock fdvp
+ *   lookup fvp
+ *   lock fvp
+ *
+ * This could in principle be simplified by always looking up fvp
+ * last; because of the parent check we know by the time we start
+ * locking that fvp cannot be directly above tdvp, so (given the
+ * whole-volume rename lock and other assumptions) it's safe to lock
+ * tdvp before fvp. This would allow the following scheme:
+ *
+ *   lock fdvp
+ *   lock tdvp
+ * or
+ *   lock tdvp
+ *   lock fdvp
+ *
+ * then
+ *   lookup tvp
+ *   lock tvp
+ *   lookup fvp
+ *   check if fvp is above tdvp; fail if so
+ *   lock fvp
+ *
+ * which is much, much simpler.
+ *
+ * However, current levels of vfs namei/lookup sanity do not permit
+ * this. It is impossible currently to look up fvp without locking it.
+ * (It gets locked regardless of whether LOCKLEAF is set; without
+ * LOCKLEAF it just gets unlocked again, which doesn't help.)
+ *
+ * Therefore, because we must look up fvp to know if it's above tdvp,
+ * which locks fvp, we must, at least in the case where fdvp is above
+ * tdvp, do that before locking tdvp. The longer scheme does that; the
+ * simpler scheme is not safe.
+ *
+ * Note that for now we aren't doing lookup() but relookup(); however,
+ * the differences are minor.
+ *
+ * On top of all the above, just to make everything more
+ * exciting, any two of the vnodes might end up being the same.
+ *
+ * FROMPARENT == FROMCHILD     mv a/. foo      is an error.
+ * FROMPARENT == TOPARENT      mv a/b a/c      is ok.
+ * FROMPARENT == TOCHILD       mv a/b/c a/b    will give ENOTEMPTY.
+ * FROMCHILD == TOPARENT       mv a/b a/b/c    fails the parent check.
+ * FROMCHILD == TOCHILD                mv a/b a/b      is ok.
+ * TOPARENT == TOCHILD         mv foo a/.      is an error.
+ *
+ * This introduces more cases in the locking, because each distinct
+ * vnode must be locked exactly once.
+ *
+ * When FROMPARENT == TOPARENT and FROMCHILD != TOCHILD we assume it
+ * doesn't matter what order the children are locked in, because the
+ * per-volume rename lock excludes other renames and no other
+ * operation locks two files in the same directory at once. (Note: if
+ * it turns out that link() does, link() is wrong.)
+ *
+ * Until such time as we can do lookups without the namei and lookup
+ * machinery "helpfully" locking the result vnode for us, we can't
+ * avoid tripping on cases where FROMCHILD == TOCHILD. Currently for
+ * non-directories we unlock the first one we lock while looking up
+ * the second, then relock it if necessary. This is more or less
+ * harmless since not much of interest can happen to the objects in
+ * that window while we have the containing directory locked; but it's
+ * not desirable and should be cleaned up when that becomes possible.
+ * The right way to do it is to check after looking the second one up
+ * and only lock it if it's different. (Note: for directories we don't
+ * do this dance because the same directory can't appear more than
+ * once.)
+ */
+
+/* XXX following lifted from ufs_lookup.c */
+#define        FSFMT(vp)       (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * Check if either entry referred to by FROM_ULR is within the range
+ * of entries named by TO_ULR.
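+ *
+ * Overlap means the compaction done while inserting the TO entry may
+ * have moved the FROM entry or its predecessor, so the caller must
+ * re-derive the FROM offsets afterwards (see the fixup in ufs_rename).
+ * For example (illustrative numbers): FROM {offset 512, count 12}
+ * gives entries at 500 and 512; TO {offset 500, count 24} names the
+ * range [500, 524), which contains both, so this returns 1.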
+ */
+static int
+ulr_overlap(const struct ufs_lookup_results *from_ulr,
+           const struct ufs_lookup_results *to_ulr)
+{
+       doff_t from_start, from_prevstart;
+       doff_t to_start, to_end;
+
+       /*
+        * FROM is a DELETE result; offset points to the entry to
+        * remove and subtracting count gives the previous entry.
+        */
+       from_start = from_ulr->ulr_offset - from_ulr->ulr_count;
+       from_prevstart = from_ulr->ulr_offset;
+
+       /*
+        * TO is a RENAME (thus non-DELETE) result; offset points
+        * to the beginning of a region to write in, and adding
+        * count gives the end of the region.
+        */
+       to_start = to_ulr->ulr_offset;
+       to_end = to_ulr->ulr_offset + to_ulr->ulr_count;
+
+       if (from_prevstart >= to_start && from_prevstart < to_end) {
+               return 1;
+       }
+       if (from_start >= to_start && from_start < to_end) {
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Wrapper for relookup that also updates the supplemental results.
+ */
+static int
+do_relookup(struct vnode *dvp, struct ufs_lookup_results *ulr,
+           struct vnode **vp, struct componentname *cnp)
+{
+       int error;
+
+       error = relookup(dvp, vp, cnp, 0);
+       if (error) {
+               return error;
+       }
+	/* update the supplemental results */
+       *ulr = VTOI(dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(dvp));
+       return 0;
+}
+
+/*
+ * Lock and relookup a sequence of two directories and two children.
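+ *
+ * d1 and its child are locked first, then d2 and its child.
+ * v1_missing_ok/v2_missing_ok say whether a vanished child is
+ * tolerated (otherwise ENOENT), and overlap_error is returned when
+ * the first child turns out to be d2 itself.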
+ */
+static int
+lock_vnode_sequence(struct vnode *d1, struct ufs_lookup_results *ulr1,
+                   struct vnode **v1_ret, struct componentname *cn1, 
+                   int v1_missing_ok,
+                   int overlap_error,
+                   struct vnode *d2, struct ufs_lookup_results *ulr2,
+                   struct vnode **v2_ret, struct componentname *cn2, 
+                   int v2_missing_ok)
+{
+       struct vnode *v1, *v2;
+       int error;
+
+       KASSERT(d1 != d2);
+
+       vn_lock(d1, LK_EXCLUSIVE | LK_RETRY);
+       if (VTOI(d1)->i_size == 0) {
+               /* d1 has been rmdir'd */
+               VOP_UNLOCK(d1);
+               return ENOENT;
+       }
+       error = do_relookup(d1, ulr1, &v1, cn1);
+       if (v1_missing_ok) {
+               if (error == ENOENT) {
+                       /*
+                        * Note: currently if the name doesn't exist,
+                        * relookup succeeds (it intercepts the
+                        * EJUSTRETURN from VOP_LOOKUP) and sets tvp
+                        * to NULL. Therefore, we will never get
+                        * ENOENT and this branch is not needed.
+                        * However, in a saner future the EJUSTRETURN
+                        * garbage will go away, so let's DTRT.
+                        */
+                       v1 = NULL;
+                       error = 0;
+               }
+       } else {
+               if (error == 0 && v1 == NULL) {
+                       /* This is what relookup sets if v1 disappeared. */
+                       error = ENOENT;
+               }
+       }
+       if (error) {
+               VOP_UNLOCK(d1);
+               return error;
+       }
+       if (v1 && v1 == d2) {
+               VOP_UNLOCK(d1);
+               VOP_UNLOCK(v1);
+               vrele(v1);
+               return overlap_error;
+       }
+
+       /*
+        * The right way to do this is to do lookups without locking
+        * the results, and lock the results afterwards; then at the
+        * end we can avoid trying to lock v2 if v2 == v1.
+        *
+        * However, for the reasons described in the fdvp == tdvp case
+        * in rename below, we can't do that safely. So, in the case
+        * where v1 is not a directory, unlock it and lock it again
+        * afterwards. This is safe in locking order because a
+        * non-directory can't be above anything else in the tree. If
+        * v1 *is* a directory, that's not true, but then because d1
+        * != d2, v1 != v2.
+        */
+       if (v1 && v1->v_type != VDIR) {
+               VOP_UNLOCK(v1);
+       }
+       vn_lock(d2, LK_EXCLUSIVE | LK_RETRY);
+       if (VTOI(d2)->i_size == 0) {
+               /* d2 has been rmdir'd */
+               VOP_UNLOCK(d2);
+               if (v1 && v1->v_type == VDIR) {
+                       VOP_UNLOCK(v1);
+               }
+               VOP_UNLOCK(d1);
+               if (v1) {
+                       vrele(v1);
+               }
+               return ENOENT;
+       }
+       error = do_relookup(d2, ulr2, &v2, cn2);
+       if (v2_missing_ok) {
+               if (error == ENOENT) {
+                       /* as above */
+                       v2 = NULL;
+                       error = 0;
+               }
+       } else {
+               if (error == 0 && v2 == NULL) {
+                       /* This is what relookup sets if v2 disappeared. */
+                       error = ENOENT;
+               }
+       }
+       if (error) {
+               VOP_UNLOCK(d2);
+               if (v1 && v1->v_type == VDIR) {
+                       VOP_UNLOCK(v1);
+               }
+               VOP_UNLOCK(d1);
+               if (v1) {
+                       vrele(v1);
+               }
+               return error;
+       }
+       if (v1 && v1->v_type != VDIR && v1 != v2) {
+               vn_lock(v1, LK_EXCLUSIVE | LK_RETRY);
+       }
+       *v1_ret = v1;
+       *v2_ret = v2;
+       return 0;
+}
+
+int
+ufs_rename(void *v)
+{
+       struct vop_rename_args  /* {
+               struct vnode            *a_fdvp;
+               struct vnode            *a_fvp;
+               struct componentname    *a_fcnp;
+               struct vnode            *a_tdvp;
+               struct vnode            *a_tvp;
+               struct componentname    *a_tcnp;
+       } */ *ap = v;
+       struct vnode            *tvp, *tdvp, *fvp, *fdvp;
+       struct componentname    *tcnp, *fcnp;
+       struct inode            *ip, *txp, *fxp, *tdp, *fdp;
+       struct mount            *mp;
+       struct direct           *newdir;
+       int                     doingdirectory, error;
+       ino_t                   oldparent, newparent;
+
+       struct ufs_lookup_results from_ulr, to_ulr;
+
+       tvp = ap->a_tvp;
+       tdvp = ap->a_tdvp;
+       fvp = ap->a_fvp;
+       fdvp = ap->a_fdvp;
+       tcnp = ap->a_tcnp;
+       fcnp = ap->a_fcnp;
+       doingdirectory = error = 0;
+       oldparent = newparent = 0;
+
+       /* save the supplemental lookup results as they currently exist */
+       from_ulr = VTOI(fdvp)->i_crap;
+       to_ulr = VTOI(tdvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(fdvp));
+       UFS_CHECK_CRAPCOUNTER(VTOI(tdvp));
+
+       /*
+        * Owing to VFS oddities we are currently called with tdvp/tvp
+        * locked and not fdvp/fvp. In a sane world we'd be passed
+        * tdvp and fdvp only, unlocked, and two name strings. Pretend
+        * we have a sane world and unlock tdvp and tvp.
+        */
+       VOP_UNLOCK(tdvp);
+       if (tvp && tvp != tdvp) {
+               VOP_UNLOCK(tvp);
+       }
+
+       /* Also pretend we have a sane world and vrele fvp/tvp. */
+       vrele(fvp);
+       fvp = NULL;
+       if (tvp) {
+               vrele(tvp);
+               tvp = NULL;
+       }
+
+       /*
+        * Check for cross-device rename.
+        */
+       if (fdvp->v_mount != tdvp->v_mount) {
+               error = EXDEV;
+               goto abort;
+       }
+
+       /*
+        * Reject "." and ".."
+        */
+       if ((fcnp->cn_flags & ISDOTDOT) || (tcnp->cn_flags & ISDOTDOT) ||
+           (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+           (tcnp->cn_namelen == 1 && tcnp->cn_nameptr[0] == '.')) {
+               error = EINVAL;
+               goto abort;
+       }
+
+       /*
+        * Get locks.
+        */
+
+       /* paranoia */
+       fcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+       tcnp->cn_flags |= LOCKPARENT|LOCKLEAF;
+
+       if (fdvp == tdvp) {
+               /* One directory. Lock it and relookup both children. */
+               vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+
+               if (VTOI(fdvp)->i_size == 0) {
+                       /* directory has been rmdir'd */
+                       VOP_UNLOCK(fdvp);
+                       error = ENOENT;
+                       goto abort;
+               }
+
+               error = do_relookup(fdvp, &from_ulr, &fvp, fcnp);
+               if (error == 0 && fvp == NULL) {
+                       /* relookup may produce this if fvp disappears */
+                       error = ENOENT;
+               }
+               if (error) {
+                       VOP_UNLOCK(fdvp);
+                       goto abort;
+               }
+
+               /*
+                * The right way to do this is to look up both children
+                * without locking either, and then lock both unless they
+                * turn out to be the same. However, due to deep-seated
+                * VFS-level issues all lookups lock the child regardless
+                * of whether LOCKLEAF is set (if LOCKLEAF is not set,
+                * the child is locked during lookup and then unlocked)
+                * so it is not safe to look up tvp while fvp is locked.
+                *
+                * Unlocking fvp here temporarily is more or less safe,
+                * because with the directory locked there's not much
+                * that can happen to it. However, ideally it wouldn't
+                * be necessary. XXX.
+                */
+               VOP_UNLOCK(fvp);
+               /* remember fdvp == tdvp so tdvp is locked */
+               error = do_relookup(tdvp, &to_ulr, &tvp, tcnp);
+               if (error && error != ENOENT) {
+                       VOP_UNLOCK(fdvp);
+                       goto abort;
+               }
+               if (error == ENOENT) {
+                       /*
+                        * Note: currently if the name doesn't exist,
+                        * relookup succeeds (it intercepts the
+                        * EJUSTRETURN from VOP_LOOKUP) and sets tvp
+                        * to NULL. Therefore, we will never get
+                        * ENOENT and this branch is not needed.
+                        * However, in a saner future the EJUSTRETURN
+                        * garbage will go away, so let's DTRT.
+                        */
+                       tvp = NULL;
+               }
+
+               /* tvp is locked; lock fvp if necessary */
+               if (!tvp || tvp != fvp) {
+                       vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
+               }
+       } else {
+               int found_fdvp;
+               struct vnode *illegal_fvp;
+
+               /*
+                * The source must not be above the destination. (If
+                * it were, the rename would detach a section of the
+                * tree.)
+                *
+                * Look up the tree from tdvp to see if we find fdvp,
+                * and if so, return the immediate child of fdvp we're
+                * under; that must not turn out to be the same as
+                * fvp.
+                *
+                * The per-volume rename lock guarantees that the
+                * result of this check remains true until we finish
+                * looking up and locking.
+                */
+               error = ufs_parentcheck(fdvp, tdvp, fcnp->cn_cred,
+                                       &found_fdvp, &illegal_fvp);
+               if (error) {
+                       goto abort;
+               }
+
+               /* Must lock in tree order. */
+
+               if (found_fdvp) {
+                       /* fdvp -> fvp -> tdvp -> tvp */
+                       error = lock_vnode_sequence(fdvp, &from_ulr,
+                                                   &fvp, fcnp, 0,
+                                                   EINVAL,
+                                                   tdvp, &to_ulr,
+                                                   &tvp, tcnp, 1);
+               } else {
+                       /* tdvp -> tvp -> fdvp -> fvp */
+                       error = lock_vnode_sequence(tdvp, &to_ulr,
+                                                   &tvp, tcnp, 1,
+                                                   ENOTEMPTY,
+                                                   fdvp, &from_ulr,
+                                                   &fvp, fcnp, 0);
+               }
+               if (error) {
+                       if (illegal_fvp) {
+                               vrele(illegal_fvp);
+                       }
+                       goto abort;
+               }
+               KASSERT(fvp != NULL);
+
+               if (illegal_fvp && fvp == illegal_fvp) {
+                       vrele(illegal_fvp);
+                       error = EINVAL;
+                       goto abort_withlocks;
+               }
+
+               if (illegal_fvp) {
+                       vrele(illegal_fvp);
+               }
+       }
+
+       KASSERT(fdvp && VOP_ISLOCKED(fdvp));
+       KASSERT(fvp && VOP_ISLOCKED(fvp));
+       KASSERT(tdvp && VOP_ISLOCKED(tdvp));
+       KASSERT(tvp == NULL || VOP_ISLOCKED(tvp));
+
+       /* --- everything is now locked --- */
+
+       if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+           (VTOI(tdvp)->i_flags & APPEND))) {
+               error = EPERM;
+               goto abort_withlocks;
+       }
+
+       /*
+        * Check if just deleting a link name.
+        */
+       if (fvp == tvp) {
+               if (fvp->v_type == VDIR) {
+                       error = EINVAL;
+                       goto abort_withlocks;
+               }
+
+               /* Release destination completely. Leave fdvp locked. */
+               VOP_ABORTOP(tdvp, tcnp);
+               if (fdvp != tdvp) {
+                       VOP_UNLOCK(tdvp);
+               }
+               VOP_UNLOCK(tvp);
+               vrele(tdvp);
+               vrele(tvp);
+
+               /* Delete source. */
+               /* XXX: do we really need to relookup again? */
+
+               /*
+                * fdvp is still locked, but we just unlocked fvp
+                * (because fvp == tvp) so just decref fvp
+                */
+               vrele(fvp);
+               fcnp->cn_flags &= ~(MODMASK);
+               fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+               fcnp->cn_nameiop = DELETE;
+               if ((error = relookup(fdvp, &fvp, fcnp, 0))) {
+                       vput(fdvp);
+                       return (error);
+               }
+               return (VOP_REMOVE(fdvp, fvp, fcnp));
+       }
+       fdp = VTOI(fdvp);
+       ip = VTOI(fvp);
+       if ((nlink_t) ip->i_nlink >= LINK_MAX) {
+               error = EMLINK;
+               goto abort_withlocks;
+       }
+       if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
+               (fdp->i_flags & APPEND)) {
+               error = EPERM;
+               goto abort_withlocks;
+       }
+       if ((ip->i_mode & IFMT) == IFDIR) {
+               /*
+                * Avoid ".", "..", and aliases of "." for obvious reasons.
+                */
+               if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+                   fdp == ip ||
+                   (fcnp->cn_flags & ISDOTDOT) ||
+                   (tcnp->cn_flags & ISDOTDOT) ||
+                   (ip->i_flag & IN_RENAME)) {
+                       error = EINVAL;
+                       goto abort_withlocks;
+               }
+               ip->i_flag |= IN_RENAME;
+               doingdirectory = 1;
+       }
+       oldparent = fdp->i_number;
+       VN_KNOTE(fdvp, NOTE_WRITE);             /* XXXLUKEM/XXX: right place? */
+
+       /*
+        * Both the directory
+        * and target vnodes are locked.
+        */
+       tdp = VTOI(tdvp);
+       txp = NULL;
+       if (tvp)
+               txp = VTOI(tvp);
+
+       mp = fdvp->v_mount;
+       fstrans_start(mp, FSTRANS_SHARED);
+
+       if (oldparent != tdp->i_number)
+               newparent = tdp->i_number;
+
+       /*
+        * If ".." must be changed (ie the directory gets a new
+        * parent) the user must have write permission in the source
+        * so as to be able to change "..".
+        */
+       if (doingdirectory && newparent) {
+               error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+               if (error)
+                       goto out;
+       }
+
+       KASSERT(fdvp != tvp);
+
+       if (newparent) {
+               /* Check for the rename("foo/foo", "foo") case. */
+               if (fdvp == tvp) {
+                       error = doingdirectory ? ENOTEMPTY : EISDIR;
+                       goto out;
+               }
+       }
+
+       fxp = VTOI(fvp);
+       fdp = VTOI(fdvp);
+
+       error = UFS_WAPBL_BEGIN(fdvp->v_mount);
+       if (error)
+               goto out2;
+
+       /*
+        * 1) Bump link count while we're moving stuff
+        *    around.  If we crash somewhere before
+        *    completing our work, the link count
+        *    may be wrong, but correctable.
+        */
+       ip->i_nlink++;
+       DIP_ASSIGN(ip, nlink, ip->i_nlink);
+       ip->i_flag |= IN_CHANGE;
+       if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) {
+               goto bad;
+       }
+
+       /*
+        * 2) If target doesn't exist, link the target
+        *    to the source and unlink the source.
+        *    Otherwise, rewrite the target directory
+        *    entry to reference the source inode and
+        *    expunge the original entry's existence.
+        */
+       if (txp == NULL) {
+               if (tdp->i_dev != ip->i_dev)
+                       panic("rename: EXDEV");
+               /*
+                * Account for ".." in new directory.
+                * When source and destination have the same
+                * parent we don't fool with the link count.
+                */
+               if (doingdirectory && newparent) {
+                       if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
+                               error = EMLINK;
+                               goto bad;
+                       }
+                       tdp->i_nlink++;
+                       DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                       tdp->i_flag |= IN_CHANGE;
+                       if ((error = UFS_UPDATE(tdvp, NULL, NULL,
+                           UPDATE_DIROP)) != 0) {
+                               tdp->i_nlink--;
+                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                               tdp->i_flag |= IN_CHANGE;
+                               goto bad;
+                       }
+               }
+               newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+               ufs_makedirentry(ip, tcnp, newdir);
+               error = ufs_direnter(tdvp, &to_ulr,
+                                    NULL, newdir, tcnp, NULL);
+               pool_cache_put(ufs_direct_cache, newdir);
+               if (error != 0) {
+                       if (doingdirectory && newparent) {
+                               tdp->i_nlink--;
+                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                               tdp->i_flag |= IN_CHANGE;
+                               (void)UFS_UPDATE(tdvp, NULL, NULL,
+                                                UPDATE_WAIT | UPDATE_DIROP);
+                       }
+                       goto bad;
+               }
+               VN_KNOTE(tdvp, NOTE_WRITE);
+       } else {
+               if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev)
+                       panic("rename: EXDEV");
+               /*
+                * Short circuit rename(foo, foo).
+                */
+               if (txp->i_number == ip->i_number)
+                       panic("rename: same file");
+               /*
+                * If the parent directory is "sticky", then the user must
+                * own the parent directory, or the destination of the rename,
+                * otherwise the destination may not be changed (except by
+                * root). This implements append-only directories.
+                */
+               if ((tdp->i_mode & S_ISTXT) &&
+                   kauth_authorize_generic(tcnp->cn_cred,
+                    KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
+                   kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid &&
+                   txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+                       error = EPERM;
+                       goto bad;
+               }
+               /*
+                * Target must be empty if a directory and have no links
+                * to it. Also, ensure source and target are compatible
+                * (both directories, or both not directories).
+                */
+               if ((txp->i_mode & IFMT) == IFDIR) {
+                       if (txp->i_nlink > 2 ||
+                           !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) {
+                               error = ENOTEMPTY;
+                               goto bad;
+                       }
+                       if (!doingdirectory) {
+                               error = ENOTDIR;
+                               goto bad;
+                       }
+                       cache_purge(tdvp);
+               } else if (doingdirectory) {
+                       error = EISDIR;
+                       goto bad;
+               }
+               if ((error = ufs_dirrewrite(tdp, to_ulr.ulr_offset,
+                   txp, ip->i_number,
+                   IFTODT(ip->i_mode), doingdirectory && newparent ?
+                   newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0)
+                       goto bad;
+               if (doingdirectory) {
+                       /*
+                        * Truncate inode. The only stuff left in the directory
+                        * is "." and "..". The "." reference is inconsequential
+                        * since we are quashing it. We have removed the "."
+                        * reference and the reference in the parent directory,
+                        * but there may be other hard links.
+                        */
+                       if (!newparent) {
+                               tdp->i_nlink--;
+                               DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+                               tdp->i_flag |= IN_CHANGE;
+                               UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
+                       }
+                       txp->i_nlink--;
+                       DIP_ASSIGN(txp, nlink, txp->i_nlink);
+                       txp->i_flag |= IN_CHANGE;
+                       if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
+                           tcnp->cn_cred)))
+                               goto bad;
+               }
+               VN_KNOTE(tdvp, NOTE_WRITE);
+               VN_KNOTE(tvp, NOTE_DELETE);
+       }
+
+       /*
+        * Handle the case where the directory entry we need to remove,
+        * which is/was at from_ulr.ulr_offset, or the one before it,
+        * which is/was at from_ulr.ulr_offset - from_ulr.ulr_count,
+        * may have been moved when the directory insertion above
+        * performed compaction.
+        */
+       if (tdp->i_number == fdp->i_number &&
+           ulr_overlap(&from_ulr, &to_ulr)) {
+
+               struct buf *bp;
+               struct direct *ep;
+               struct ufsmount *ump = fdp->i_ump;
+               doff_t curpos;
+               doff_t endsearch;       /* offset to end directory search */
+               uint32_t prev_reclen;
+               int dirblksiz = ump->um_dirblksiz;
+               const int needswap = UFS_MPNEEDSWAP(ump);
+               u_long bmask;
+               int namlen, entryoffsetinblock;
+               char *dirbuf;
+
+               bmask = fdvp->v_mount->mnt_stat.f_iosize - 1;
+
+               /*
+                * The fcnp entry will be somewhere between the start of
+                * compaction (to_ulr.ulr_offset) and the original location
+                * (from_ulr.ulr_offset).
+                */
+               curpos = to_ulr.ulr_offset;
+               endsearch = from_ulr.ulr_offset + from_ulr.ulr_reclen;
+               entryoffsetinblock = 0;
+
+               /*
+                * Get the directory block containing the start of
+                * compaction.
+                */
+               error = ufs_blkatoff(fdvp, (off_t)to_ulr.ulr_offset, &dirbuf,
+                   &bp, false);
+               if (error)
+                       goto bad;
+
+               /*
+                * Keep existing ulr_count (length of previous record)
+                * for the case where compaction did not include the
+                * previous entry but started at the from-entry.
+                */
+               prev_reclen = from_ulr.ulr_count;
+
+               while (curpos < endsearch) {
+                       uint32_t reclen;
+
+                       /*
+                        * If necessary, get the next directory block.
+                        *
+                        * dholland 7/13/11 to the best of my understanding
+                        * this should never happen; compaction occurs only
+                        * within single blocks. I think.
+                        */
+                       if ((curpos & bmask) == 0) {
+                               if (bp != NULL)
+                                       brelse(bp, 0);
+                               error = ufs_blkatoff(fdvp, (off_t)curpos,
+                                   &dirbuf, &bp, false);
+                               if (error)
+                                       goto bad;
+                               entryoffsetinblock = 0;
+                       }
+
+                       KASSERT(bp != NULL);
+                       ep = (struct direct *)(dirbuf + entryoffsetinblock);
+                       reclen = ufs_rw16(ep->d_reclen, needswap);
+
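+                       /*
+                        * Old-format (FSFMT) directories have no d_type
+                        * field; the 16-bit name length occupies both
+                        * bytes, and its significant byte lands in the
+                        * d_type slot whenever the on-disk data is
+                        * little-endian, hence the two cases below.
+                        */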
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+                       if (FSFMT(fdvp) && needswap == 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+#else
+                       if (FSFMT(fdvp) && needswap != 0)
+                               namlen = ep->d_type;
+                       else
+                               namlen = ep->d_namlen;
+#endif
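+                       /*
+                        * Skip unused and whiteout (WINO) slots; a name
+                        * match relocates the from-entry and records the
+                        * length of the record in front of it.
+                        */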
+                       if ((ep->d_ino != 0) &&
+                           (ufs_rw32(ep->d_ino, needswap) != WINO) &&
+                           (namlen == fcnp->cn_namelen) &&
+                           memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) {
+                               from_ulr.ulr_reclen = reclen;
+                               break;
+                       }
+                       curpos += reclen;
+                       entryoffsetinblock += reclen;
+                       prev_reclen = reclen;
+               }
+
+               from_ulr.ulr_offset = curpos;
+               from_ulr.ulr_count = prev_reclen;
+
+               KASSERT(curpos <= endsearch);
+
+               /*
+                * If ulr_offset points to start of a directory block,
+                * clear ulr_count so ufs_dirremove() doesn't try to
+                * merge free space over a directory block boundary.
+                */
+               if ((from_ulr.ulr_offset & (dirblksiz - 1)) == 0)
+                       from_ulr.ulr_count = 0;
+
+               brelse(bp, 0);
+       }
+
+       /*
+        * 3) Unlink the source.
+        */
+
+#if 0
+       /*
+        * Ensure that the directory entry still exists and has not
+        * changed while the new name has been entered. If the source is
+        * a file then the entry may have been unlinked or renamed. In
+        * either case there is no further work to be done. If the source
+        * is a directory then it cannot have been rmdir'ed; the IN_RENAME
+        * flag ensures that it cannot be moved by another rename or removed
+        * by a rmdir.
+        */
+#endif
+       KASSERT(fxp == ip);
+
+       /*
+        * If the source is a directory with a new parent, the link
+        * count of the old parent directory must be decremented and
+        * ".." set to point to the new parent.
+        */
+       if (doingdirectory && newparent) {
+               KASSERT(fdp != NULL);
+               ufs_dirrewrite(fxp, mastertemplate.dot_reclen,
+                              fdp, newparent, DT_DIR, 0, IN_CHANGE);
+               cache_purge(fdvp);
+       }
+       error = ufs_dirremove(fdvp, &from_ulr,
+                             fxp, fcnp->cn_flags, 0);
+       fxp->i_flag &= ~IN_RENAME;
+
+       VN_KNOTE(fvp, NOTE_RENAME);
+       goto done;
+
+ out:
+       goto out2;
+
+       /* exit routines from steps 1 & 2 */
+ bad:
+       if (doingdirectory)
+               ip->i_flag &= ~IN_RENAME;
+       ip->i_nlink--;
+       DIP_ASSIGN(ip, nlink, ip->i_nlink);
+       ip->i_flag |= IN_CHANGE;
+       ip->i_flag &= ~IN_RENAME;
+       UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
+ done:
+       UFS_WAPBL_END(fdvp->v_mount);
+ out2:
+       /*
+        * Clear IN_RENAME: some exit paths happen too early to go
+        * through the cleanup done in the "bad" case above, so we
+        * always do this mini-cleanup here.
+        */
+       ip->i_flag &= ~IN_RENAME;
+
+       VOP_UNLOCK(fdvp);
+       if (tdvp != fdvp) {
+               VOP_UNLOCK(tdvp);
+       }
+       VOP_UNLOCK(fvp);
+       if (tvp && tvp != fvp) {
+               VOP_UNLOCK(tvp);
+       }
+
+       vrele(fdvp);
+       vrele(tdvp);
+       vrele(fvp);
+       if (tvp) {
+               vrele(tvp);
+       }
+
+       fstrans_done(mp);
+       return (error);
+
+ abort_withlocks:
+       VOP_UNLOCK(fdvp);
+       if (tdvp != fdvp) {
+               VOP_UNLOCK(tdvp);
+       }
+       VOP_UNLOCK(fvp);
+       if (tvp && tvp != fvp) {
+               VOP_UNLOCK(tvp);
+       }
+
+ abort:
+       VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+       VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+       vrele(tdvp);
+       if (tvp) {
+               vrele(tvp);
+       }
+       vrele(fdvp);
+       if (fvp) {
+               vrele(fvp);
+       }
+       return (error);
+}
+
+int
+ufs_mkdir(void *v)
+{
+       struct vop_mkdir_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            **a_vpp;
+               struct componentname    *a_cnp;
+               struct vattr            *a_vap;
+       } */ *ap = v;
+       struct vnode            *dvp = ap->a_dvp, *tvp;
+       struct vattr            *vap = ap->a_vap;
+       struct componentname    *cnp = ap->a_cnp;
+       struct inode            *ip, *dp = VTOI(dvp);
+       struct buf              *bp;
+       struct dirtemplate      dirtemplate;
+       struct direct           *newdir;
+       int                     error, dmode;
+       struct ufsmount         *ump = dp->i_ump;
+       int                     dirblksiz = ump->um_dirblksiz;
+       struct ufs_lookup_results *ulr;
+
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+
+       if ((nlink_t)dp->i_nlink >= LINK_MAX) {
+               error = EMLINK;
+               goto out;
+       }
+       dmode = vap->va_mode & ACCESSPERMS;
+       dmode |= IFDIR;
+       /*
+        * Must simulate part of ufs_makeinode here to acquire the inode,
+        * but not have it entered in the parent directory. The entry is
+        * made later after writing "." and ".." entries.
+        */
+       if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0)
+               goto out;
+
+       tvp = *ap->a_vpp;
+       ip = VTOI(tvp);
+
+       error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount);
+       if (error) {
+               UFS_VFREE(tvp, ip->i_number, dmode);
+               vput(tvp);
+               goto out;
+       }
+       ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+       DIP_ASSIGN(ip, uid, ip->i_uid);
+       ip->i_gid = dp->i_gid;
+       DIP_ASSIGN(ip, gid, ip->i_gid);
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+               UFS_VFREE(tvp, ip->i_number, dmode);
+               UFS_WAPBL_END(dvp->v_mount);
+               fstrans_done(dvp->v_mount);
+               vput(tvp);
+               vput(dvp);
+               return (error);
+       }
+#endif
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       ip->i_mode = dmode;
+       DIP_ASSIGN(ip, mode, dmode);
+       tvp->v_type = VDIR;     /* Rest init'd in getnewvnode(). */
+       ip->i_nlink = 2;
+       DIP_ASSIGN(ip, nlink, 2);
+       if (cnp->cn_flags & ISWHITEOUT) {
+               ip->i_flags |= UF_OPAQUE;
+               DIP_ASSIGN(ip, flags, ip->i_flags);
+       }
+
+       /*
+        * Bump link count in parent directory to reflect work done below.
+        * Should be done before reference is created so cleanup is
+        * possible if we crash.
+        */
+       dp->i_nlink++;
+       DIP_ASSIGN(dp, nlink, dp->i_nlink);
+       dp->i_flag |= IN_CHANGE;
+       if ((error = UFS_UPDATE(dvp, NULL, NULL, UPDATE_DIROP)) != 0)
+               goto bad;
+
+       /*
+        * Initialize directory with "." and ".." from static template.
+        */
+       dirtemplate = mastertemplate;
+       dirtemplate.dotdot_reclen = dirblksiz - dirtemplate.dot_reclen;
+       dirtemplate.dot_ino = ufs_rw32(ip->i_number, UFS_MPNEEDSWAP(ump));
+       dirtemplate.dotdot_ino = ufs_rw32(dp->i_number, UFS_MPNEEDSWAP(ump));
+       dirtemplate.dot_reclen = ufs_rw16(dirtemplate.dot_reclen,
+           UFS_MPNEEDSWAP(ump));
+       dirtemplate.dotdot_reclen = ufs_rw16(dirtemplate.dotdot_reclen,
+           UFS_MPNEEDSWAP(ump));
+       if (ump->um_maxsymlinklen <= 0) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+               if (UFS_MPNEEDSWAP(ump) == 0)
+#else
+               if (UFS_MPNEEDSWAP(ump) != 0)
+#endif
+               {
+                       dirtemplate.dot_type = dirtemplate.dot_namlen;
+                       dirtemplate.dotdot_type = dirtemplate.dotdot_namlen;
+                       dirtemplate.dot_namlen = dirtemplate.dotdot_namlen = 0;
+               } else
+                       dirtemplate.dot_type = dirtemplate.dotdot_type = 0;
+       }
+       if ((error = UFS_BALLOC(tvp, (off_t)0, dirblksiz, cnp->cn_cred,
+           B_CLRBUF, &bp)) != 0)
+               goto bad;
+       ip->i_size = dirblksiz;
+       DIP_ASSIGN(ip, size, dirblksiz);
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       uvm_vnp_setsize(tvp, ip->i_size);
+       memcpy((void *)bp->b_data, (void *)&dirtemplate, sizeof dirtemplate);
+
+       /*
+        * Directory set up; now install its entry in the parent directory.
+        * We must write out the buffer containing the new directory body
+        * before entering the new name in the parent.
+        */
+       if ((error = VOP_BWRITE(bp->b_vp, bp)) != 0)
+               goto bad;
+       if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0) {
+               goto bad;
+       }
+       newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+       ufs_makedirentry(ip, cnp, newdir);
+       error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, bp);
+       pool_cache_put(ufs_direct_cache, newdir);
+ bad:
+       if (error == 0) {
+               VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+               UFS_WAPBL_END(dvp->v_mount);
+       } else {
+               dp->i_nlink--;
+               DIP_ASSIGN(dp, nlink, dp->i_nlink);
+               dp->i_flag |= IN_CHANGE;
+               UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+               /*
+                * No need to do an explicit UFS_TRUNCATE here; vrele will
+                * do this for us because we set the link count to 0.
+                */
+               ip->i_nlink = 0;
+               DIP_ASSIGN(ip, nlink, 0);
+               ip->i_flag |= IN_CHANGE;
+               /* If IN_ADIROP, account for it */
+               UFS_UNMARK_VNODE(tvp);
+               UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP);
+               UFS_WAPBL_END(dvp->v_mount);
+               vput(tvp);
+       }
+ out:
+       fstrans_done(dvp->v_mount);
+       vput(dvp);
+       return (error);
+}
+
+int
+ufs_rmdir(void *v)
+{
+       struct vop_rmdir_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            *a_vp;
+               struct componentname    *a_cnp;
+       } */ *ap = v;
+       struct vnode            *vp, *dvp;
+       struct componentname    *cnp;
+       struct inode            *ip, *dp;
+       int                     error;
+       struct ufs_lookup_results *ulr;
+
+       vp = ap->a_vp;
+       dvp = ap->a_dvp;
+       cnp = ap->a_cnp;
+       ip = VTOI(vp);
+       dp = VTOI(dvp);
+
+       /* XXX should handle this material another way */
+       ulr = &dp->i_crap;
+       UFS_CHECK_CRAPCOUNTER(dp);
+
+       /*
+        * No rmdir of "." or of mounted directories, please.
+        */
+       if (dp == ip || vp->v_mountedhere != NULL) {
+               if (dp == ip)
+                       vrele(dvp);
+               else
+                       vput(dvp);
+               vput(vp);
+               return (EINVAL);
+       }
+
+       fstrans_start(dvp->v_mount, FSTRANS_SHARED);
+
+       /*
+        * Do not remove a directory that is in the process of being renamed.
+        * Verify that the directory is empty (and valid). (Rmdir ".." won't
+        * be valid since ".." will contain a reference to the current
+        * directory and thus be non-empty.)
+        */
+       error = 0;
+       if (ip->i_flag & IN_RENAME) {
+               error = EINVAL;
+               goto out;
+       }
+       if (ip->i_nlink != 2 ||
+           !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
+               error = ENOTEMPTY;
+               goto out;
+       }
+       if ((dp->i_flags & APPEND) ||
+               (ip->i_flags & (IMMUTABLE | APPEND))) {
+               error = EPERM;
+               goto out;
+       }
+       error = UFS_WAPBL_BEGIN(dvp->v_mount);
+       if (error)
+               goto out;
+       /*
+        * Delete reference to directory before purging
+        * inode.  If we crash in between, the directory
+        * will be reattached to lost+found,
+        * will be reattached to lost+found.
+       error = ufs_dirremove(dvp, ulr, ip, cnp->cn_flags, 1);
+       if (error) {
+               UFS_WAPBL_END(dvp->v_mount);
+               goto out;
+       }
+       VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+       cache_purge(dvp);
+       /*
+        * Truncate inode.  The only stuff left in the directory is "." and
+        * "..".  The "." reference is inconsequential since we're quashing
+        * it.
+        */
+       dp->i_nlink--;
+       DIP_ASSIGN(dp, nlink, dp->i_nlink);
+       dp->i_flag |= IN_CHANGE;
+       UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
+       ip->i_nlink--;
+       DIP_ASSIGN(ip, nlink, ip->i_nlink);
+       ip->i_flag |= IN_CHANGE;
+       error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
+       cache_purge(vp);
+       /*
+        * Unlock the log while we still have reference to unlinked
+        * directory vp so that it will not get locked for recycling
+        * directory vp so that it will not get locked for recycling.
+       UFS_WAPBL_END(dvp->v_mount);
+#ifdef UFS_DIRHASH
+       if (ip->i_dirhash != NULL)
+               ufsdirhash_free(ip);
+#endif
+ out:
+       VN_KNOTE(vp, NOTE_DELETE);
+       vput(vp);
+       fstrans_done(dvp->v_mount);
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * symlink -- make a symbolic link
+ */
+int
+ufs_symlink(void *v)
+{
+       struct vop_symlink_args /* {
+               struct vnode            *a_dvp;
+               struct vnode            **a_vpp;
+               struct componentname    *a_cnp;
+               struct vattr            *a_vap;
+               char                    *a_target;
+       } */ *ap = v;
+       struct vnode    *vp, **vpp;
+       struct inode    *ip;
+       int             len, error;
+       struct ufs_lookup_results *ulr;
+
+       vpp = ap->a_vpp;
+
+       /* XXX should handle this material another way */
+       ulr = &VTOI(ap->a_dvp)->i_crap;
+       UFS_CHECK_CRAPCOUNTER(VTOI(ap->a_dvp));
+
+       /*
+        * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+        * ufs_makeinode
+        */
+       fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
+       error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, ulr,
+                             vpp, ap->a_cnp);
+       if (error)
+               goto out;
+       VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+       vp = *vpp;
+       len = strlen(ap->a_target);
+       ip = VTOI(vp);
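+       /*
+        * Targets shorter than um_maxsymlinklen are stored inline in
+        * the inode (SHORTLINK) and need no data block; longer ones
+        * are written out like ordinary file contents.
+        */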
+       if (len < ip->i_ump->um_maxsymlinklen) {
+               memcpy((char *)SHORTLINK(ip), ap->a_target, len);
+               ip->i_size = len;
+               DIP_ASSIGN(ip, size, len);
+               uvm_vnp_setsize(vp, ip->i_size);
+               ip->i_flag |= IN_CHANGE | IN_UPDATE;
+               if (vp->v_mount->mnt_flag & MNT_RELATIME)
+                       ip->i_flag |= IN_ACCESS;
+               UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+       } else
+               error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
+                   UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED,
+                   ap->a_cnp->cn_cred, NULL, NULL);
+       UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
+       if (error)
+               vput(vp);
+out:
+       fstrans_done(ap->a_dvp->v_mount);
+       return (error);
+}
+
+/*
+ * Vnode op for reading directories.
+ *
+ * This routine handles converting from the on-disk directory format
+ * "struct direct" to the in-memory format "struct dirent" as well as
+ * byte swapping the entries if necessary.
+ */
+int
+ufs_readdir(void *v)
+{
+       struct vop_readdir_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               kauth_cred_t    a_cred;
+               int             *a_eofflag;
+               off_t           **a_cookies;
+               int             *ncookies;
+       } */ *ap = v;
+       struct vnode    *vp = ap->a_vp;
+       struct direct   *cdp, *ecdp;
+       struct dirent   *ndp;
+       char            *cdbuf, *ndbuf, *endp;
+       struct uio      auio, *uio;
+       struct iovec    aiov;
+       int             error;
+       size_t          count, ccount, rcount;
+       off_t           off, *ccp;
+       off_t           startoff;
+       size_t          skipbytes;
+       struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+       int nswap = UFS_MPNEEDSWAP(ump);
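+       /*
+        * nswap is set when the volume's byte order differs from the
+        * host's; needswap additionally flags old-format directories
+        * (no d_type) whose significant name-length byte sits where
+        * d_type now lives, so the two bytes are exchanged per entry
+        * in the copy loop below.
+        */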
+#if BYTE_ORDER == LITTLE_ENDIAN
+       int needswap = ump->um_maxsymlinklen <= 0 && nswap == 0;
+#else
+       int needswap = ump->um_maxsymlinklen <= 0 && nswap != 0;
+#endif
+       uio = ap->a_uio;
+       count = uio->uio_resid;
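+       /*
+        * Clip the request so the raw read ends on a directory-block
+        * boundary; further down the start is rounded down to one as
+        * well, remembering how many bytes of the first block to skip.
+        */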
+       rcount = count - ((uio->uio_offset + count) & (ump->um_dirblksiz - 1));
+
+       if (rcount < _DIRENT_MINSIZE(cdp) || count < _DIRENT_MINSIZE(ndp))
+               return EINVAL;
+
+       startoff = uio->uio_offset & ~(ump->um_dirblksiz - 1);
+       skipbytes = uio->uio_offset - startoff;
+       rcount += skipbytes;
+
+       auio.uio_iov = &aiov;
+       auio.uio_iovcnt = 1;
+       auio.uio_offset = startoff;
+       auio.uio_resid = rcount;
+       UIO_SETUP_SYSSPACE(&auio);
+       auio.uio_rw = UIO_READ;
+       cdbuf = malloc(rcount, M_TEMP, M_WAITOK);
+       aiov.iov_base = cdbuf;
+       aiov.iov_len = rcount;
+       error = VOP_READ(vp, &auio, 0, ap->a_cred);
+       if (error != 0) {
+               free(cdbuf, M_TEMP);
+               return error;
+       }
+
+       rcount -= auio.uio_resid;
+
+       cdp = (struct direct *)(void *)cdbuf;
+       ecdp = (struct direct *)(void *)&cdbuf[rcount];
+
+       ndbuf = malloc(count, M_TEMP, M_WAITOK);
+       ndp = (struct dirent *)(void *)ndbuf;
+       endp = &ndbuf[count];
+
+       off = uio->uio_offset;
+       if (ap->a_cookies) {
+               ccount = rcount / _DIRENT_RECLEN(cdp, 1);
+               ccp = *(ap->a_cookies) = malloc(ccount * sizeof(*ccp),
+                   M_TEMP, M_WAITOK);
+       } else {
+               /* XXX: GCC */
+               ccount = 0;
+               ccp = NULL;
+       }
+
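+       /*
+        * Walk the on-disk entries: consume the skipbytes prefix that
+        * the caller's offset points into, then convert each struct
+        * direct into a struct dirent, fixing byte order (and the
+        * old-format namlen/type swap) along the way.
+        */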
+       while (cdp < ecdp) {
+               cdp->d_reclen = ufs_rw16(cdp->d_reclen, nswap);
+               if (skipbytes > 0) {
+                       if (cdp->d_reclen <= skipbytes) {
+                               skipbytes -= cdp->d_reclen;
+                               cdp = _DIRENT_NEXT(cdp);
+                               continue;
+                       }
+                       /*
+                        * invalid cookie.
+                        */
+                       error = EINVAL;
+                       goto out;
+               }
+               if (cdp->d_reclen == 0) {
+                       struct dirent *ondp = ndp;
+                       ndp->d_reclen = _DIRENT_MINSIZE(ndp);
+                       ndp = _DIRENT_NEXT(ndp);
+                       ondp->d_reclen = 0;
+                       cdp = ecdp;
+                       break;
+               }
+               if (needswap) {
+                       ndp->d_type = cdp->d_namlen;
+                       ndp->d_namlen = cdp->d_type;
+               } else {
+                       ndp->d_type = cdp->d_type;
+                       ndp->d_namlen = cdp->d_namlen;
+               }
+               ndp->d_reclen = _DIRENT_RECLEN(ndp, ndp->d_namlen);
+               if ((char *)(void *)ndp + ndp->d_reclen +
+                   _DIRENT_MINSIZE(ndp) > endp)
+                       break;
+               ndp->d_fileno = ufs_rw32(cdp->d_ino, nswap);
+               (void)memcpy(ndp->d_name, cdp->d_name, ndp->d_namlen);
+               memset(&ndp->d_name[ndp->d_namlen], 0,
+                   ndp->d_reclen - _DIRENT_NAMEOFF(ndp) - ndp->d_namlen);
+               off += cdp->d_reclen;
+               if (ap->a_cookies) {
+                       KASSERT(ccp - *(ap->a_cookies) < ccount);
+                       *(ccp++) = off;
+               }
+               ndp = _DIRENT_NEXT(ndp);
+               cdp = _DIRENT_NEXT(cdp);
+       }
+
+       count = ((char *)(void *)ndp - ndbuf);
+       error = uiomove(ndbuf, count, uio);
+out:
+       if (ap->a_cookies) {
+               if (error) {
+                       free(*(ap->a_cookies), M_TEMP);
+                       *(ap->a_cookies) = NULL;
+                       *(ap->a_ncookies) = 0;
+               } else {
+                       *ap->a_ncookies = ccp - *(ap->a_cookies);
+               }
+       }
+       uio->uio_offset = off;
+       free(ndbuf, M_TEMP);
+       free(cdbuf, M_TEMP);
+       *ap->a_eofflag = VTOI(vp)->i_size <= uio->uio_offset;
+       return error;
+}
+
+/*
+ * Return target name of a symbolic link
+ */
+int
+ufs_readlink(void *v)
+{
+       struct vop_readlink_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp = ap->a_vp;
+       struct inode    *ip = VTOI(vp);
+       struct ufsmount *ump = VFSTOUFS(vp->v_mount);
+       int             isize;
+
+       isize = ip->i_size;
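+       /*
+        * Inline ("fast") symlinks are copied straight out of the
+        * inode; when um_maxsymlinklen is 0, a zero block count is
+        * what identifies an inline link.
+        */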
+       if (isize < ump->um_maxsymlinklen ||
+           (ump->um_maxsymlinklen == 0 && DIP(ip, blocks) == 0)) {
+               uiomove((char *)SHORTLINK(ip), isize, ap->a_uio);
+               return (0);
+       }
+       return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+/*
+ * Calculate the logical to physical mapping if not done already,
+ * then call the device strategy routine.
+ */
+int
+ufs_strategy(void *v)
+{
+       struct vop_strategy_args /* {
+               struct vnode *a_vp;
+               struct buf *a_bp;
+       } */ *ap = v;
+       struct buf      *bp;
+       struct vnode    *vp;
+       struct inode    *ip;
+       struct mount    *mp;
+       int             error;
+
+       bp = ap->a_bp;
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if (vp->v_type == VBLK || vp->v_type == VCHR)
+               panic("ufs_strategy: spec");
+       KASSERT(bp->b_bcount != 0);
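+       /*
+        * b_blkno == b_lblkno means the buffer has not been mapped to
+        * a physical block yet; resolve it with VOP_BMAP.  A mapping
+        * of -1 denotes a hole, which reads back as zeroes.
+        */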
+       if (bp->b_blkno == bp->b_lblkno) {
+               error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+                                NULL);
+               if (error) {
+                       bp->b_error = error;
+                       biodone(bp);
+                       return (error);
+               }
+               if (bp->b_blkno == -1) /* no valid data */
+                       clrbuf(bp);
+       }
+       if (bp->b_blkno < 0) { /* block is not on disk */
+               biodone(bp);
+               return (0);
+       }
+       vp = ip->i_devvp;
+
+       error = VOP_STRATEGY(vp, bp);
+       if (error)
+               return error;
+
+       if (!BUF_ISREAD(bp))
+               return 0;
+
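+       /*
+        * For reads, if journal replay is open and holds newer
+        * contents for this block range, overwrite the buffer with
+        * the journalled data once the device read completes.
+        */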
+       mp = wapbl_vptomp(vp);
+       if (mp == NULL || mp->mnt_wapbl_replay == NULL ||
+           !WAPBL_REPLAY_ISOPEN(mp) ||
+           !WAPBL_REPLAY_CAN_READ(mp, bp->b_blkno, bp->b_bcount))
+               return 0;
+
+       error = biowait(bp);
+       if (error)
+               return error;
+
+       error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, bp->b_bcount);
+       if (error) {
+               mutex_enter(&bufcache_lock);
+               SET(bp->b_cflags, BC_INVAL);
+               mutex_exit(&bufcache_lock);
+       }
+       return error;
+}
+
+/*
+ * Print out the contents of an inode.
+ */
+int
+ufs_print(void *v)
+{
+       struct vop_print_args /* {
+               struct vnode    *a_vp;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       printf("tag VT_UFS, ino %llu, on dev %llu, %llu",
+           (unsigned long long)ip->i_number,
+           (unsigned long long)major(ip->i_dev),
+           (unsigned long long)minor(ip->i_dev));
+       printf(" flags 0x%x, nlink %d\n",
+           ip->i_flag, ip->i_nlink);
+       printf("\tmode 0%o, owner %d, group %d, size %qd",
+           ip->i_mode, ip->i_uid, ip->i_gid,
+           (long long)ip->i_size);
+       if (vp->v_type == VFIFO)
+               VOCALL(fifo_vnodeop_p, VOFFSET(vop_print), v);
+       printf("\n");
+       return (0);
+}
+
+/*
+ * Read wrapper for special devices.
+ */
+int
+ufsspec_read(void *v)
+{
+       struct vop_read_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               int             a_ioflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+
+       /*
+        * Set access flag.
+        */
+       if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)
+               VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
+       return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for special devices.
+ */
+int
+ufsspec_write(void *v)
+{
+       struct vop_write_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               int             a_ioflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+
+       /*
+        * Set update and change flags.
+        */
+       if ((ap->a_vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)
+               VTOI(ap->a_vp)->i_flag |= IN_MODIFY;
+       return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for special devices.
+ *
+ * Update the times on the inode, then do the device close.
+ */
+int
+ufsspec_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode    *a_vp;
+               int             a_fflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if (vp->v_usecount > 1)
+               UFS_ITIMES(vp, NULL, NULL, NULL);
+       return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Read wrapper for fifos.
+ */
+int
+ufsfifo_read(void *v)
+{
+       struct vop_read_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               int             a_ioflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+
+       /*
+        * Set access flag.
+        */
+       VTOI(ap->a_vp)->i_flag |= IN_ACCESS;
+       return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap));
+}
+
+/*
+ * Write wrapper for fifos.
+ */
+int
+ufsfifo_write(void *v)
+{
+       struct vop_write_args /* {
+               struct vnode    *a_vp;
+               struct uio      *a_uio;
+               int             a_ioflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+
+       /*
+        * Set update and change flags.
+        */
+       VTOI(ap->a_vp)->i_flag |= IN_MODIFY;
+       return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap));
+}
+
+/*
+ * Close wrapper for fifos.
+ *
+ * Update the times on the inode, then do the device close.
+ */
+int
+ufsfifo_close(void *v)
+{
+       struct vop_close_args /* {
+               struct vnode    *a_vp;
+               int             a_fflag;
+               kauth_cred_t    a_cred;
+       } */ *ap = v;
+       struct vnode    *vp;
+       struct inode    *ip;
+
+       vp = ap->a_vp;
+       ip = VTOI(vp);
+       if (ap->a_vp->v_usecount > 1)
+               UFS_ITIMES(vp, NULL, NULL, NULL);
+       return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap));
+}
+
+/*
+ * Return POSIX pathconf information applicable to ufs filesystems.
+ */
+int
+ufs_pathconf(void *v)
+{
+       struct vop_pathconf_args /* {
+               struct vnode    *a_vp;
+               int             a_name;
+               register_t      *a_retval;
+       } */ *ap = v;
+
+       switch (ap->a_name) {
+       case _PC_LINK_MAX:
+               *ap->a_retval = LINK_MAX;
+               return (0);
+       case _PC_NAME_MAX:
+               *ap->a_retval = FFS_MAXNAMLEN;
+               return (0);
+       case _PC_PATH_MAX:
+               *ap->a_retval = PATH_MAX;
+               return (0);
+       case _PC_PIPE_BUF:
+               *ap->a_retval = PIPE_BUF;
+               return (0);
+       case _PC_CHOWN_RESTRICTED:
+               *ap->a_retval = 1;
+               return (0);
+       case _PC_NO_TRUNC:
+               *ap->a_retval = 1;
+               return (0);
+       case _PC_SYNC_IO:
+               *ap->a_retval = 1;
+               return (0);
+       case _PC_FILESIZEBITS:
+               *ap->a_retval = 42;
+               return (0);
+       case _PC_SYMLINK_MAX:
+               *ap->a_retval = MAXPATHLEN;
+               return (0);
+       case _PC_2_SYMLINKS:
+               *ap->a_retval = 1;
+               return (0);
+       default:
+               return (EINVAL);
+       }
+       /* NOTREACHED */
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+ufs_advlock(void *v)
+{
+       struct vop_advlock_args /* {
+               struct vnode    *a_vp;
+               void *          a_id;
+               int             a_op;
+               struct flock    *a_fl;
+               int             a_flags;
+       } */ *ap = v;
+       struct inode *ip;
+
+       ip = VTOI(ap->a_vp);
+       return lf_advlock(ap, &ip->i_lockf, ip->i_size);
+}
+
+/*
+ * Initialize the vnode associated with a new inode; handle aliased
+ * vnodes.
+ */
+void
+ufs_vinit(struct mount *mntp, int (**specops)(void *), int (**fifoops)(void *),
+       struct vnode **vpp)
+{
+       struct timeval  tv;
+       struct inode    *ip;
+       struct vnode    *vp;
+       dev_t           rdev;
+       struct ufsmount *ump;
+
+       vp = *vpp;
+       ip = VTOI(vp);
+       switch (vp->v_type = IFTOVT(ip->i_mode)) {
+       case VCHR:
+       case VBLK:
+               vp->v_op = specops;
+               ump = ip->i_ump;
+               if (ump->um_fstype == UFS1)
+                       rdev = (dev_t)ufs_rw32(ip->i_ffs1_rdev,
+                           UFS_MPNEEDSWAP(ump));
+               else
+                       rdev = (dev_t)ufs_rw64(ip->i_ffs2_rdev,
+                           UFS_MPNEEDSWAP(ump));
+               spec_node_init(vp, rdev);
+               break;
+       case VFIFO:
+               vp->v_op = fifoops;
+               break;
+       case VNON:
+       case VBAD:
+       case VSOCK:
+       case VLNK:
+       case VDIR:
+       case VREG:
+               break;
+       }
+       if (ip->i_number == ROOTINO)
+                vp->v_vflag |= VV_ROOT;
+       /*
+        * Initialize modrev times
+        */
+       getmicrouptime(&tv);
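+       /*
+        * tv_sec fills the high 32 bits; tv_usec, scaled by 4294
+        * (roughly 2^32 / 10^6), fills the low 32 bits.
+        */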
+       ip->i_modrev = (uint64_t)(uint)tv.tv_sec << 32
+                       | tv.tv_usec * 4294u;
+       *vpp = vp;
+}
+
+/*
+ * Allocate a new inode.
+ */
+int
+ufs_makeinode(int mode, struct vnode *dvp, const struct ufs_lookup_results *ulr,
+       struct vnode **vpp, struct componentname *cnp)
+{
+       struct inode    *ip, *pdir;
+       struct direct   *newdir;
+       struct vnode    *tvp;
+       int             error, ismember = 0;
+
+       UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount);
+
+       pdir = VTOI(dvp);
+
+       if ((mode & IFMT) == 0)
+               mode |= IFREG;
+
+       if ((error = UFS_VALLOC(dvp, mode, cnp->cn_cred, vpp)) != 0) {
+               vput(dvp);
+               return (error);
+       }
+       tvp = *vpp;
+       ip = VTOI(tvp);
+       ip->i_gid = pdir->i_gid;
+       DIP_ASSIGN(ip, gid, ip->i_gid);
+       ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
+       DIP_ASSIGN(ip, uid, ip->i_uid);
+       error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp);
+       if (error) {
+               /*
+                * Note: we can't VOP_VFREE(tvp) here as we should,
+                * because we can't write to the disk.  Instead, we leave
+                * the vnode dangling from the journal.
+                */
+               vput(tvp);
+               vput(dvp);
+               return (error);
+       }
+#if defined(QUOTA) || defined(QUOTA2)
+       if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
+               UFS_VFREE(tvp, ip->i_number, mode);
+               UFS_WAPBL_END1(dvp->v_mount, dvp);
+               vput(tvp);
+               vput(dvp);
+               return (error);
+       }
+#endif
+       ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+       ip->i_mode = mode;
+       DIP_ASSIGN(ip, mode, mode);
+       tvp->v_type = IFTOVT(mode);     /* Rest init'd in getnewvnode(). */
+       ip->i_nlink = 1;
+       DIP_ASSIGN(ip, nlink, 1);
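+       /*
+        * Drop the set-gid bit when the creating credential is
+        * neither a member of the file's group nor the superuser.
+        */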
+       if ((ip->i_mode & ISGID) && (kauth_cred_ismember_gid(cnp->cn_cred,
+           ip->i_gid, &ismember) != 0 || !ismember) &&
+           kauth_authorize_generic(cnp->cn_cred, KAUTH_GENERIC_ISSUSER, NULL)) {
+               ip->i_mode &= ~ISGID;
+               DIP_ASSIGN(ip, mode, ip->i_mode);
+       }
+
+       if (cnp->cn_flags & ISWHITEOUT) {
+               ip->i_flags |= UF_OPAQUE;
+               DIP_ASSIGN(ip, flags, ip->i_flags);
+       }
+
+       /*
+        * Make sure inode goes to disk before directory entry.
+        */
+       if ((error = UFS_UPDATE(tvp, NULL, NULL, UPDATE_DIROP)) != 0)
+               goto bad;
+       newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+       ufs_makedirentry(ip, cnp, newdir);
+       error = ufs_direnter(dvp, ulr, tvp, newdir, cnp, NULL);
+       pool_cache_put(ufs_direct_cache, newdir);
+       if (error)
+               goto bad;
+       vput(dvp);
+       *vpp = tvp;
+       return (0);
+
+ bad:
+       /*
+        * A write error occurred while updating the inode or the
+        * directory, so the inode must be deallocated.
+        */
+       ip->i_nlink = 0;
+       DIP_ASSIGN(ip, nlink, 0);
+       ip->i_flag |= IN_CHANGE;
+       /* If IN_ADIROP, account for it */
+       UFS_UNMARK_VNODE(tvp);
+       UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0);
+       tvp->v_type = VNON;             /* explodes later if VBLK */
+       UFS_WAPBL_END1(dvp->v_mount, dvp);
+       vput(tvp);
+       vput(dvp);
+       return (error);
+}
+
+/*
+ * Allocate len bytes at offset off.
+ */
+int
+ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+    kauth_cred_t cred)
+{
+        struct inode *ip = VTOI(vp);
+        int error, delta, bshift, bsize;
+        UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist);
+
+        error = 0;
+        bshift = vp->v_mount->mnt_fs_bshift;
+        bsize = 1 << bshift;
+
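+        /*
+         * Round the start down to a file-system block boundary and
+         * widen the length to match, so whole blocks get allocated.
+         */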
+        delta = off & (bsize - 1);
+        off -= delta;
+        len += delta;
+
+        while (len > 0) {
+                bsize = MIN(bsize, len);
+
+                error = UFS_BALLOC(vp, off, bsize, cred, flags, NULL);
+                if (error) {
+                        goto out;
+                }
+
+                /*
+                 * Increase the file size now; UFS_BALLOC() requires that
+                 * EOF be up-to-date before each call.
+                 */
+
+                if (ip->i_size < off + bsize) {
+                        UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x",
+                            vp, ip->i_size, off + bsize, 0);
+                        ip->i_size = off + bsize;
+                        DIP_ASSIGN(ip, size, ip->i_size);
+                }
+
+                off += bsize;
+                len -= bsize;
+        }
+
+out:
+       UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+       return error;
+}
+
+void
+ufs_gop_markupdate(struct vnode *vp, int flags)
+{
+       u_int32_t mask = 0;
+
+       if ((flags & GOP_UPDATE_ACCESSED) != 0) {
+               mask = IN_ACCESS;
+       }
+       if ((flags & GOP_UPDATE_MODIFIED) != 0) {
+               if (vp->v_type == VREG) {
+                       mask |= IN_CHANGE | IN_UPDATE;
+               } else {
+                       mask |= IN_MODIFY;
+               }
+       }
+       if (mask) {
+               struct inode *ip = VTOI(vp);
+
+               ip->i_flag |= mask;
+       }
+}
diff --git a/sys/ufs/ufs/ufs_wapbl.c b/sys/ufs/ufs/ufs_wapbl.c
new file mode 100644 (file)
index 0000000..1f11526
--- /dev/null
@@ -0,0 +1,166 @@
+/*  $NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $ */
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.22 2011/07/18 06:46:05 dholland Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+#ifdef WAPBL_DEBUG_INODES
+#error WAPBL_DEBUG_INODES: not functional before ufs_wapbl.c is updated
+void
+ufs_wapbl_verify_inodes(struct mount *mp, const char *str)
+{
+       struct vnode *vp, *nvp;
+       struct inode *ip;
+       struct buf *bp, *nbp;
+
+       mutex_enter(&mntvnode_lock);
+ loop:
+       TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
+               /*
+                * If the vnode that we are about to sync is no longer
+                * associated with this mount point, start over.
+                */
+               if (vp->v_mount != mp)
+                       goto loop;
+               mutex_enter(&vp->v_interlock);
+               nvp = TAILQ_NEXT(vp, v_mntvnodes);
+               ip = VTOI(vp);
+               if (vp->v_type == VNON) {
+                       mutex_exit(&vp->v_interlock);
+                       continue;
+               }
+               /* verify that update has been called on all inodes */
+               if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) {
+                       panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n",
+                               mp, vp, ip, ip->i_flag);
+               }
+               mutex_exit(&mntvnode_lock);
+
+               mutex_enter(&bufcache_lock);
+               for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+                       nbp = LIST_NEXT(bp, b_vnbufs);
+                       if ((bp->b_cflags & BC_BUSY)) {
+                               continue;
+                       }
+                       KASSERT((bp->b_oflags & BO_DELWRI) != 0);
+                       KASSERT((bp->b_flags & B_LOCKED) != 0);
+               }
+               mutex_exit(&bufcache_lock);
+               mutex_exit(&vp->v_interlock);
+
+               mutex_enter(&mntvnode_lock);
+       }
+       mutex_exit(&mntvnode_lock);
+
+       vp = VFSTOUFS(mp)->um_devvp;
+       mutex_enter(&vp->v_interlock);
+       mutex_enter(&bufcache_lock);
+       for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+               nbp = LIST_NEXT(bp, b_vnbufs);
+               if ((bp->b_cflags & BC_BUSY)) {
+                       continue;
+               }
+               KASSERT((bp->b_oflags & BO_DELWRI) != 0);
+               KASSERT((bp->b_flags & B_LOCKED) != 0);
+       }
+       mutex_exit(&bufcache_lock);
+       mutex_exit(&vp->v_interlock);
+}
+#endif /* WAPBL_DEBUG_INODES */
index ea9ee9469c063a58a7858b6d09514fe703f31984..bf059ec0b40ef3ef93876e7a248a1f2c1d48e4be 100644 (file)
@@ -2,16 +2,16 @@
 # Timestamp in UTC,minixpath,netbsdpath
 # minixpath:  path in Minix source tree (starting from /usr/src/)
 # netbsdpath: path in BSD source tree (starting from src/)
+2011/12/25 06:09:09,sys/arch/i386/stand
 2012/02/10 16:16:12,share/zoneinfo
 2011/05/26 00:00:00,external/public-domain/xz
 2011/09/30 01:32:21,usr.bin/gzip
 2011/08/27 12:55:09,bin/date
 2011/10/17 09:24:54,common/lib/libprop
-2011/11/28 12:50:07,include/ufs,sys/ufs
+2011/11/28 12:50:07,sys/ufs
 2010/09/10 15:51:20,sbin/newfs_ext2fs
 2011/09/16 16:13:18,sbin/fsck_ext2fs
 2011/09/30 22:08:19,lib/libprop
-2011/08/30 12:39:55,common/include/arch/i386,sys/arch/i386/include
 2011/11/13 22:19:09,common/include
 2011/01/17 18:11:10,common/lib/libc
 2011/01/21 23:36:49,lib/libc
@@ -40,7 +40,7 @@
 2011/09/01 13:37:33,usr.bin/du
 2010/07/07 21:24:34,usr.bin/man
 2009/05/08 12:48:43,usr.bin/apropos
-2011/01/12 23:02:22,usr.bin/mdocml,external/bsd/mdocml
+2011/01/12 23:02:22,external/bsd/mdocml
 2011/11/03 20:46:41,usr.sbin/installboot
 2011/01/04 10:01:51,usr.sbin/pwd_mkdb
 2011/01/04 10:30:21,usr.sbin/user
 2007/05/28 12:06:25,usr.bin/bzip2recover
 2009/04/02 21:39:33,libexec/makewhatis
 2010/05/14 16:43:34,dist/bzip2
-2011/08/17 00:07:38,sys/arch/i386/stand/bootxx
-2011/12/25 06:09:09,sys/arch/i386/stand/boot
-2011/05/20 22:29:55,sys/arch/i386/stand/cdboot
-2011/09/21 18:15:59,sys/arch/i386/stand/mbr
-2011/11/28 07:56:54,sys/arch/i386/stand/lib
 2012/01/16 18:47:57,sys/lib/libsa
 2011/10/30 00:28:57,sys/lib/libz
index 9674cd9ac3f4103bdfe04dec9786820d6368d728..6528d9318e1a105258db55d6ced1e54d54d5d0fc 100644 (file)
@@ -3,7 +3,7 @@
 .include <bsd.own.mk>
 
 # NetBSD imports
-SUBDIR= indent m4 stat tic sed mkdep uniq seq du man mdocml \
+SUBDIR= indent m4 stat tic sed mkdep uniq seq du man \
        apropos chpass newgrp passwd bzip2 bzip2recover gzip
 
 # Non-NetBSD imports