[Zrouter-src-freebsd] ZRouter.org: push to FreeBSD HEAD tree

zrouter-src-freebsd at zrouter.org zrouter-src-freebsd at zrouter.org
Wed Jul 25 14:36:09 UTC 2012


details:   http://zrouter.org/hg/FreeBSD/head//rev/fc630f3c8529
changeset: 493:fc630f3c8529
user:      Aleksandr Rybalko <ray at ddteam.net>
date:      Wed Jul 25 16:40:53 2012 +0300
description:
Lazy update

diffstat:

 head/sys/amd64/acpica/acpi_machdep.c           |     4 +-
 head/sys/amd64/acpica/acpi_switch.S            |   177 -
 head/sys/amd64/acpica/acpi_wakecode.S          |    39 +-
 head/sys/amd64/acpica/acpi_wakeup.c            |   420 ----
 head/sys/amd64/amd64/cpu_switch.S              |   166 +-
 head/sys/amd64/amd64/db_disasm.c               |   179 +-
 head/sys/amd64/amd64/fpu.c                     |   187 +-
 head/sys/amd64/amd64/genassym.c                |     9 +-
 head/sys/amd64/amd64/machdep.c                 |     9 +-
 head/sys/amd64/amd64/mem.c                     |     4 +-
 head/sys/amd64/amd64/minidump_machdep.c        |     8 +-
 head/sys/amd64/amd64/mp_machdep.c              |   110 +-
 head/sys/amd64/amd64/pmap.c                    |  1197 +++++++----
 head/sys/amd64/amd64/ptrace_machdep.c          |     9 +-
 head/sys/amd64/amd64/trap.c                    |    30 +-
 head/sys/amd64/amd64/vm_machdep.c              |     6 +-
 head/sys/amd64/conf/GENERIC                    |    10 +-
 head/sys/amd64/include/atomic.h                |    76 +-
 head/sys/amd64/include/cpufunc.h               |    40 +-
 head/sys/amd64/include/elf.h                   |     3 +-
 head/sys/amd64/include/fpu.h                   |     5 +-
 head/sys/amd64/include/in_cksum.h              |     6 +-
 head/sys/amd64/include/intr_machdep.h          |     4 +-
 head/sys/amd64/include/md_var.h                |     3 +-
 head/sys/amd64/include/pcb.h                   |    18 +-
 head/sys/amd64/include/pcpu.h                  |    24 +-
 head/sys/amd64/include/pmap.h                  |     5 +-
 head/sys/amd64/include/smp.h                   |     3 +-
 head/sys/amd64/include/vdso.h                  |     6 +
 head/sys/amd64/include/vmparam.h               |     4 +-
 head/sys/amd64/linux32/linux.h                 |     3 +-
 head/sys/amd64/linux32/linux32_dummy.c         |    11 +-
 head/sys/amd64/linux32/linux32_proto.h         |     8 +-
 head/sys/amd64/linux32/linux32_syscall.h       |     4 +-
 head/sys/amd64/linux32/linux32_syscalls.c      |     4 +-
 head/sys/amd64/linux32/linux32_sysent.c        |     4 +-
 head/sys/amd64/linux32/linux32_systrace_args.c |    10 +-
 head/sys/amd64/linux32/syscalls.master         |     6 +-
 head/sys/fs/cd9660/cd9660_node.c               |     5 +-
 head/sys/fs/cd9660/cd9660_vfsops.c             |     6 +-
 head/sys/fs/devfs/devfs_vnops.c                |    17 +-
 head/sys/fs/ext2fs/ext2_inode.c                |     6 +-
 head/sys/fs/ext2fs/ext2_lookup.c               |    60 +-
 head/sys/fs/ext2fs/ext2_vfsops.c               |     8 +-
 head/sys/fs/ext2fs/ext2_vnops.c                |     8 +-
 head/sys/fs/hpfs/hpfs_vnops.c                  |    25 +-
 head/sys/fs/msdosfs/denode.h                   |     4 +-
 head/sys/fs/msdosfs/msdosfs_denode.c           |    14 +-
 head/sys/fs/msdosfs/msdosfs_lookup.c           |     6 +-
 head/sys/fs/msdosfs/msdosfs_vnops.c            |    11 +-
 head/sys/fs/nandfs/bmap.c                      |   621 ++++++
 head/sys/fs/nandfs/bmap.h                      |    40 +
 head/sys/fs/nandfs/nandfs.h                    |   310 +++
 head/sys/fs/nandfs/nandfs_alloc.c              |   364 +++
 head/sys/fs/nandfs/nandfs_bmap.c               |   230 ++
 head/sys/fs/nandfs/nandfs_buffer.c             |    83 +
 head/sys/fs/nandfs/nandfs_cleaner.c            |   620 ++++++
 head/sys/fs/nandfs/nandfs_cpfile.c             |   776 +++++++
 head/sys/fs/nandfs/nandfs_dat.c                |   344 +++
 head/sys/fs/nandfs/nandfs_dir.c                |   314 +++
 head/sys/fs/nandfs/nandfs_fs.h                 |   565 +++++
 head/sys/fs/nandfs/nandfs_ifile.c              |   213 ++
 head/sys/fs/nandfs/nandfs_mount.h              |    50 +
 head/sys/fs/nandfs/nandfs_segment.c            |  1329 ++++++++++++
 head/sys/fs/nandfs/nandfs_subr.c               |  1120 ++++++++++
 head/sys/fs/nandfs/nandfs_subr.h               |   238 ++
 head/sys/fs/nandfs/nandfs_sufile.c             |   569 +++++
 head/sys/fs/nandfs/nandfs_vfsops.c             |  1590 +++++++++++++++
 head/sys/fs/nandfs/nandfs_vnops.c              |  2455 ++++++++++++++++++++++++
 head/sys/fs/nfs/nfs_commonacl.c                |     6 +-
 head/sys/fs/nfsclient/nfs_clbio.c              |    80 +-
 head/sys/fs/nfsclient/nfs_clnode.c             |    42 +-
 head/sys/fs/nfsclient/nfs_clvfsops.c           |     5 +-
 head/sys/fs/nfsclient/nfs_clvnops.c            |    23 +-
 head/sys/fs/nfsclient/nfsnode.h                |     3 +-
 head/sys/fs/nfsserver/nfs_nfsdport.c           |     9 +-
 head/sys/fs/nfsserver/nfs_nfsdstate.c          |    17 +-
 head/sys/fs/ntfs/ntfs.h                        |   318 +-
 head/sys/fs/ntfs/ntfs_subr.c                   |   170 +-
 head/sys/fs/ntfs/ntfs_subr.h                   |     4 +-
 head/sys/fs/ntfs/ntfs_vfsops.c                 |    84 +-
 head/sys/fs/ntfs/ntfs_vnops.c                  |   152 +-
 head/sys/fs/nullfs/null_vnops.c                |     5 +-
 head/sys/fs/portalfs/portal_vnops.c            |    10 +-
 head/sys/fs/smbfs/smbfs_node.c                 |    19 +-
 head/sys/fs/tmpfs/tmpfs_vnops.c                |     5 +-
 head/sys/fs/udf/udf_vfsops.c                   |     4 +-
 head/sys/fs/unionfs/union_subr.c               |    25 +-
 head/sys/fs/unionfs/union_vfsops.c             |    12 +-
 head/sys/fs/unionfs/union_vnops.c              |   305 +-
 head/sys/i386/acpica/acpi_machdep.c            |     4 +-
 head/sys/i386/acpica/acpi_wakecode.S           |   349 +-
 head/sys/i386/acpica/acpi_wakeup.c             |   371 ---
 head/sys/i386/conf/GENERIC                     |     8 +-
 head/sys/i386/conf/XEN                         |     4 +-
 head/sys/i386/i386/apic_vector.s               |    22 +-
 head/sys/i386/i386/bios.c                      |     6 +-
 head/sys/i386/i386/elf_machdep.c               |     7 +-
 head/sys/i386/i386/genassym.c                  |    15 +-
 head/sys/i386/i386/initcpu.c                   |     3 +-
 head/sys/i386/i386/machdep.c                   |    26 +-
 head/sys/i386/i386/mem.c                       |     4 +-
 head/sys/i386/i386/minidump_machdep.c          |     8 +-
 head/sys/i386/i386/mp_machdep.c                |   137 +-
 head/sys/i386/i386/pmap.c                      |   416 ++-
 head/sys/i386/i386/ptrace_machdep.c            |     4 +-
 head/sys/i386/i386/swtch.s                     |   111 +-
 head/sys/i386/i386/trap.c                      |    12 +-
 head/sys/i386/i386/vm86.c                      |     3 +-
 head/sys/i386/include/apicvar.h                |     5 +-
 head/sys/i386/include/atomic.h                 |    80 +-
 head/sys/i386/include/bootinfo.h               |    10 +-
 head/sys/i386/include/cpufunc.h                |    12 +-
 head/sys/i386/include/elf.h                    |     3 +-
 head/sys/i386/include/in_cksum.h               |     8 +-
 head/sys/i386/include/intr_machdep.h           |     4 +-
 head/sys/i386/include/md_var.h                 |     3 +-
 head/sys/i386/include/npx.h                    |     5 +-
 head/sys/i386/include/pcb.h                    |    17 +-
 head/sys/i386/include/pmap.h                   |     5 +-
 head/sys/i386/include/smp.h                    |     7 +-
 head/sys/i386/include/vdso.h                   |     6 +
 head/sys/i386/include/vmparam.h                |     5 +-
 head/sys/i386/isa/npx.c                        |    79 +-
 head/sys/i386/linux/linux.h                    |     3 +-
 head/sys/i386/linux/linux_dummy.c              |    11 +-
 head/sys/i386/xen/pmap.c                       |   220 +-
 head/sys/ia64/acpica/acpi_wakeup.c             |     9 +-
 head/sys/ia64/ia64/busdma_machdep.c            |    14 +-
 head/sys/ia64/ia64/machdep.c                   |   241 +-
 head/sys/ia64/ia64/mp_machdep.c                |    10 +-
 head/sys/ia64/ia64/nexus.c                     |    11 +-
 head/sys/ia64/ia64/physmem.c                   |   258 ++
 head/sys/ia64/ia64/pmap.c                      |    81 +-
 head/sys/ia64/include/_stdint.h                |     8 +-
 head/sys/ia64/include/_types.h                 |     6 +-
 head/sys/ia64/include/elf.h                    |     3 +-
 head/sys/ia64/include/in_cksum.h               |     6 +-
 head/sys/ia64/include/md_var.h                 |    13 +-
 head/sys/ia64/include/param.h                  |     5 +-
 head/sys/ia64/include/pcb.h                    |     6 +-
 head/sys/ia64/include/pmap.h                   |     3 +-
 head/sys/ia64/include/vdso.h                   |    41 +
 head/sys/kern/capabilities.conf                |     8 +-
 head/sys/kern/dtio_kdtrace.c                   |   232 ++
 head/sys/kern/imgact_aout.c                    |    15 +-
 head/sys/kern/imgact_elf.c                     |    33 +-
 head/sys/kern/imgact_gzip.c                    |     6 +-
 head/sys/kern/init_main.c                      |    37 +-
 head/sys/kern/init_sysent.c                    |    14 +-
 head/sys/kern/kern_acct.c                      |    25 +-
 head/sys/kern/kern_clock.c                     |     8 +-
 head/sys/kern/kern_conf.c                      |     9 +-
 head/sys/kern/kern_descrip.c                   |   552 ++--
 head/sys/kern/kern_event.c                     |    21 +-
 head/sys/kern/kern_exec.c                      |    67 +-
 head/sys/kern/kern_fork.c                      |    13 +-
 head/sys/kern/kern_jail.c                      |    23 +-
 head/sys/kern/kern_kthread.c                   |     4 +-
 head/sys/kern/kern_malloc.c                    |     8 +-
 head/sys/kern/kern_proc.c                      |    42 +-
 head/sys/kern/kern_racct.c                     |     7 +-
 head/sys/kern/kern_rangelock.c                 |   246 ++
 head/sys/kern/kern_sharedpage.c                |   240 ++
 head/sys/kern/kern_shutdown.c                  |    11 +-
 head/sys/kern/kern_sig.c                       |     7 +-
 head/sys/kern/kern_synch.c                     |    19 +-
 head/sys/kern/kern_tc.c                        |    86 +-
 head/sys/kern/kern_thr.c                       |     3 +-
 head/sys/kern/kern_thread.c                    |    11 +-
 head/sys/kern/kern_timeout.c                   |   359 +-
 head/sys/kern/sched_4bsd.c                     |    41 +-
 head/sys/kern/sched_ule.c                      |    40 +-
 head/sys/kern/subr_bus.c                       |     4 +-
 head/sys/kern/subr_devstat.c                   |    60 +-
 head/sys/kern/subr_dummy_vdso_tc.c             |    49 +
 head/sys/kern/subr_firmware.c                  |     4 +-
 head/sys/kern/subr_rman.c                      |    19 +-
 head/sys/kern/subr_sleepqueue.c                |    10 +-
 head/sys/kern/subr_smp.c                       |    17 +-
 head/sys/kern/subr_syscall.c                   |     8 +-
 head/sys/kern/subr_trap.c                      |     3 +-
 head/sys/kern/subr_turnstile.c                 |    12 +-
 head/sys/kern/subr_witness.c                   |    17 +-
 head/sys/kern/sys_capability.c                 |     6 +-
 head/sys/kern/sys_generic.c                    |     4 +-
 head/sys/kern/sys_procdesc.c                   |     6 +-
 head/sys/kern/sys_process.c                    |    10 +-
 head/sys/kern/syscalls.c                       |     4 +-
 head/sys/kern/syscalls.master                  |     6 +-
 head/sys/kern/systrace_args.c                  |    10 +-
 head/sys/kern/tty.c                            |    31 +-
 head/sys/kern/uipc_mqueue.c                    |     6 +-
 head/sys/kern/uipc_socket.c                    |     4 +-
 head/sys/kern/uipc_syscalls.c                  |    25 +-
 head/sys/kern/uipc_usrreq.c                    |     4 +-
 head/sys/kern/vfs_bio.c                        |    20 +-
 head/sys/kern/vfs_default.c                    |    19 +-
 head/sys/kern/vfs_subr.c                       |    15 +-
 head/sys/kern/vfs_syscalls.c                   |   302 +-
 head/sys/kern/vfs_vnops.c                      |   743 +++++-
 head/sys/netinet/icmp_var.h                    |     5 +-
 head/sys/netinet/if_ether.c                    |    15 +-
 head/sys/netinet/if_ether.h                    |    12 +-
 head/sys/netinet/igmp.c                        |    14 +-
 head/sys/netinet/in.c                          |     4 +-
 head/sys/netinet/in.h                          |     4 +-
 head/sys/netinet/in_pcb.c                      |     6 +-
 head/sys/netinet/in_pcb.h                      |     5 +-
 head/sys/netinet/in_var.h                      |     8 +-
 head/sys/netinet/ip.h                          |    27 +-
 head/sys/netinet/ip_carp.c                     |    11 +-
 head/sys/netinet/ip_fw.h                       |     2 +-
 head/sys/netinet/ip_icmp.c                     |     5 +-
 head/sys/netinet/ip_input.c                    |    11 +-
 head/sys/netinet/ip_mroute.c                   |     5 +-
 head/sys/netinet/ip_mroute.h                   |     3 +-
 head/sys/netinet/ip_output.c                   |    64 +-
 head/sys/netinet/ipfw/ip_dummynet.c            |     4 +-
 head/sys/netinet/ipfw/ip_fw_log.c              |   139 +-
 head/sys/netinet/ipfw/ip_fw_private.h          |     2 +-
 head/sys/netinet/ipfw/ip_fw_table.c            |    15 +-
 head/sys/netinet/libalias/alias_sctp.h         |     3 +-
 head/sys/netinet/libalias/libalias.3           |    16 +-
 head/sys/netinet/sctp.h                        |    80 +-
 head/sys/netinet/sctp_asconf.c                 |   189 +-
 head/sys/netinet/sctp_asconf.h                 |    12 +-
 head/sys/netinet/sctp_auth.c                   |    28 +-
 head/sys/netinet/sctp_auth.h                   |    10 +-
 head/sys/netinet/sctp_bsd_addr.c               |    14 +-
 head/sys/netinet/sctp_bsd_addr.h               |    13 +-
 head/sys/netinet/sctp_cc_functions.c           |    13 +-
 head/sys/netinet/sctp_constants.h              |    78 +-
 head/sys/netinet/sctp_crc32.c                  |    13 +-
 head/sys/netinet/sctp_crc32.h                  |    14 +-
 head/sys/netinet/sctp_dtrace_declare.h         |    12 +-
 head/sys/netinet/sctp_dtrace_define.h          |    12 +-
 head/sys/netinet/sctp_header.h                 |    27 +-
 head/sys/netinet/sctp_indata.c                 |   170 +-
 head/sys/netinet/sctp_indata.h                 |    22 +-
 head/sys/netinet/sctp_input.c                  |  1134 +++++-----
 head/sys/netinet/sctp_input.h                  |    24 +-
 head/sys/netinet/sctp_lock_bsd.h               |    15 +-
 head/sys/netinet/sctp_os.h                     |    12 +-
 head/sys/netinet/sctp_os_bsd.h                 |    45 +-
 head/sys/netinet/sctp_output.c                 |  1250 +++--------
 head/sys/netinet/sctp_output.h                 |    41 +-
 head/sys/netinet/sctp_pcb.c                    |   299 +--
 head/sys/netinet/sctp_pcb.h                    |    22 +-
 head/sys/netinet/sctp_peeloff.c                |    10 +-
 head/sys/netinet/sctp_peeloff.h                |    14 +-
 head/sys/netinet/sctp_ss_functions.c           |     8 +-
 head/sys/netinet/sctp_structs.h                |    14 +-
 head/sys/netinet/sctp_sysctl.c                 |    21 +-
 head/sys/netinet/sctp_sysctl.h                 |    17 +-
 head/sys/netinet/sctp_timer.c                  |    20 +-
 head/sys/netinet/sctp_timer.h                  |    11 +-
 head/sys/netinet/sctp_uio.h                    |    99 +-
 head/sys/netinet/sctp_usrreq.c                 |   180 +-
 head/sys/netinet/sctp_var.h                    |     8 +-
 head/sys/netinet/sctputil.c                    |   774 ++++---
 head/sys/netinet/sctputil.h                    |    41 +-
 head/sys/netinet/tcp_hostcache.c               |     4 +-
 head/sys/netinet/tcp_input.c                   |    61 +-
 head/sys/netinet/tcp_lro.c                     |   888 +++++---
 head/sys/netinet/tcp_lro.h                     |   123 +-
 head/sys/netinet/tcp_offload.c                 |   209 +-
 head/sys/netinet/tcp_offload.h                 |   364 +---
 head/sys/netinet/tcp_output.c                  |    69 +-
 head/sys/netinet/tcp_subr.c                    |    36 +-
 head/sys/netinet/tcp_syncache.c                |   147 +-
 head/sys/netinet/tcp_syncache.h                |    21 +-
 head/sys/netinet/tcp_timer.c                   |     7 +-
 head/sys/netinet/tcp_timewait.c                |    11 +-
 head/sys/netinet/tcp_usrreq.c                  |    77 +-
 head/sys/netinet/tcp_var.h                     |     4 +-
 head/sys/netinet/toecore.c                     |   575 +++++
 head/sys/netinet/toecore.h                     |   130 +
 head/sys/netinet/toedev.h                      |   162 -
 head/sys/netinet/udp_usrreq.c                  |    18 +-
 head/sys/pc98/conf/GENERIC                     |     4 +-
 head/sys/pc98/include/vdso.h                   |     6 +
 head/sys/pc98/pc98/machdep.c                   |    11 +-
 head/sys/powerpc/aim/locore32.S                |     9 +-
 head/sys/powerpc/aim/locore64.S                |     9 +-
 head/sys/powerpc/aim/mmu_oea.c                 |   165 +-
 head/sys/powerpc/aim/mmu_oea64.c               |   186 +-
 head/sys/powerpc/aim/moea64_native.c           |    47 +-
 head/sys/powerpc/aim/slb.c                     |     6 +-
 head/sys/powerpc/aim/swtch32.S                 |     5 +-
 head/sys/powerpc/aim/swtch64.S                 |     5 +-
 head/sys/powerpc/booke/locore.S                |    22 +-
 head/sys/powerpc/booke/machdep.c               |    82 +-
 head/sys/powerpc/booke/machdep_e500.c          |   158 +
 head/sys/powerpc/booke/machdep_ppc4xx.c        |   219 ++
 head/sys/powerpc/booke/platform_bare.c         |    63 +-
 head/sys/powerpc/booke/pmap.c                  |    52 +-
 head/sys/powerpc/booke/trap.c                  |     9 +-
 head/sys/powerpc/booke/trap_subr.S             |     4 +-
 head/sys/powerpc/conf/DEFAULTS                 |     4 +-
 head/sys/powerpc/conf/GENERIC                  |    11 +-
 head/sys/powerpc/conf/GENERIC64                |    22 +-
 head/sys/powerpc/conf/MPC85XX                  |     5 +-
 head/sys/powerpc/conf/NOTES                    |     5 +-
 head/sys/powerpc/include/_stdint.h             |     8 +-
 head/sys/powerpc/include/_types.h              |     6 +-
 head/sys/powerpc/include/atomic.h              |    61 +-
 head/sys/powerpc/include/cpu.h                 |     4 +-
 head/sys/powerpc/include/cpufunc.h             |    18 +-
 head/sys/powerpc/include/elf.h                 |     5 +-
 head/sys/powerpc/include/hid.h                 |    55 +-
 head/sys/powerpc/include/in_cksum.h            |     6 +-
 head/sys/powerpc/include/machdep.h             |    39 +
 head/sys/powerpc/include/pcpu.h                |     4 +-
 head/sys/powerpc/include/pio.h                 |    57 +-
 head/sys/powerpc/include/pmap.h                |    19 +-
 head/sys/powerpc/include/profile.h             |     9 +-
 head/sys/powerpc/include/psl.h                 |    30 +-
 head/sys/powerpc/include/pte.h                 |    29 +-
 head/sys/powerpc/include/spr.h                 |   228 +-
 head/sys/powerpc/include/tlb.h                 |    86 +-
 head/sys/powerpc/include/trap.h                |     4 +-
 head/sys/powerpc/include/ucontext.h            |     8 +-
 head/sys/powerpc/include/vdso.h                |    41 +
 head/sys/powerpc/include/vmparam.h             |     4 +-
 head/sys/powerpc/mpc85xx/fsl_sdhc.c            |  1306 ++++++++++++
 head/sys/powerpc/mpc85xx/fsl_sdhc.h            |   297 ++
 head/sys/powerpc/mpc85xx/i2c.c                 |     5 +-
 head/sys/powerpc/mpc85xx/lbc.c                 |   303 ++-
 head/sys/powerpc/mpc85xx/lbc.h                 |    62 +-
 head/sys/powerpc/mpc85xx/mpc85xx.c             |    13 +-
 head/sys/powerpc/mpc85xx/nexus.c               |    62 +-
 head/sys/powerpc/mpc85xx/openpic_fdt.c         |    93 -
 head/sys/powerpc/mpc85xx/pci_fdt.c             |    11 +-
 head/sys/powerpc/powermac/hrowpic.c            |     4 +-
 head/sys/powerpc/powerpc/atomic.S              |   137 -
 head/sys/powerpc/powerpc/bus_machdep.c         |    82 +-
 head/sys/powerpc/powerpc/cpu.c                 |    26 +-
 head/sys/powerpc/powerpc/db_trace.c            |     6 +-
 head/sys/powerpc/powerpc/gdb_machdep.c         |     4 +-
 head/sys/powerpc/powerpc/genassym.c            |    28 +-
 head/sys/powerpc/powerpc/mmu_if.m              |    12 +-
 head/sys/powerpc/powerpc/openpic_fdt.c         |    93 +
 head/sys/powerpc/powerpc/platform.c            |     6 +-
 head/sys/powerpc/powerpc/pmap_dispatch.c       |    24 +-
 head/sys/sparc64/conf/GENERIC                  |    35 +-
 head/sys/sparc64/include/_stdint.h             |     8 +-
 head/sys/sparc64/include/_types.h              |     6 +-
 head/sys/sparc64/include/elf.h                 |     3 +-
 head/sys/sparc64/include/in_cksum.h            |     6 +-
 head/sys/sparc64/include/intr_machdep.h        |     6 +-
 head/sys/sparc64/include/pcb.h                 |     4 +-
 head/sys/sparc64/include/pmap.h                |     5 +-
 head/sys/sparc64/include/vdso.h                |    34 +
 head/sys/sparc64/sparc64/intr_machdep.c        |     9 +-
 head/sys/sparc64/sparc64/machdep.c             |     4 +-
 head/sys/sparc64/sparc64/pmap.c                |   100 +-
 head/sys/sparc64/sparc64/tsb.c                 |     5 +-
 358 files changed, 25573 insertions(+), 9531 deletions(-)

diffs (53825 lines):

diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_machdep.c
--- a/head/sys/amd64/acpica/acpi_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/acpica/acpi_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/acpica/acpi_machdep.c 235556 2012-05-17 17:58:53Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,8 +44,6 @@
 
 #include <machine/nexusvar.h>
 
-SYSCTL_DECL(_debug_acpi);
-
 int acpi_resume_beep;
 TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep);
 SYSCTL_INT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_switch.S
--- a/head/sys/amd64/acpica/acpi_switch.S	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,177 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: head/sys/amd64/acpica/acpi_switch.S 230958 2012-02-03 21:24:28Z jkim $
- */
-
-#include <machine/asmacros.h>
-#include <machine/specialreg.h>
-
-#include "acpi_wakedata.h"
-#include "assym.s"
-
-#define	WAKEUP_CTX(member)	wakeup_ ## member - wakeup_ctx(%rsi)
-
-ENTRY(acpi_restorecpu)
-	/* Switch to KPML4phys. */
-	movq	%rdi, %cr3
-
-	/* Restore GDT. */
-	lgdt	WAKEUP_CTX(gdt)
-	jmp	1f
-1:
-
-	/* Fetch PCB. */
-	movq	WAKEUP_CTX(pcb), %rdi
-
-	/* Force kernel segment registers. */
-	movl	$KDSEL, %eax
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %ss
-	movl	$KUF32SEL, %eax
-	movw	%ax, %fs
-	movl	$KUG32SEL, %eax
-	movw	%ax, %gs
-
-	movl	$MSR_FSBASE, %ecx
-	movl	PCB_FSBASE(%rdi), %eax
-	movl	4 + PCB_FSBASE(%rdi), %edx
-	wrmsr
-	movl	$MSR_GSBASE, %ecx
-	movl	PCB_GSBASE(%rdi), %eax
-	movl	4 + PCB_GSBASE(%rdi), %edx
-	wrmsr
-	movl	$MSR_KGSBASE, %ecx
-	movl	PCB_KGSBASE(%rdi), %eax
-	movl	4 + PCB_KGSBASE(%rdi), %edx
-	wrmsr
-
-	/* Restore EFER. */
-	movl	$MSR_EFER, %ecx
-	movl	WAKEUP_CTX(efer), %eax
-	wrmsr
-
-	/* Restore fast syscall stuff. */
-	movl	$MSR_STAR, %ecx
-	movl	WAKEUP_CTX(star), %eax
-	movl	4 + WAKEUP_CTX(star), %edx
-	wrmsr
-	movl	$MSR_LSTAR, %ecx
-	movl	WAKEUP_CTX(lstar), %eax
-	movl	4 + WAKEUP_CTX(lstar), %edx
-	wrmsr
-	movl	$MSR_CSTAR, %ecx
-	movl	WAKEUP_CTX(cstar), %eax
-	movl	4 + WAKEUP_CTX(cstar), %edx
-	wrmsr
-	movl	$MSR_SF_MASK, %ecx
-	movl	WAKEUP_CTX(sfmask), %eax
-	wrmsr
-
-	/* Restore CR0 except for FPU mode. */
-	movq	PCB_CR0(%rdi), %rax
-	andq	$~(CR0_EM | CR0_TS), %rax
-	movq	%rax, %cr0
-
-	/* Restore CR2 and CR4. */
-	movq	PCB_CR2(%rdi), %rax
-	movq	%rax, %cr2
-	movq	PCB_CR4(%rdi), %rax
-	movq	%rax, %cr4
-
-	/* Restore descriptor tables. */
-	lidt	PCB_IDT(%rdi)
-	lldt	PCB_LDT(%rdi)
-
-#define	SDT_SYSTSS	9
-#define	SDT_SYSBSY	11
-
-	/* Clear "task busy" bit and reload TR. */
-	movq	PCPU(TSS), %rax
-	andb	$(~SDT_SYSBSY | SDT_SYSTSS), 5(%rax)
-	movw	PCB_TR(%rdi), %ax
-	ltr	%ax
-
-#undef	SDT_SYSTSS
-#undef	SDT_SYSBSY
-
-	/* Restore debug registers. */
-	movq	PCB_DR0(%rdi), %rax
-	movq	%rax, %dr0
-	movq	PCB_DR1(%rdi), %rax
-	movq	%rax, %dr1
-	movq	PCB_DR2(%rdi), %rax
-	movq	%rax, %dr2
-	movq	PCB_DR3(%rdi), %rax
-	movq	%rax, %dr3
-	movq	PCB_DR6(%rdi), %rax
-	movq	%rax, %dr6
-	movq	PCB_DR7(%rdi), %rax
-	movq	%rax, %dr7
-
-	/* Restore FPU state. */
-	fninit
-	movq	WAKEUP_CTX(fpusave), %rbx
-	movq	WAKEUP_CTX(xsmask), %rax
-	testq	%rax, %rax
-	jz	1f
-	movq	%rax, %rdx
-	shrq	$32, %rdx
-	movl	$XCR0, %ecx
-/*	xsetbv	*/
-	.byte	0x0f, 0x01, 0xd1
-/*	xrstor	(%rbx) */
-	.byte	0x0f, 0xae, 0x2b
-	jmp	2f
-1:
-	fxrstor	(%rbx)
-2:
-
-	/* Reload CR0. */
-	movq	PCB_CR0(%rdi), %rax
-	movq	%rax, %cr0
-
-	/* Restore other callee saved registers. */
-	movq	PCB_R15(%rdi), %r15
-	movq	PCB_R14(%rdi), %r14
-	movq	PCB_R13(%rdi), %r13
-	movq	PCB_R12(%rdi), %r12
-	movq	PCB_RBP(%rdi), %rbp
-	movq	PCB_RSP(%rdi), %rsp
-	movq	PCB_RBX(%rdi), %rbx
-
-	/* Restore return address. */
-	movq	PCB_RIP(%rdi), %rax
-	movq	%rax, (%rsp)
-
-	/* Indicate the CPU is resumed. */
-	xorl	%eax, %eax
-	movl	%eax, WAKEUP_CTX(cpu)
-
-	ret
-END(acpi_restorecpu)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_wakecode.S
--- a/head/sys/amd64/acpica/acpi_wakecode.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/acpica/acpi_wakecode.S	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/acpica/acpi_wakecode.S 231787 2012-02-15 22:10:33Z jkim $
+ * $FreeBSD: head/sys/amd64/acpica/acpi_wakecode.S 237037 2012-06-13 22:53:56Z jkim $
  */
 
 #include <machine/asmacros.h>
@@ -219,10 +219,14 @@
 	mov	$bootdata64 - bootgdt, %eax
 	mov	%ax, %ds
 
-	/* Restore arguments and return. */
-	movq	wakeup_kpml4 - wakeup_start(%rbx), %rdi
-	movq	wakeup_ctx - wakeup_start(%rbx), %rsi
-	movq	wakeup_retaddr - wakeup_start(%rbx), %rax
+	/* Restore arguments. */
+	movq	wakeup_pcb - wakeup_start(%rbx), %rdi
+	movq	wakeup_ret - wakeup_start(%rbx), %rax
+
+	/* Restore GDT. */
+	lgdt	wakeup_gdt - wakeup_start(%rbx)
+
+	/* Jump to return address. */
 	jmp	*%rax
 
 	.data
@@ -268,34 +272,11 @@
 	.long	bootgdt - wakeup_start	/* Offset plus %ds << 4 */
 
 	ALIGN_DATA
-wakeup_retaddr:
-	.quad	0
-wakeup_kpml4:
-	.quad	0
-
-wakeup_ctx:
-	.quad	0
 wakeup_pcb:
 	.quad	0
-wakeup_fpusave:
+wakeup_ret:
 	.quad	0
 wakeup_gdt:
 	.word	0
 	.quad	0
-
-	ALIGN_DATA
-wakeup_efer:
-	.quad	0
-wakeup_star:
-	.quad	0
-wakeup_lstar:
-	.quad	0
-wakeup_cstar:
-	.quad	0
-wakeup_sfmask:
-	.quad	0
-wakeup_xsmask:
-	.quad	0
-wakeup_cpu:
-	.long	0
 dummy:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_wakeup.c
--- a/head/sys/amd64/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,420 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/acpica/acpi_wakeup.c 233704 2012-03-30 17:03:06Z jkim $");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/eventhandler.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/memrange.h>
-#include <sys/smp.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-#include <machine/clock.h>
-#include <machine/intr_machdep.h>
-#include <x86/mca.h>
-#include <machine/pcb.h>
-#include <machine/pmap.h>
-#include <machine/specialreg.h>
-#include <machine/md_var.h>
-
-#ifdef SMP
-#include <x86/apicreg.h>
-#include <machine/smp.h>
-#include <machine/vmparam.h>
-#endif
-
-#include <contrib/dev/acpica/include/acpi.h>
-
-#include <dev/acpica/acpivar.h>
-
-#include "acpi_wakecode.h"
-#include "acpi_wakedata.h"
-
-/* Make sure the code is less than a page and leave room for the stack. */
-CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
-
-extern int		acpi_resume_beep;
-extern int		acpi_reset_video;
-
-#ifdef SMP
-extern struct pcb	**susppcbs;
-extern void		**suspfpusave;
-#else
-static struct pcb	**susppcbs;
-static void		**suspfpusave;
-#endif
-
-int			acpi_restorecpu(uint64_t, vm_offset_t);
-
-static void		*acpi_alloc_wakeup_handler(void);
-static void		acpi_stop_beep(void *);
-
-#ifdef SMP
-static int		acpi_wakeup_ap(struct acpi_softc *, int);
-static void		acpi_wakeup_cpus(struct acpi_softc *, const cpuset_t *);
-#endif
-
-#define	WAKECODE_VADDR(sc)	((sc)->acpi_wakeaddr + (3 * PAGE_SIZE))
-#define	WAKECODE_PADDR(sc)	((sc)->acpi_wakephys + (3 * PAGE_SIZE))
-#define	WAKECODE_FIXUP(offset, type, val) do	{	\
-	type	*addr;					\
-	addr = (type *)(WAKECODE_VADDR(sc) + offset);	\
-	*addr = val;					\
-} while (0)
-
-static void
-acpi_stop_beep(void *arg)
-{
-
-	if (acpi_resume_beep != 0)
-		timer_spkr_release();
-}
-
-#ifdef SMP
-static int
-acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
-{
-	int		vector = (WAKECODE_PADDR(sc) >> 12) & 0xff;
-	int		apic_id = cpu_apic_ids[cpu];
-	int		ms;
-
-	WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[cpu]);
-	WAKECODE_FIXUP(wakeup_fpusave, void *, suspfpusave[cpu]);
-	WAKECODE_FIXUP(wakeup_gdt, uint16_t, susppcbs[cpu]->pcb_gdt.rd_limit);
-	WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t,
-	    susppcbs[cpu]->pcb_gdt.rd_base);
-	WAKECODE_FIXUP(wakeup_cpu, int, cpu);
-
-	/* do an INIT IPI: assert RESET */
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
-
-	/* wait for pending status end */
-	lapic_ipi_wait(-1);
-
-	/* do an INIT IPI: deassert RESET */
-	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
-
-	/* wait for pending status end */
-	DELAY(10000);		/* wait ~10mS */
-	lapic_ipi_wait(-1);
-
-	/*
-	 * next we do a STARTUP IPI: the previous INIT IPI might still be
-	 * latched, (P5 bug) this 1st STARTUP would then terminate
-	 * immediately, and the previously started INIT IPI would continue. OR
-	 * the previous INIT IPI has already run. and this STARTUP IPI will
-	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
-	 * will run.
-	 */
-
-	/* do a STARTUP IPI */
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
-	    vector, apic_id);
-	lapic_ipi_wait(-1);
-	DELAY(200);		/* wait ~200uS */
-
-	/*
-	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
-	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
-	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
-	 * recognized after hardware RESET or INIT IPI.
-	 */
-
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
-	    vector, apic_id);
-	lapic_ipi_wait(-1);
-	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (*(int *)(WAKECODE_VADDR(sc) + wakeup_cpu) == 0)
-			return (1);	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return (0);		/* return FAILURE */
-}
-
-#define	WARMBOOT_TARGET		0
-#define	WARMBOOT_OFF		(KERNBASE + 0x0467)
-#define	WARMBOOT_SEG		(KERNBASE + 0x0469)
-
-#define	CMOS_REG		(0x70)
-#define	CMOS_DATA		(0x71)
-#define	BIOS_RESET		(0x0f)
-#define	BIOS_WARM		(0x0a)
-
-static void
-acpi_wakeup_cpus(struct acpi_softc *sc, const cpuset_t *wakeup_cpus)
-{
-	uint32_t	mpbioswarmvec;
-	int		cpu;
-	u_char		mpbiosreason;
-
-	/* save the current value of the warm-start vector */
-	mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF);
-	outb(CMOS_REG, BIOS_RESET);
-	mpbiosreason = inb(CMOS_DATA);
-
-	/* setup a vector to our boot code */
-	*((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
-	*((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4;
-	outb(CMOS_REG, BIOS_RESET);
-	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
-
-	/* Wake up each AP. */
-	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		if (!CPU_ISSET(cpu, wakeup_cpus))
-			continue;
-		if (acpi_wakeup_ap(sc, cpu) == 0) {
-			/* restore the warmstart vector */
-			*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
-			panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)",
-			    cpu, cpu_apic_ids[cpu]);
-		}
-	}
-
-	/* restore the warmstart vector */
-	*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
-
-	outb(CMOS_REG, BIOS_RESET);
-	outb(CMOS_DATA, mpbiosreason);
-}
-#endif
-
-int
-acpi_sleep_machdep(struct acpi_softc *sc, int state)
-{
-#ifdef SMP
-	cpuset_t	wakeup_cpus;
-#endif
-	register_t	rf;
-	ACPI_STATUS	status;
-	int		ret;
-
-	ret = -1;
-
-	if (sc->acpi_wakeaddr == 0ul)
-		return (ret);
-
-#ifdef SMP
-	wakeup_cpus = all_cpus;
-	CPU_CLR(PCPU_GET(cpuid), &wakeup_cpus);
-#endif
-
-	if (acpi_resume_beep != 0)
-		timer_spkr_acquire();
-
-	AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc));
-
-	rf = intr_disable();
-	intr_suspend();
-
-	if (savectx(susppcbs[0])) {
-		ctx_fpusave(suspfpusave[0]);
-#ifdef SMP
-		if (!CPU_EMPTY(&wakeup_cpus) &&
-		    suspend_cpus(wakeup_cpus) == 0) {
-			device_printf(sc->acpi_dev, "Failed to suspend APs\n");
-			goto out;
-		}
-#endif
-
-		WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0));
-		WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0));
-
-		WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[0]);
-		WAKECODE_FIXUP(wakeup_fpusave, void *, suspfpusave[0]);
-		WAKECODE_FIXUP(wakeup_gdt, uint16_t,
-		    susppcbs[0]->pcb_gdt.rd_limit);
-		WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t,
-		    susppcbs[0]->pcb_gdt.rd_base);
-		WAKECODE_FIXUP(wakeup_cpu, int, 0);
-
-		/* Call ACPICA to enter the desired sleep state */
-		if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
-			status = AcpiEnterSleepStateS4bios();
-		else
-			status = AcpiEnterSleepState(state, acpi_sleep_flags);
-
-		if (status != AE_OK) {
-			device_printf(sc->acpi_dev,
-			    "AcpiEnterSleepState failed - %s\n",
-			    AcpiFormatException(status));
-			goto out;
-		}
-
-		for (;;)
-			ia32_pause();
-	} else {
-		pmap_init_pat();
-		load_cr3(susppcbs[0]->pcb_cr3);
-		initializecpu();
-		PCPU_SET(switchtime, 0);
-		PCPU_SET(switchticks, ticks);
-#ifdef SMP
-		if (!CPU_EMPTY(&wakeup_cpus))
-			acpi_wakeup_cpus(sc, &wakeup_cpus);
-#endif
-		ret = 0;
-	}
-
-out:
-#ifdef SMP
-	if (!CPU_EMPTY(&wakeup_cpus))
-		restart_cpus(wakeup_cpus);
-#endif
-
-	mca_resume();
-	intr_resume();
-	intr_restore(rf);
-
-	AcpiSetFirmwareWakingVector(0);
-
-	if (ret == 0 && mem_range_softc.mr_op != NULL &&
-	    mem_range_softc.mr_op->reinit != NULL)
-		mem_range_softc.mr_op->reinit(&mem_range_softc);
-
-	return (ret);
-}
-
-static void *
-acpi_alloc_wakeup_handler(void)
-{
-	void		*wakeaddr;
-	int		i;
-
-	/*
-	 * Specify the region for our wakeup code.  We want it in the low 1 MB
-	 * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA
-	 * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT),
-	 * and ROM area (0xa0000 and above).  The temporary page tables must be
-	 * page-aligned.
-	 */
-	wakeaddr = contigmalloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK, 0x500,
-	    0xa0000, PAGE_SIZE, 0ul);
-	if (wakeaddr == NULL) {
-		printf("%s: can't alloc wake memory\n", __func__);
-		return (NULL);
-	}
-	if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL,
-	    EVENTHANDLER_PRI_LAST) == NULL) {
-		printf("%s: can't register event handler\n", __func__);
-		contigfree(wakeaddr, 4 * PAGE_SIZE, M_DEVBUF);
-		return (NULL);
-	}
-	susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK);
-	suspfpusave = malloc(mp_ncpus * sizeof(void *), M_DEVBUF, M_WAITOK);
-	for (i = 0; i < mp_ncpus; i++) {
-		susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK);
-		suspfpusave[i] = alloc_fpusave(M_WAITOK);
-	}
-
-	return (wakeaddr);
-}
-
-void
-acpi_install_wakeup_handler(struct acpi_softc *sc)
-{
-	static void	*wakeaddr = NULL;
-	uint64_t	*pt4, *pt3, *pt2;
-	int		i;
-
-	if (wakeaddr != NULL)
-		return;
-
-	wakeaddr = acpi_alloc_wakeup_handler();
-	if (wakeaddr == NULL)
-		return;
-
-	sc->acpi_wakeaddr = (vm_offset_t)wakeaddr;
-	sc->acpi_wakephys = vtophys(wakeaddr);
-
-	bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode));
-
-	/* Patch GDT base address, ljmp targets and page table base address. */
-	WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + bootgdt);
-	WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_32);
-	WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_64);
-	WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys);
-
-	/* Save pointers to some global data. */
-	WAKECODE_FIXUP(wakeup_retaddr, void *, acpi_restorecpu);
-	WAKECODE_FIXUP(wakeup_kpml4, uint64_t, KPML4phys);
-	WAKECODE_FIXUP(wakeup_ctx, vm_offset_t,
-	    WAKECODE_VADDR(sc) + wakeup_ctx);
-	WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER));
-	WAKECODE_FIXUP(wakeup_star, uint64_t, rdmsr(MSR_STAR));
-	WAKECODE_FIXUP(wakeup_lstar, uint64_t, rdmsr(MSR_LSTAR));
-	WAKECODE_FIXUP(wakeup_cstar, uint64_t, rdmsr(MSR_CSTAR));
-	WAKECODE_FIXUP(wakeup_sfmask, uint64_t, rdmsr(MSR_SF_MASK));
-	WAKECODE_FIXUP(wakeup_xsmask, uint64_t, xsave_mask);
-
-	/* Build temporary page tables below realmode code. */
-	pt4 = wakeaddr;
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t);
-
-	/* Create the initial 1GB replicated page tables */
-	for (i = 0; i < 512; i++) {
-		/*
-		 * Each slot of the level 4 pages points
-		 * to the same level 3 page
-		 */
-		pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE);
-		pt4[i] |= PG_V | PG_RW | PG_U;
-
-		/*
-		 * Each slot of the level 3 pages points
-		 * to the same level 2 page
-		 */
-		pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE));
-		pt3[i] |= PG_V | PG_RW | PG_U;
-
-		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
-		pt2[i] = i * (2 * 1024 * 1024);
-		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
-	}
-
-	if (bootverbose)
-		device_printf(sc->acpi_dev, "wakeup code va %p pa %p\n",
-		    (void *)sc->acpi_wakeaddr, (void *)sc->acpi_wakephys);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/cpu_switch.S
--- a/head/sys/amd64/amd64/cpu_switch.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/cpu_switch.S	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/amd64/cpu_switch.S 232226 2012-02-27 17:28:22Z jhb $
+ * $FreeBSD: head/sys/amd64/amd64/cpu_switch.S 238450 2012-07-14 15:48:30Z kib $
  */
 
 #include <machine/asmacros.h>
@@ -122,8 +122,10 @@
 1:	movq	%rdx,%rcx
 	movl	xsave_mask,%eax
 	movl	xsave_mask+4,%edx
-/*	xsave	(%r8) */
-	.byte	0x41,0x0f,0xae,0x20
+	.globl	ctx_switch_xsave
+ctx_switch_xsave:
+	/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
+	xsave	(%r8)
 	movq	%rcx,%rdx
 2:	smsw	%ax
 	orb	$CR0_TS,%al
@@ -357,6 +359,30 @@
 	rdmsr
 	movl	%eax,PCB_KGSBASE(%rdi)
 	movl	%edx,PCB_KGSBASE+4(%rdi)
+	movl	$MSR_EFER,%ecx
+	rdmsr
+	movl	%eax,PCB_EFER(%rdi)
+	movl	%edx,PCB_EFER+4(%rdi)
+	movl	$MSR_STAR,%ecx
+	rdmsr
+	movl	%eax,PCB_STAR(%rdi)
+	movl	%edx,PCB_STAR+4(%rdi)
+	movl	$MSR_LSTAR,%ecx
+	rdmsr
+	movl	%eax,PCB_LSTAR(%rdi)
+	movl	%edx,PCB_LSTAR+4(%rdi)
+	movl	$MSR_CSTAR,%ecx
+	rdmsr
+	movl	%eax,PCB_CSTAR(%rdi)
+	movl	%edx,PCB_CSTAR+4(%rdi)
+	movl	$MSR_SF_MASK,%ecx
+	rdmsr
+	movl	%eax,PCB_SFMASK(%rdi)
+	movl	%edx,PCB_SFMASK+4(%rdi)
+	movl	xsave_mask,%eax
+	movl	%eax,PCB_XSMASK(%rdi)
+	movl	xsave_mask+4,%eax
+	movl	%eax,PCB_XSMASK+4(%rdi)
 
 	sgdt	PCB_GDT(%rdi)
 	sidt	PCB_IDT(%rdi)
@@ -370,6 +396,140 @@
 END(savectx)
 
 /*
+ * resumectx(pcb)
+ * Resuming processor state from pcb.
+ */     
+ENTRY(resumectx)
+	/* Switch to KPML4phys. */
+	movq	KPML4phys,%rax
+	movq	%rax,%cr3
+
+	/* Force kernel segment registers. */
+	movl	$KDSEL,%eax
+	movw	%ax,%ds
+	movw	%ax,%es
+	movw	%ax,%ss
+	movl	$KUF32SEL,%eax
+	movw	%ax,%fs
+	movl	$KUG32SEL,%eax
+	movw	%ax,%gs
+
+	movl	$MSR_FSBASE,%ecx
+	movl	PCB_FSBASE(%rdi),%eax
+	movl	4 + PCB_FSBASE(%rdi),%edx
+	wrmsr
+	movl	$MSR_GSBASE,%ecx
+	movl	PCB_GSBASE(%rdi),%eax
+	movl	4 + PCB_GSBASE(%rdi),%edx
+	wrmsr
+	movl	$MSR_KGSBASE,%ecx
+	movl	PCB_KGSBASE(%rdi),%eax
+	movl	4 + PCB_KGSBASE(%rdi),%edx
+	wrmsr
+
+	/* Restore EFER. */
+	movl	$MSR_EFER,%ecx
+	movl	PCB_EFER(%rdi),%eax
+	wrmsr
+
+	/* Restore fast syscall stuff. */
+	movl	$MSR_STAR,%ecx
+	movl	PCB_STAR(%rdi),%eax
+	movl	4 + PCB_STAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_LSTAR,%ecx
+	movl	PCB_LSTAR(%rdi),%eax
+	movl	4 + PCB_LSTAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_CSTAR,%ecx
+	movl	PCB_CSTAR(%rdi),%eax
+	movl	4 + PCB_CSTAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_SF_MASK,%ecx
+	movl	PCB_SFMASK(%rdi),%eax
+	wrmsr
+
+	/* Restore CR0 except for FPU mode. */
+	movq	PCB_CR0(%rdi),%rax
+	andq	$~(CR0_EM | CR0_TS),%rax
+	movq	%rax,%cr0
+
+	/* Restore CR2, CR4 and CR3. */
+	movq	PCB_CR2(%rdi),%rax
+	movq	%rax,%cr2
+	movq	PCB_CR4(%rdi),%rax
+	movq	%rax,%cr4
+	movq	PCB_CR3(%rdi),%rax
+	movq	%rax,%cr3
+
+	/* Restore descriptor tables. */
+	lidt	PCB_IDT(%rdi)
+	lldt	PCB_LDT(%rdi)
+
+#define	SDT_SYSTSS	9
+#define	SDT_SYSBSY	11
+
+	/* Clear "task busy" bit and reload TR. */
+	movq	PCPU(TSS),%rax
+	andb	$(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
+	movw	PCB_TR(%rdi),%ax
+	ltr	%ax
+
+#undef	SDT_SYSTSS
+#undef	SDT_SYSBSY
+
+	/* Restore debug registers. */
+	movq	PCB_DR0(%rdi),%rax
+	movq	%rax,%dr0
+	movq	PCB_DR1(%rdi),%rax
+	movq	%rax,%dr1
+	movq	PCB_DR2(%rdi),%rax
+	movq	%rax,%dr2
+	movq	PCB_DR3(%rdi),%rax
+	movq	%rax,%dr3
+	movq	PCB_DR6(%rdi),%rax
+	movq	%rax,%dr6
+	movq	PCB_DR7(%rdi),%rax
+	movq	%rax,%dr7
+
+	/* Restore FPU state. */
+	fninit
+	movq	PCB_FPUSUSPEND(%rdi),%rbx
+	movq	PCB_XSMASK(%rdi),%rax
+	testq	%rax,%rax
+	jz	1f
+	movq	%rax,%rdx
+	shrq	$32,%rdx
+	movl	$XCR0,%ecx
+	xsetbv
+	xrstor	(%rbx)
+	jmp	2f
+1:
+	fxrstor	(%rbx)
+2:
+
+	/* Reload CR0. */
+	movq	PCB_CR0(%rdi),%rax
+	movq	%rax,%cr0
+
+	/* Restore other callee saved registers. */
+	movq	PCB_R15(%rdi),%r15
+	movq	PCB_R14(%rdi),%r14
+	movq	PCB_R13(%rdi),%r13
+	movq	PCB_R12(%rdi),%r12
+	movq	PCB_RBP(%rdi),%rbp
+	movq	PCB_RSP(%rdi),%rsp
+	movq	PCB_RBX(%rdi),%rbx
+
+	/* Restore return address. */
+	movq	PCB_RIP(%rdi),%rax
+	movq	%rax,(%rsp)
+
+	xorl	%eax,%eax
+	ret
+END(resumectx)
+
+/*
  * Wrapper around fpusave to care about TS0_CR.
  */
 ENTRY(ctx_fpusave)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/db_disasm.c
--- a/head/sys/amd64/amd64/db_disasm.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/db_disasm.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,12 +25,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/db_disasm.c 238166 2012-07-06 14:25:59Z jhb $");
 
 /*
  * Instruction disassembler.
  */
 #include <sys/param.h>
+#include <sys/libkern.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_access.h>
@@ -47,7 +48,9 @@
 #define	DBLR	5
 #define	EXTR	6
 #define	SDEP	7
-#define	NONE	8
+#define	ADEP	8
+#define	ESC	9
+#define	NONE	10
 
 /*
  * REX prefix and bits
@@ -67,6 +70,7 @@
 #define	Eb	4			/* address, byte size */
 #define	R	5			/* register, in 'reg' field */
 #define	Rw	6			/* word register, in 'reg' field */
+#define	Rq	39			/* quad register, in 'reg' field */
 #define	Ri	7			/* register in instruction */
 #define	S	8			/* segment reg, in 'reg' field */
 #define	Si	9			/* segment reg, in instruction */
@@ -120,6 +124,45 @@
 					   (or pointer to table) */
 };
 
+static const struct inst db_inst_0f388x[] = {
+/*80*/	{ "",	   TRUE,  SDEP,  op2(E, Rq),  "invept" },
+/*81*/	{ "",	   TRUE,  SDEP,  op2(E, Rq),  "invvpid" },
+/*82*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*83*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*84*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*85*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*86*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*87*/	{ "",	   FALSE, NONE,  0,	      0 },
+
+/*88*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*89*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8a*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8b*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8c*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8d*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8e*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8f*/	{ "",	   FALSE, NONE,  0,	      0 },
+};
+
+static const struct inst * const db_inst_0f38[] = {
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	db_inst_0f388x,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0
+};
+
 static const char * const db_Grp6[] = {
 	"sldt",
 	"str",
@@ -160,8 +203,8 @@
 	"",
 	"",
 	"",
-	"",
-	""
+	"vmptrld",
+	"vmptrst"
 };
 
 static const char * const db_Grp15[] = {
@@ -169,9 +212,9 @@
 	"fxrstor",
 	"ldmxcsr",
 	"stmxcsr",
-	"",
-	"",
-	"",
+	"xsave",
+	"xrstor",
+	"xsaveopt",
 	"clflush"
 };
 
@@ -236,7 +279,7 @@
 /*36*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*37*/	{ "getsec",FALSE, NONE,  0,	      0 },
 
-/*38*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*38*/	{ "",	   FALSE, ESC,  0,	      db_inst_0f38 },
 /*39*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*3a*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*3b*/	{ "",	   FALSE, NONE,  0,	      0 },
@@ -266,6 +309,26 @@
 /*4f*/	{ "cmovnle",TRUE, NONE,  op2(E, R),   0 },
 };
 
+static const struct inst db_inst_0f7x[] = {
+/*70*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*71*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*72*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*73*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*74*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*75*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*76*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*77*/	{ "",	   FALSE, NONE,  0,	      0 },
+
+/*78*/	{ "vmread", TRUE, NONE,  op2(Rq, E),  0 },
+/*79*/	{ "vmwrite",TRUE, NONE,  op2(E, Rq),  0 },
+/*7a*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7b*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7c*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7d*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7e*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7f*/	{ "",	   FALSE, NONE,  0,	      0 },
+};
+
 static const struct inst db_inst_0f8x[] = {
 /*80*/	{ "jo",    FALSE, NONE,  op1(Dl),     0 },
 /*81*/	{ "jno",   FALSE, NONE,  op1(Dl),     0 },
@@ -373,7 +436,7 @@
 	db_inst_0f4x,
 	0,
 	0,
-	0,
+	db_inst_0f7x,
 	db_inst_0f8x,
 	db_inst_0f9x,
 	db_inst_0fax,
@@ -582,7 +645,7 @@
 /*0c*/	{ "or",    FALSE, BYTE,  op2(I, A),  0 },
 /*0d*/	{ "or",    FALSE, LONG,  op2(I, A),  0 },
 /*0e*/	{ "push",  FALSE, NONE,  op1(Si),    0 },
-/*0f*/	{ "",      FALSE, NONE,  0,	     0 },
+/*0f*/	{ "",      FALSE, ESC,   0,	     db_inst_0f },
 
 /*10*/	{ "adc",   TRUE,  BYTE,  op2(R, E),  0 },
 /*11*/	{ "adc",   TRUE,  LONG,  op2(R, E),  0 },
@@ -738,8 +801,8 @@
 /*96*/	{ "xchg",  FALSE, LONG,  op2(A, Ri),  0 },
 /*97*/	{ "xchg",  FALSE, LONG,  op2(A, Ri),  0 },
 
-/*98*/	{ "cbw",   FALSE, SDEP,  0,	      "cwde" },	/* cbw/cwde */
-/*99*/	{ "cwd",   FALSE, SDEP,  0,	      "cdq"  },	/* cwd/cdq */
+/*98*/	{ "cwde",  FALSE, SDEP,  0,	      "cbw" },
+/*99*/	{ "cdq",   FALSE, SDEP,  0,	      "cwd" },
 /*9a*/	{ "lcall", FALSE, NONE,  op1(OS),     0 },
 /*9b*/	{ "wait",  FALSE, NONE,  0,	      0 },
 /*9c*/	{ "pushf", FALSE, LONG,  0,	      0 },
@@ -822,7 +885,7 @@
 /*e0*/	{ "loopne",FALSE, NONE,  op1(Db),     0 },
 /*e1*/	{ "loope", FALSE, NONE,  op1(Db),     0 },
 /*e2*/	{ "loop",  FALSE, NONE,  op1(Db),     0 },
-/*e3*/	{ "jcxz",  FALSE, SDEP,  op1(Db),     "jecxz" },
+/*e3*/	{ "jrcxz", FALSE, ADEP,  op1(Db),     "jecxz" },
 /*e4*/	{ "in",    FALSE, BYTE,  op2(Ib, A),  0 },
 /*e5*/	{ "in",    FALSE, LONG,  op2(Ib, A) , 0 },
 /*e6*/	{ "out",   FALSE, BYTE,  op2(A, Ib),  0 },
@@ -1208,14 +1271,6 @@
 	    if (prefix) {
 		get_value_inc(inst, loc, 1, FALSE);
 	    }
-	    if (rep == TRUE) {
-		if (inst == 0x90) {
-		    db_printf("pause\n");
-		    return (loc);
-		}
-		db_printf("repe ");	/* XXX repe VS rep */
-		rep = FALSE;
-	    }
 	} while (prefix);
 
 	if (inst >= 0xd8 && inst <= 0xdf) {
@@ -1224,9 +1279,10 @@
 	    return (loc);
 	}
 
-	if (inst == 0x0f) {
+	ip = &db_inst_table[inst];
+	while (ip->i_size == ESC) {
 	    get_value_inc(inst, loc, 1, FALSE);
-	    ip = db_inst_0f[inst>>4];
+	    ip = ((const struct inst * const *)ip->i_extra)[inst>>4];
 	    if (ip == 0) {
 		ip = &db_bad_inst;
 	    }
@@ -1234,8 +1290,6 @@
 		ip = &ip[inst&0xf];
 	    }
 	}
-	else
-	    ip = &db_inst_table[inst];
 
 	if (ip->i_has_modrm) {
 	    get_value_inc(regmodrm, loc, 1, FALSE);
@@ -1269,6 +1323,26 @@
 	/* Special cases that don't fit well in the tables. */
 	if (ip->i_extra == db_Grp7 && f_mod(rex, regmodrm) == 3) {
 		switch (regmodrm) {
+		case 0xc1:
+			i_name = "vmcall";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc2:
+			i_name = "vmlaunch";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc3:
+			i_name = "vmresume";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc4:
+			i_name = "vmxoff";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		case 0xc8:
 			i_name = "monitor";
 			i_size = NONE;
@@ -1279,11 +1353,26 @@
 			i_size = NONE;
 			i_mode = 0;
 			break;
+		case 0xd0:
+			i_name = "xgetbv";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xd1:
+			i_name = "xsetbv";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		case 0xf8:
 			i_name = "swapgs";
 			i_size = NONE;
 			i_mode = 0;
 			break;
+		case 0xf9:
+			i_name = "rdtscp";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		}
 	}
 	if (ip->i_extra == db_Grp15 && f_mod(rex, regmodrm) == 3) {
@@ -1292,8 +1381,42 @@
 		i_mode = 0;
 	}
 
+	/* Handle instructions identified by mandatory prefixes. */
+	if (rep == TRUE) {
+	    if (inst == 0x90) {
+		i_name = "pause";
+		i_size = NONE;
+		i_mode = 0;
+		rep = FALSE;
+	    } else if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+		f_reg(rex, regmodrm) == 0x6) {
+		i_name = "vmxon";
+		rep = FALSE;
+	    }
+	}
+	if (size == WORD) {
+	    if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+		f_reg(rex, regmodrm) == 0x6) {
+		i_name = "vmclear";
+	    }
+	}
+	if (rex & REX_W) {
+	    if (strcmp(i_name, "cwde") == 0)
+		i_name = "cdqe";
+	    else if (strcmp(i_name, "cmpxchg8b") == 0)
+		i_name = "cmpxchg16b";
+	}
+
+	if (rep == TRUE)
+	    db_printf("repe ");	/* XXX repe VS rep */
+
 	if (i_size == SDEP) {
-	    if (size == WORD)
+	    if (size == LONG)
+		db_printf("%s", i_name);
+	    else
+		db_printf("%s", (const char *)ip->i_extra);
+	} else if (i_size == ADEP) {
+	    if (short_addr == FALSE)
 		db_printf("%s", i_name);
 	    else
 		db_printf("%s", (const char *)ip->i_extra);
@@ -1366,6 +1489,10 @@
 		    db_printf("%s", db_reg[rex != 0 ? 1 : 0][WORD][f_reg(rex, regmodrm)]);
 		    break;
 
+		case Rq:
+		    db_printf("%s", db_reg[rex != 0 ? 1 : 0][QUAD][f_reg(rex, regmodrm)]);
+		    break;
+
 		case Ri:
 		    db_printf("%s", db_reg[0][QUAD][f_rm(rex, inst)]);
 		    break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/fpu.c
--- a/head/sys/amd64/amd64/fpu.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/fpu.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/fpu.c 230766 2012-01-30 07:53:33Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/fpu.c 238671 2012-07-21 13:53:00Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -73,10 +73,7 @@
 #define	fxrstor(addr)		__asm __volatile("fxrstor %0" : : "m" (*(addr)))
 #define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
 #define	ldmxcsr(csr)		__asm __volatile("ldmxcsr %0" : : "m" (csr))
-#define	start_emulating()	__asm __volatile( \
-				    "smsw %%ax; orb %0,%%al; lmsw %%ax" \
-				    : : "n" (CR0_TS) : "ax")
-#define	stop_emulating()	__asm __volatile("clts")
+#define	stmxcsr(addr)		__asm __volatile("stmxcsr %0" : : "m" (*(addr)))
 
 static __inline void
 xrstor(char *addr, uint64_t mask)
@@ -85,9 +82,7 @@
 
 	low = mask;
 	hi = mask >> 32;
-	/* xrstor (%rdi) */
-	__asm __volatile(".byte	0x0f,0xae,0x2f" : :
-	    "a" (low), "d" (hi), "D" (addr));
+	__asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi));
 }
 
 static __inline void
@@ -97,20 +92,8 @@
 
 	low = mask;
 	hi = mask >> 32;
-	/* xsave (%rdi) */
-	__asm __volatile(".byte	0x0f,0xae,0x27" : :
-	    "a" (low), "d" (hi), "D" (addr) : "memory");
-}
-
-static __inline void
-xsetbv(uint32_t reg, uint64_t val)
-{
-	uint32_t low, hi;
-
-	low = val;
-	hi = val >> 32;
-	__asm __volatile(".byte 0x0f,0x01,0xd1" : :
-	    "c" (reg), "a" (low), "d" (hi));
+	__asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) :
+	    "memory");
 }
 
 #else	/* !(__GNUCLIKE_ASM && !lint) */
@@ -123,16 +106,14 @@
 void	fxsave(caddr_t addr);
 void	fxrstor(caddr_t addr);
 void	ldmxcsr(u_int csr);
-void	start_emulating(void);
-void	stop_emulating(void);
+void	stmxcsr(u_int csr);
 void	xrstor(char *addr, uint64_t mask);
 void	xsave(char *addr, uint64_t mask);
-void	xsetbv(uint32_t reg, uint64_t val);
 
 #endif	/* __GNUCLIKE_ASM && !lint */
 
-#define GET_FPU_CW(thread) ((thread)->td_pcb->pcb_save->sv_env.en_cw)
-#define GET_FPU_SW(thread) ((thread)->td_pcb->pcb_save->sv_env.en_sw)
+#define	start_emulating()	load_cr0(rcr0() | CR0_TS)
+#define	stop_emulating()	clts()
 
 CTASSERT(sizeof(struct savefpu) == 512);
 CTASSERT(sizeof(struct xstate_hdr) == 64);
@@ -141,7 +122,7 @@
 /*
  * This requirement is to make it easier for asm code to calculate
  * offset of the fpu save area from the pcb address. FPU save area
- * must by 64-bytes aligned.
+ * must be 64-byte aligned.
  */
 CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
 
@@ -150,10 +131,16 @@
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     NULL, 1, "Floating point instructions executed in hardware");
 
+static int use_xsaveopt;
 int use_xsave;			/* non-static for cpu_switch.S */
 uint64_t xsave_mask;		/* the same */
 static	struct savefpu *fpu_initialstate;
 
+struct xsave_area_elm_descr {
+	u_int	offset;
+	u_int	size;
+} *xsave_area_desc;
+
 void
 fpusave(void *addr)
 {
@@ -200,6 +187,17 @@
 	TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
 	xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 	xsave_mask &= xsave_mask_user;
+
+	cpuid_count(0xd, 0x1, cp);
+	if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
+		/*
+		 * Patch the XSAVE instruction in the cpu_switch code
+		 * to XSAVEOPT.  We assume that XSAVE encoding used
+		 * REX byte, and set the bit 4 of the r/m byte.
+		 */
+		ctx_switch_xsave[3] |= 0x10;
+		use_xsaveopt = 1;
+	}
 }
 
 /*
@@ -238,7 +236,7 @@
 
 	if (use_xsave) {
 		load_cr4(rcr4() | CR4_XSAVE);
-		xsetbv(XCR0, xsave_mask);
+		load_xcr(XCR0, xsave_mask);
 	}
 
 	/*
@@ -270,6 +268,7 @@
 fpuinitstate(void *arg __unused)
 {
 	register_t saveintr;
+	int cp[4], i, max_ext_n;
 
 	fpu_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF,
 	    M_WAITOK | M_ZERO);
@@ -291,6 +290,28 @@
 	 */
 	bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
 
+	/*
+	 * Create a table describing the layout of the CPU Extended
+	 * Save Area.
+	 */
+	if (use_xsaveopt) {
+		max_ext_n = flsl(xsave_mask);
+		xsave_area_desc = malloc(max_ext_n * sizeof(struct
+		    xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
+		/* x87 state */
+		xsave_area_desc[0].offset = 0;
+		xsave_area_desc[0].size = 160;
+		/* XMM */
+		xsave_area_desc[1].offset = 160;
+		xsave_area_desc[1].size = 288 - 160;
+
+		for (i = 2; i < max_ext_n; i++) {
+			cpuid_count(0xd, i, cp);
+			xsave_area_desc[i].offset = cp[1];
+			xsave_area_desc[i].size = cp[0];
+		}
+	}
+
 	start_emulating();
 	intr_restore(saveintr);
 }
@@ -306,7 +327,7 @@
 	critical_enter();
 	if (curthread == PCPU_GET(fpcurthread)) {
 		stop_emulating();
-		fpusave(PCPU_GET(curpcb)->pcb_save);
+		fpusave(curpcb->pcb_save);
 		start_emulating();
 		PCPU_SET(fpcurthread, 0);
 	}
@@ -492,25 +513,26 @@
 };
 
 /*
- * Preserve the FP status word, clear FP exceptions, then generate a SIGFPE.
+ * Read the FP status and control words, then generate si_code value
+ * for SIGFPE.  The error code chosen will be one of the
+ * FPE_... macros.  It will be sent as the second argument to old
+ * BSD-style signal handlers and as "siginfo_t->si_code" (second
+ * argument) to SA_SIGINFO signal handlers.
  *
- * Clearing exceptions is necessary mainly to avoid IRQ13 bugs.  We now
- * depend on longjmp() restoring a usable state.  Restoring the state
- * or examining it might fail if we didn't clear exceptions.
+ * Some time ago, we cleared the x87 exceptions with FNCLEX there.
+ * Clearing exceptions was necessary mainly to avoid IRQ13 bugs.  The
+ * usermode code which understands the FPU hardware enough to enable
+ * the exceptions, can also handle clearing the exception state in the
+ * handler.  The only consequence of not clearing the exception is the
+ * rethrow of the SIGFPE on return from the signal handler and
+ * reexecution of the corresponding instruction.
  *
- * The error code chosen will be one of the FPE_... macros. It will be
- * sent as the second argument to old BSD-style signal handlers and as
- * "siginfo_t->si_code" (second argument) to SA_SIGINFO signal handlers.
- *
- * XXX the FP state is not preserved across signal handlers.  So signal
- * handlers cannot afford to do FP unless they preserve the state or
- * longjmp() out.  Both preserving the state and longjmp()ing may be
- * destroyed by IRQ13 bugs.  Clearing FP exceptions is not an acceptable
- * solution for signals other than SIGFPE.
+ * For XMM traps, the exceptions were never cleared.
  */
 int
-fputrap()
+fputrap_x87(void)
 {
+	struct savefpu *pcb_save;
 	u_short control, status;
 
 	critical_enter();
@@ -521,19 +543,32 @@
 	 * wherever they are.
 	 */
 	if (PCPU_GET(fpcurthread) != curthread) {
-		control = GET_FPU_CW(curthread);
-		status = GET_FPU_SW(curthread);
+		pcb_save = curpcb->pcb_save;
+		control = pcb_save->sv_env.en_cw;
+		status = pcb_save->sv_env.en_sw;
 	} else {
 		fnstcw(&control);
 		fnstsw(&status);
 	}
 
-	if (PCPU_GET(fpcurthread) == curthread)
-		fnclex();
 	critical_exit();
 	return (fpetable[status & ((~control & 0x3f) | 0x40)]);
 }
 
+int
+fputrap_sse(void)
+{
+	u_int mxcsr;
+
+	critical_enter();
+	if (PCPU_GET(fpcurthread) != curthread)
+		mxcsr = curpcb->pcb_save->sv_env.en_mxcsr;
+	else
+		stmxcsr(&mxcsr);
+	critical_exit();
+	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
+}
+
 /*
  * Implement device not available (DNA) exception
  *
@@ -547,7 +582,6 @@
 void
 fpudna(void)
 {
-	struct pcb *pcb;
 
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == curthread) {
@@ -569,26 +603,31 @@
 	 * Record new context early in case frstor causes a trap.
 	 */
 	PCPU_SET(fpcurthread, curthread);
-	pcb = PCPU_GET(curpcb);
 
 	fpu_clean_state();
 
-	if ((pcb->pcb_flags & PCB_FPUINITDONE) == 0) {
+	if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) {
 		/*
 		 * This is the first time this thread has used the FPU or
 		 * the PCB doesn't contain a clean FPU state.  Explicitly
 		 * load an initial state.
+		 *
+		 * We prefer to restore the state from the actual save
+		 * area in PCB instead of directly loading from
+		 * fpu_initialstate, to ignite the XSAVEOPT
+		 * tracking engine.
 		 */
-		fpurestore(fpu_initialstate);
-		if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
-			fldcw(pcb->pcb_initial_fpucw);
-		if (PCB_USER_FPU(pcb))
-			set_pcb_flags(pcb,
+		bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
+		fpurestore(curpcb->pcb_save);
+		if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
+			fldcw(curpcb->pcb_initial_fpucw);
+		if (PCB_USER_FPU(curpcb))
+			set_pcb_flags(curpcb,
 			    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 		else
-			set_pcb_flags(pcb, PCB_FPUINITDONE);
+			set_pcb_flags(curpcb, PCB_FPUINITDONE);
 	} else
-		fpurestore(pcb->pcb_save);
+		fpurestore(curpcb->pcb_save);
 	critical_exit();
 }
 
@@ -614,6 +653,9 @@
 fpugetregs(struct thread *td)
 {
 	struct pcb *pcb;
+	uint64_t *xstate_bv, bit;
+	char *sa;
+	int max_ext_n, i;
 
 	pcb = td->td_pcb;
 	if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) {
@@ -631,6 +673,25 @@
 		return (_MC_FPOWNED_FPU);
 	} else {
 		critical_exit();
+		if (use_xsaveopt) {
+			/*
+			 * Handle partially saved state.
+			 */
+			sa = (char *)get_pcb_user_save_pcb(pcb);
+			xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) +
+			    offsetof(struct xstate_hdr, xstate_bv));
+			max_ext_n = flsl(xsave_mask);
+			for (i = 0; i < max_ext_n; i++) {
+				bit = 1 << i;
+				if ((*xstate_bv & bit) != 0)
+					continue;
+				bcopy((char *)fpu_initialstate +
+				    xsave_area_desc[i].offset,
+				    sa + xsave_area_desc[i].offset,
+				    xsave_area_desc[i].size);
+				*xstate_bv |= bit;
+			}
+		}
 		return (_MC_FPOWNED_PCB);
 	}
 }
@@ -900,16 +961,14 @@
 int
 fpu_kern_thread(u_int flags)
 {
-	struct pcb *pcb;
 
-	pcb = PCPU_GET(curpcb);
 	KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0,
 	    ("Only kthread may use fpu_kern_thread"));
-	KASSERT(pcb->pcb_save == get_pcb_user_save_pcb(pcb),
+	KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb),
 	    ("mangled pcb_save"));
-	KASSERT(PCB_USER_FPU(pcb), ("recursive call"));
+	KASSERT(PCB_USER_FPU(curpcb), ("recursive call"));
 
-	set_pcb_flags(pcb, PCB_KERNFPU);
+	set_pcb_flags(curpcb, PCB_KERNFPU);
 	return (0);
 }
 
@@ -919,5 +978,5 @@
 
 	if ((curthread->td_pflags & TDP_KTHREAD) == 0)
 		return (0);
-	return ((PCPU_GET(curpcb)->pcb_flags & PCB_KERNFPU) != 0);
+	return ((curpcb->pcb_flags & PCB_KERNFPU) != 0);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/genassym.c
--- a/head/sys/amd64/amd64/genassym.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/genassym.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/genassym.c 230426 2012-01-21 17:45:27Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/genassym.c 236772 2012-06-09 00:37:26Z iwasaki $");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
@@ -157,6 +157,13 @@
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu));
 ASSYM(PCB_USERFPU, sizeof(struct pcb));
+ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
+ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
+ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
+ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
+ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
+ASSYM(PCB_XSMASK, offsetof(struct pcb, pcb_xsmask));
+ASSYM(PCB_FPUSUSPEND, offsetof(struct pcb, pcb_fpususpend));
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
 ASSYM(PCB_DBREGS, PCB_DBREGS);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/machdep.c
--- a/head/sys/amd64/amd64/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/machdep.c 234105 2012-04-10 16:08:46Z marius $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/machdep.c 238623 2012-07-19 19:09:12Z kib $");
 
 #include "opt_atalk.h"
 #include "opt_atpic.h"
@@ -74,6 +74,7 @@
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
@@ -206,6 +207,8 @@
 
 struct mtx icu_lock;
 
+struct mem_range_softc mem_range_softc;
+
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 static void
@@ -296,12 +299,10 @@
 
 	cpu_setregs();
 
-#ifdef SMP
 	/*
 	 * Add BSP as an interrupt target.
 	 */
 	intr_add_cpu(0);
-#endif
 }
 
 /*
@@ -995,7 +996,7 @@
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
-		if (pcb == PCPU_GET(curpcb)) {
+		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/mem.c
--- a/head/sys/amd64/amd64/mem.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/mem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/mem.c 238310 2012-07-09 20:42:08Z jhb $");
 
 /*
  * Memory special file
@@ -72,8 +72,6 @@
  */
 MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors");
 
-struct mem_range_softc mem_range_softc;
-
 /* ARGSUSED */
 int
 memrw(struct cdev *dev, struct uio *uio, int flags)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/minidump_machdep.c
--- a/head/sys/amd64/amd64/minidump_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/minidump_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/minidump_machdep.c 230623 2012-01-27 20:18:31Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/minidump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_pmap.h"
 #include "opt_watchdog.h"
@@ -37,9 +37,7 @@
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/msgbuf.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
@@ -177,9 +175,9 @@
 			report_progress(progress, dumpsize);
 			counter &= (1<<24) - 1;
 		}
-#ifdef SW_WATCHDOG
+
 		wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		if (ptr) {
 			error = dump_write(di, ptr, 0, dumplo, len);
 			if (error)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/mp_machdep.c
--- a/head/sys/amd64/amd64/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 234208 2012-04-13 07:18:19Z avg $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 237037 2012-06-13 22:53:56Z jkim $");
 
 #include "opt_cpu.h"
 #include "opt_kstack_pages.h"
@@ -100,7 +100,6 @@
 
 struct pcb stoppcbs[MAXCPU];
 struct pcb **susppcbs;
-void **suspfpusave;
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1;
@@ -982,6 +981,60 @@
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
+	ipi_startup(apic_id, vector);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return 1;	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return 0;		/* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
+    &ipi_range_size, 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
 	/*
 	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
@@ -1032,52 +1085,8 @@
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (mp_naps > cpus)
-			return 1;	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return 0;		/* return FAILURE */
 }
 
-#ifdef COUNT_XINVLTLB_HITS
-u_int xhits_gbl[MAXCPU];
-u_int xhits_pg[MAXCPU];
-u_int xhits_rng[MAXCPU];
-static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
-    sizeof(xhits_gbl), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
-    sizeof(xhits_pg), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
-    sizeof(xhits_rng), "IU", "");
-
-u_int ipi_global;
-u_int ipi_page;
-u_int ipi_range;
-u_int ipi_range_size;
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
-    &ipi_range_size, 0, "");
-
-u_int ipi_masked_global;
-u_int ipi_masked_page;
-u_int ipi_masked_range;
-u_int ipi_masked_range_size;
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
-    &ipi_masked_global, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
-    &ipi_masked_page, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
-    &ipi_masked_range, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
-    &ipi_masked_range_size, 0, "");
-#endif /* COUNT_XINVLTLB_HITS */
-
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  */
@@ -1415,15 +1424,17 @@
 	cpu = PCPU_GET(cpuid);
 
 	if (savectx(susppcbs[cpu])) {
-		ctx_fpusave(suspfpusave[cpu]);
+		ctx_fpusave(susppcbs[cpu]->pcb_fpususpend);
 		wbinvd();
-		CPU_SET_ATOMIC(cpu, &stopped_cpus);
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
 	} else {
 		pmap_init_pat();
-		load_cr3(susppcbs[cpu]->pcb_cr3);
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we are resumed */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
 	/* Wait for resume */
@@ -1431,7 +1442,6 @@
 		ia32_pause();
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
-	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
 	/* Resume MCA and local APIC */
 	mca_resume();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/pmap.c
--- a/head/sys/amd64/amd64/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -77,7 +77,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 233954 2012-04-06 16:41:19Z alc $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 238610 2012-07-19 05:34:19Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -117,6 +117,7 @@
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
@@ -167,6 +168,39 @@
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
+#define	NPV_LIST_LOCKS	MAXCPU
+
+#define	PHYS_TO_PV_LIST_LOCK(pa)	\
+			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+
+#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
+	struct rwlock **_lockp = (lockp);		\
+	struct rwlock *_new_lock;			\
+							\
+	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
+	if (_new_lock != *_lockp) {			\
+		if (*_lockp != NULL)			\
+			rw_wunlock(*_lockp);		\
+		*_lockp = _new_lock;			\
+		rw_wlock(*_lockp);			\
+	}						\
+} while (0)
+
+#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
+			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
+
+#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
+	struct rwlock **_lockp = (lockp);		\
+							\
+	if (*_lockp != NULL) {				\
+		rw_wunlock(*_lockp);			\
+		*_lockp = NULL;				\
+	}						\
+} while (0)
+
+#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
+			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
+
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
@@ -199,9 +233,22 @@
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 
 /*
+ * Isolate the global pv list lock from data and other locks to prevent false
+ * sharing within the cache.
+ */
+static struct {
+	struct rwlock	lock;
+	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
+} pvh_global __aligned(CACHE_LINE_SIZE);
+
+#define	pvh_global_lock	pvh_global.lock
+
+/*
  * Data for the pv entry allocation mechanism
  */
-static long pv_entry_count;
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
+static struct mtx pv_chunks_mutex;
+static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 
 /*
@@ -215,11 +262,19 @@
  */
 static caddr_t crashdumpmap;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, boolean_t try);
-static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
-static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
-static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
+static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
+static int	popcnt_pc_map_elem(uint64_t elem);
+static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
+static void	reserve_pv_entries(pmap_t pmap, int needed,
+		    struct rwlock **lockp);
+static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
+static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
+static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
@@ -227,12 +282,14 @@
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
+    vm_offset_t va, struct rwlock **lockp);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
     vm_offset_t va);
 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
-    vm_prot_t prot);
+    vm_prot_t prot, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
-    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
+    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
@@ -240,30 +297,32 @@
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
-static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp);
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-		vm_page_t *free);
+		vm_page_t *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
-		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
+		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
+		struct rwlock **lockp);
 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     vm_page_t *free);
-static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
-		vm_offset_t va);
-static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
-    vm_page_t m);
+    vm_page_t m, struct rwlock **lockp);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
 
-static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
-static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
-
-static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
+static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
+		struct rwlock **lockp);
+static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
+		struct rwlock **lockp);
+static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
+		struct rwlock **lockp);
+
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
                 vm_page_t* free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
@@ -580,6 +639,11 @@
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
+ 	/*
+	 * Initialize the global pv list lock.
+	 */
+	rw_init(&pvh_global_lock, "pmap pv global");
+
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
@@ -744,6 +808,17 @@
 	}
 
 	/*
+	 * Initialize the pv chunk list mutex.
+	 */
+	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
+
+	/*
+	 * Initialize the pool of pv list locks.
+	 */
+	for (i = 0; i < NPV_LIST_LOCKS; i++)
+		rw_init(&pv_list_locks[i], "pmap pv list");
+
+	/*
 	 * Calculate the size of the pv head table for superpages.
 	 */
 	for (i = 0; phys_avail[i + 1]; i += 2);
@@ -1625,8 +1700,10 @@
 }
 
 /*
- * this routine is called if the page table page is not
- * mapped correctly.
+ * This routine is called if the desired page table page does not exist.
+ *
+ * If page table page allocation fails, this routine may sleep before
+ * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
@@ -1634,25 +1711,23 @@
  * race conditions.
  */
 static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, pdppg, pdpg;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
-
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
-		if (flags & M_WAITOK) {
+		if (lockp != NULL) {
+			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			rw_runlock(&pvh_global_lock);
 			VM_WAIT;
-			vm_page_lock_queues();
+			rw_rlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
@@ -1693,7 +1768,7 @@
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
-			    flags) == NULL) {
+			    lockp) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free_zero(m);
@@ -1726,7 +1801,7 @@
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
-			    flags) == NULL) {
+			    lockp) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free_zero(m);
@@ -1740,7 +1815,7 @@
 			if ((*pdp & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
-				    flags) == NULL) {
+				    lockp) == NULL) {
 					--m->wire_count;
 					atomic_subtract_int(&cnt.v_wire_count,
 					    1);
@@ -1766,15 +1841,12 @@
 }
 
 static vm_page_t
-pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t pdpindex, ptepindex;
 	pdp_entry_t *pdpe;
 	vm_page_t pdpg;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
 retry:
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1785,24 +1857,20 @@
 		/* Allocate a pd page. */
 		ptepindex = pmap_pde_pindex(va);
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
-		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
-		if (pdpg == NULL && (flags & M_WAITOK))
+		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+		if (pdpg == NULL && lockp != NULL)
 			goto retry;
 	}
 	return (pdpg);
 }
 
 static vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pd;
 	vm_page_t m;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
-
 	/*
 	 * Calculate pagetable page index
 	 */
@@ -1818,7 +1886,7 @@
 	 * normal 4K page.
 	 */
 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
-		if (!pmap_demote_pde(pmap, pd, va)) {
+		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 			/*
 			 * Invalidation of the 2MB page mapping may have caused
 			 * the deallocation of the underlying PD page.
@@ -1839,8 +1907,8 @@
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
-		m = _pmap_allocpte(pmap, ptepindex, flags);
-		if (m == NULL && (flags & M_WAITOK))
+		m = _pmap_allocpte(pmap, ptepindex, lockp);
+		if (m == NULL && lockp != NULL)
 			goto retry;
 	}
 	return (m);
@@ -1993,7 +2061,7 @@
 pv_to_chunk(pv_entry_t pv)
 {
 
-	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 }
 
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
@@ -2002,10 +2070,7 @@
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
-static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
-
-SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
-	"Current number of pv entries");
+static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
@@ -2019,80 +2084,159 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
-static long pv_entry_frees, pv_entry_allocs;
+static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
+ *
+ * Returns NULL if PV entries were reclaimed from the specified pmap.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
+	vm_page_t free, m, m_pc;
+	uint64_t inuse;
+	int bit, field, freed;
+	
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&new_tail);
+	mtx_lock(&pv_chunks_mutex);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
+			/* Avoid deadlock and lock recursion. */
+			if (pmap > locked_pmap) {
+				RELEASE_PV_LIST_LOCK(lockp);
+				PMAP_LOCK(pmap);
+			} else if (pmap != locked_pmap &&
+			    !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+				mtx_lock(&pv_chunks_mutex);
+				continue;
+			}
+		}
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfq(inuse);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va = pv->pv_va;
+				pde = pmap_pde(pmap, va);
+				if ((*pde & PG_PS) != 0)
+					continue;
+				pte = pmap_pde_to_pte(pde, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list) &&
+				    (m->flags & PG_FICTITIOUS) == 0) {
+					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+					if (TAILQ_EMPTY(&pvh->pv_list)) {
+						vm_page_aflag_clear(m,
+						    PGA_WRITEABLE);
+					}
+				}
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, *pde, &free);	
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+			mtx_lock(&pv_chunks_mutex);
 			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
-			/* Avoid deadlock and lock recursion. */
-			if (pmap > locked_pmap)
-				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
-				continue;
-			pmap_resident_count_dec(pmap, 1);
-			pde = pmap_pde(pmap, va);
-			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
-			    " a 2mpage in page %p's pv list", m));
-			pte = pmap_pde_to_pte(pde, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#lx", tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, *pde, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list) &&
-		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+		/* Every freed mapping is for a 4 KB page. */
+		pmap_resident_count_dec(pmap, freed);
+		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
+		    pc->pc_map[2] == PC_FREE2) {
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+			dump_drop_page(m_pc->phys_addr);
+			mtx_lock(&pv_chunks_mutex);
+			break;
+		}
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		mtx_lock(&pv_chunks_mutex);
+		/* One freed pv entry in locked_pmap is sufficient. */
+		if (pmap == locked_pmap)
+			break;
 	}
+	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
 /*
@@ -2101,15 +2245,14 @@
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	PV_STAT(pv_entry_frees++);
-	PV_STAT(pv_entry_spare++);
-	pv_entry_count--;
+	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
+	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 	pc = pv_to_chunk(pv);
 	idx = pv - &pc->pc_pventry[0];
 	field = idx / 64;
@@ -2125,9 +2268,20 @@
 		return;
 	}
 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-	PV_STAT(pv_entry_spare -= _NPCPV);
-	PV_STAT(pc_chunk_count--);
-	PV_STAT(pc_chunk_frees++);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(m->phys_addr);
@@ -2136,22 +2290,24 @@
 }
 
 /*
- * get a new pv_entry, allocating a block from the system
- * when needed.
+ * Returns a new PV entry, allocating a new PV chunk from the system when
+ * needed.  If this PV chunk allocation fails and a PV list lock pointer was
+ * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
+ * returned.
+ *
+ * The given PV list lock may be released.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, boolean_t try)
+get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	PV_STAT(pv_entry_allocs++);
-	pq = NULL;
+	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2171,52 +2327,130 @@
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
-			pv_entry_count++;
-			PV_STAT(pv_entry_spare--);
+			PV_STAT(atomic_add_long(&pv_entry_count, 1));
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
-	m = vm_page_alloc(NULL, 0, (pq == &vm_page_queues[PQ_ACTIVE] ?
-	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
+	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
-		if (try) {
+		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to inactive
-		 * pages.  After that, if a pv chunk entry is still needed,
-		 * destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: allocation failed");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = reclaim_pv_chunk(pmap, lockp);
+		if (m == NULL)
+			goto retry;
 	}
-	PV_STAT(pc_chunk_count++);
-	PV_STAT(pc_chunk_allocs++);
+	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
-	pv_entry_count++;
-	PV_STAT(pv_entry_spare += _NPCPV - 1);
+	PV_STAT(atomic_add_long(&pv_entry_count, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
+ * Returns the number of one bits within the given PV chunk map element.
+ */
+static int
+popcnt_pc_map_elem(uint64_t elem)
+{
+	int count;
+
+	/*
+	 * This simple method of counting the one bits performs well because
+	 * the given element typically contains more zero bits than one bits.
+	 */
+	count = 0;
+	for (; elem != 0; elem &= elem - 1)
+		count++;
+	return (count);
+}
+
+/*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	int avail, free;
+	vm_page_t m;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+	/*
+	 * Newly allocated PV chunks must be stored in a private list until
+	 * the required number of PV chunks have been allocated.  Otherwise,
+	 * reclaim_pv_chunk() could recycle one of these chunks.  In
+	 * contrast, these chunks must be added to the pmap upon allocation.
+	 */
+	TAILQ_INIT(&new_tail);
+retry:
+	avail = 0;
+	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
+			free = popcnt_pc_map_elem(pc->pc_map[0]);
+			free += popcnt_pc_map_elem(pc->pc_map[1]);
+			free += popcnt_pc_map_elem(pc->pc_map[2]);
+		} else {
+			free = popcntq(pc->pc_map[0]);
+			free += popcntq(pc->pc_map[1]);
+			free += popcntq(pc->pc_map[2]);
+		}
+		if (free == 0)
+			break;
+		avail += free;
+		if (avail >= needed)
+			break;
+	}
+	for (; avail < needed; avail += _NPCPV) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED);
+		if (m == NULL) {
+			m = reclaim_pv_chunk(pmap, lockp);
+			if (m == NULL)
+				goto retry;
+		}
+		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+		dump_add_page(m->phys_addr);
+		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+		pc->pc_pmap = pmap;
+		pc->pc_map[0] = PC_FREE0;
+		pc->pc_map[1] = PC_FREE1;
+		pc->pc_map[2] = PC_FREE2;
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
+	}
+	if (!TAILQ_EMPTY(&new_tail)) {
+		mtx_lock(&pv_chunks_mutex);
+		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+	}
+}
+
+/*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
@@ -2227,7 +2461,7 @@
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2243,20 +2477,26 @@
  * entries for each of the 4KB page mappings.
  */
 static void
-pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
+	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
-
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	int bit, field;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
-	 * page's pv list.
+	 * page's pv list.  Once this transfer begins, the pv list lock
+	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
@@ -2265,14 +2505,37 @@
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
+	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 	va_last = va + NBPDR - PAGE_SIZE;
-	do {
-		m++;
-		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("pmap_pv_demote_pde: page %p is not managed", m));
-		va += PAGE_SIZE;
-		pmap_insert_entry(pmap, va, m);
-	} while (va < va_last);
+	for (;;) {
+		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
+		for (field = 0; field < _NPCM; field++) {
+			while (pc->pc_map[field]) {
+				bit = bsfq(pc->pc_map[field]);
+				pc->pc_map[field] &= ~(1ul << bit);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va += PAGE_SIZE;
+				pv->pv_va = va;
+				m++;
+				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+			    ("pmap_pv_demote_pde: page %p is not managed", m));
+				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+				if (va == va_last)
+					goto out;
+			}
+		}
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+out:
+	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 }
 
 /*
@@ -2281,23 +2544,25 @@
  * for the 2MB page mapping.
  */
 static void
-pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
-	 * Transfer the first page's pv entry for this mapping to the
-	 * 2mpage's pv list.  Aside from avoiding the cost of a call
-	 * to get_pv_entry(), a transfer avoids the possibility that
-	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
-	 * removes one of the mappings that is being promoted.
+	 * Transfer the first page's pv entry for this mapping to the 2mpage's
+	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
+	 * a transfer avoids the possibility that get_pv_entry() calls
+	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
+	 * mappings that is being promoted.
 	 */
 	m = PHYS_TO_VM_PAGE(pa);
 	va = trunc_2mpage(va);
@@ -2329,48 +2594,22 @@
 	free_pv_entry(pmap, pv);
 }
 
-static void
-pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
-{
-	struct md_page *pvh;
-
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pmap_pvh_free(&m->md, pmap, va);
-	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
-		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
-		if (TAILQ_EMPTY(&pvh->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
-	}
-}
-
 /*
- * Create a pv entry for page at pa for
- * (pmap, va).
+ * Conditionally create the PV entry for a 4KB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
  */
-static void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    struct rwlock **lockp)
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pv = get_pv_entry(pmap, FALSE);
-	pv->pv_va = va;
-	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-}
-
-/*
- * Conditionally create a pv entry.
- */
-static boolean_t
-pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
-{
-	pv_entry_t pv;
-
-	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 		return (TRUE);
 	} else
@@ -2378,17 +2617,22 @@
 }
 
 /*
- * Create the pv entry for a 2MB page mapping.
+ * Conditionally create the PV entry for a 2MB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
  */
 static boolean_t
-pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 		pvh = pa_to_pvh(pa);
 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 		return (TRUE);
@@ -2417,6 +2661,20 @@
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
+	struct rwlock *lock;
+	boolean_t rv;
+
+	lock = NULL;
+	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
+	return (rv);
+}
+
+static boolean_t
+pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp)
+{
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	vm_paddr_t mptepa;
@@ -2451,7 +2709,8 @@
 		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 			free = NULL;
-			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
+			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
+			    lockp);
 			pmap_invalidate_page(pmap, trunc_2mpage(va));
 			pmap_free_zero_pages(free);
 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
@@ -2491,6 +2750,17 @@
 		pmap_fill_ptp(firstpte, newpte);
 
 	/*
+	 * The spare PV entries must be reserved prior to demoting the
+	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
+	 * of the PDE and the PV lists will be inconsistent, which can result
+	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
+	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
+	 * PV entry for the 2MB page mapping that is being demoted.
+	 */
+	if ((oldpde & PG_MANAGED) != 0)
+		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
+
+	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
@@ -2509,18 +2779,12 @@
 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 
 	/*
-	 * Demote the pv entry.  This depends on the earlier demotion
-	 * of the mapping.  Specifically, the (re)creation of a per-
-	 * page pv entry might trigger the execution of pmap_collect(),
-	 * which might reclaim a newly (re)created per-page pv entry
-	 * and destroy the associated mapping.  In order to destroy
-	 * the mapping, the PDE must have already changed from mapping
-	 * the 2mpage to referencing the page table page.
+	 * Demote the PV entry.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
-		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
-
-	pmap_pde_demotions++;
+		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
+
+	atomic_add_long(&pmap_pde_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
@@ -2531,7 +2795,7 @@
  */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    vm_page_t *free)
+    vm_page_t *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
@@ -2553,6 +2817,7 @@
 		pmap_invalidate_page(kernel_pmap, sva);
 	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 	if (oldpde & PG_MANAGED) {
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
@@ -2568,7 +2833,7 @@
 		}
 	}
 	if (pmap == kernel_pmap) {
-		if (!pmap_demote_pde(pmap, pdq, sva))
+		if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
 			panic("pmap_remove_pde: failed demotion");
 	} else {
 		mpte = pmap_lookup_pt_page(pmap, sva);
@@ -2590,8 +2855,9 @@
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
-    pd_entry_t ptepde, vm_page_t *free)
+    pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
 {
+	struct md_page *pvh;
 	pt_entry_t oldpte;
 	vm_page_t m;
 
@@ -2606,7 +2872,14 @@
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
-		pmap_remove_entry(pmap, m, va);
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+		pmap_pvh_free(&m->md, pmap, va);
+		if (TAILQ_EMPTY(&m->md.pv_list) &&
+		    (m->flags & PG_FICTITIOUS) == 0) {
+			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+			if (TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
@@ -2617,6 +2890,7 @@
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 {
+	struct rwlock *lock;
 	pt_entry_t *pte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -2625,7 +2899,10 @@
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		return;
-	pmap_remove_pte(pmap, pte, va, *pde, free);
+	lock = NULL;
+	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
 	pmap_invalidate_page(pmap, va);
 }
 
@@ -2638,6 +2915,7 @@
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct rwlock *lock;
 	vm_offset_t va, va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
@@ -2654,7 +2932,7 @@
 
 	anyvalid = 0;
 
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 
 	/*
@@ -2670,6 +2948,7 @@
 		}
 	}
 
+	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 
 		if (pmap->pm_stats.resident_count == 0)
@@ -2722,9 +3001,10 @@
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
-				pmap_remove_pde(pmap, pde, sva, &free);
+				pmap_remove_pde(pmap, pde, sva, &free, &lock);
 				continue;
-			} else if (!pmap_demote_pde(pmap, pde, sva)) {
+			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
+			    &lock)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
@@ -2753,7 +3033,8 @@
 				anyvalid = 1;
 			else if (va == va_next)
 				va = sva;
-			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) {
+			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
+			    &lock)) {
 				sva += PAGE_SIZE;
 				break;
 			}
@@ -2761,10 +3042,12 @@
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
+	if (lock != NULL)
+		rw_wunlock(lock);
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();	
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -2796,7 +3079,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	free = NULL;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2835,7 +3118,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_free_zero_pages(free);
 }
 
@@ -2956,12 +3239,12 @@
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
-					if (!mtx_trylock(&vm_page_queue_mtx)) {
+					if (!rw_try_rlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all(
 							    pmap);
 						PMAP_UNLOCK(pmap);
-						vm_page_lock_queues();
+						rw_rlock(&pvh_global_lock);
 						goto resume;
 					}
 				}
@@ -3012,7 +3295,7 @@
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked)
-		vm_page_unlock_queues();
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3024,7 +3307,8 @@
  * identical characteristics. 
  */
 static void
-pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
@@ -3042,7 +3326,7 @@
 setpde:
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
-		pmap_pde_p_failures++;
+		atomic_add_long(&pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
@@ -3067,7 +3351,7 @@
 setpte:
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
-			pmap_pde_p_failures++;
+			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
@@ -3086,7 +3370,7 @@
 			    " in pmap %p", oldpteva, pmap);
 		}
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
-			pmap_pde_p_failures++;
+			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
@@ -3111,7 +3395,7 @@
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
-		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
+		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 
 	/*
 	 * Propagate the PAT index to its proper position.
@@ -3127,7 +3411,7 @@
 	else
 		pde_store(pde, PG_PS | newpde);
 
-	pmap_pde_promotions++;
+	atomic_add_long(&pmap_pde_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 }
@@ -3148,6 +3432,7 @@
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
+	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	pt_entry_t newpte, origpte;
@@ -3161,115 +3446,16 @@
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
+	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
+	    va >= kmi.clean_eva,
+	    ("pmap_enter: managed mapping within the clean submap"));
 	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
 	    VM_OBJECT_LOCKED(m->object),
 	    ("pmap_enter: page %p is not busy", m));
-
-	mpte = NULL;
-
-	vm_page_lock_queues();
-	PMAP_LOCK(pmap);
-
-	/*
-	 * In the case that a page table page is not
-	 * resident, we are creating it here.
-	 */
-	if (va < VM_MAXUSER_ADDRESS)
-		mpte = pmap_allocpte(pmap, va, M_WAITOK);
-
-	pde = pmap_pde(pmap, va);
-	if (pde != NULL && (*pde & PG_V) != 0) {
-		if ((*pde & PG_PS) != 0)
-			panic("pmap_enter: attempted pmap_enter on 2MB page");
-		pte = pmap_pde_to_pte(pde, va);
-	} else
-		panic("pmap_enter: invalid page directory va=%#lx", va);
-
 	pa = VM_PAGE_TO_PHYS(m);
-	om = NULL;
-	origpte = *pte;
-	opa = origpte & PG_FRAME;
-
-	/*
-	 * Mapping has not changed, must be protection or wiring change.
-	 */
-	if (origpte && (opa == pa)) {
-		/*
-		 * Wiring change, just update stats. We don't worry about
-		 * wiring PT pages as they remain resident as long as there
-		 * are valid mappings in them. Hence, if a user page is wired,
-		 * the PT page will be also.
-		 */
-		if (wired && ((origpte & PG_W) == 0))
-			pmap->pm_stats.wired_count++;
-		else if (!wired && (origpte & PG_W))
-			pmap->pm_stats.wired_count--;
-
-		/*
-		 * Remove extra pte reference
-		 */
-		if (mpte)
-			mpte->wire_count--;
-
-		if (origpte & PG_MANAGED) {
-			om = m;
-			pa |= PG_MANAGED;
-		}
-		goto validate;
-	} 
-
-	pv = NULL;
-
-	/*
-	 * Mapping has changed, invalidate old range and fall through to
-	 * handle validating new mapping.
-	 */
-	if (opa) {
-		if (origpte & PG_W)
-			pmap->pm_stats.wired_count--;
-		if (origpte & PG_MANAGED) {
-			om = PHYS_TO_VM_PAGE(opa);
-			pv = pmap_pvh_remove(&om->md, pmap, va);
-		}
-		if (mpte != NULL) {
-			mpte->wire_count--;
-			KASSERT(mpte->wire_count > 0,
-			    ("pmap_enter: missing reference to page table page,"
-			     " va: 0x%lx", va));
-		}
-	} else
-		pmap_resident_count_inc(pmap, 1);
-
-	/*
-	 * Enter on the PV list if part of our managed memory.
-	 */
-	if ((m->oflags & VPO_UNMANAGED) == 0) {
-		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
-		    ("pmap_enter: managed mapping within the clean submap"));
-		if (pv == NULL)
-			pv = get_pv_entry(pmap, FALSE);
-		pv->pv_va = va;
-		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-		pa |= PG_MANAGED;
-	} else if (pv != NULL)
-		free_pv_entry(pmap, pv);
-
-	/*
-	 * Increment counters
-	 */
-	if (wired)
-		pmap->pm_stats.wired_count++;
-
-validate:
-	/*
-	 * Now validate mapping with desired protection/wiring.
-	 */
 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
-	if ((prot & VM_PROT_WRITE) != 0) {
+	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
-		if ((newpte & PG_MANAGED) != 0)
-			vm_page_aflag_set(m, PGA_WRITEABLE);
-	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (wired)
@@ -3279,40 +3465,143 @@
 	if (pmap == kernel_pmap)
 		newpte |= PG_G;
 
+	mpte = om = NULL;
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+
 	/*
-	 * if the mapping or permission bits are different, we need
-	 * to update the pte.
+	 * In the case that a page table page is not
+	 * resident, we are creating it here.
 	 */
-	if ((origpte & ~(PG_M|PG_A)) != newpte) {
-		newpte |= PG_A;
-		if ((access & VM_PROT_WRITE) != 0)
-			newpte |= PG_M;
-		if (origpte & PG_V) {
-			invlva = FALSE;
-			origpte = pte_load_store(pte, newpte);
-			if (origpte & PG_A) {
-				if (origpte & PG_MANAGED)
-					vm_page_aflag_set(om, PGA_REFERENCED);
-				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
-				    PG_NX) == 0 && (newpte & PG_NX)))
-					invlva = TRUE;
+retry:
+	pde = pmap_pde(pmap, va);
+	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
+	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
+		pte = pmap_pde_to_pte(pde, va);
+		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
+			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+			mpte->wire_count++;
+		}
+	} else if (va < VM_MAXUSER_ADDRESS) {
+		/*
+		 * Here if the pte page isn't mapped, or if it has been
+		 * deallocated.
+		 */
+		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
+		goto retry;
+	} else
+		panic("pmap_enter: invalid page directory va=%#lx", va);
+
+	origpte = *pte;
+	opa = origpte & PG_FRAME;
+
+	/*
+	 * Is the specified virtual address already mapped?
+	 */
+	if ((origpte & PG_V) != 0) {
+		/*
+		 * Wiring change, just update stats. We don't worry about
+		 * wiring PT pages as they remain resident as long as there
+		 * are valid mappings in them. Hence, if a user page is wired,
+		 * the PT page will be also.
+		 */
+		if (wired && (origpte & PG_W) == 0)
+			pmap->pm_stats.wired_count++;
+		else if (!wired && (origpte & PG_W))
+			pmap->pm_stats.wired_count--;
+
+		/*
+		 * Remove the extra PT page reference.
+		 */
+		if (mpte != NULL) {
+			mpte->wire_count--;
+			KASSERT(mpte->wire_count > 0,
+			    ("pmap_enter: missing reference to page table page,"
+			     " va: 0x%lx", va));
+		}
+
+		/*
+		 * Has the mapping changed?
+		 */
+		if (opa == pa) {
+			/*
+			 * No, might be a protection or wiring change.
+			 */
+			if ((origpte & PG_MANAGED) != 0) {
+				newpte |= PG_MANAGED;
+				om = m;
 			}
-			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
-				if ((origpte & PG_MANAGED) != 0)
-					vm_page_dirty(om);
-				if ((newpte & PG_RW) == 0)
-					invlva = TRUE;
-			}
-			if ((origpte & PG_MANAGED) != 0 &&
+			if ((origpte & ~(PG_M | PG_A)) == newpte)
+				goto unchanged;
+			goto validate;
+		} else {
+			/*
+			 * Yes, fall through to validate the new mapping.
+			 */
+			if ((origpte & PG_MANAGED) != 0)
+				om = PHYS_TO_VM_PAGE(opa);
+		}
+	} else {
+		/*
+		 * Increment the counters.
+		 */
+		if (wired)
+			pmap->pm_stats.wired_count++;
+		pmap_resident_count_inc(pmap, 1);
+	}
+
+	/*
+	 * Enter on the PV list if part of our managed memory.
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0) {
+		newpte |= PG_MANAGED;
+		pv = get_pv_entry(pmap, &lock);
+		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+	}
+
+validate:
+
+	/*
+	 * Update the PTE.
+	 */
+	newpte |= PG_A;
+	if ((access & VM_PROT_WRITE) != 0)
+		newpte |= PG_M;
+	if ((newpte & (PG_MANAGED | PG_RW)) == (PG_MANAGED | PG_RW))
+		vm_page_aflag_set(m, PGA_WRITEABLE);
+	if ((origpte & PG_V) != 0) {
+		invlva = FALSE;
+		origpte = pte_load_store(pte, newpte);
+		if ((origpte & PG_A) != 0 && (opa != pa ||
+		    ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0)))
+			invlva = TRUE;
+		if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			if ((origpte & PG_MANAGED) != 0)
+				vm_page_dirty(om);
+			if ((newpte & PG_RW) == 0)
+				invlva = TRUE;
+		}
+		if (opa != pa && (origpte & PG_MANAGED) != 0) {
+			if ((origpte & PG_A) != 0)
+				vm_page_aflag_set(om, PGA_REFERENCED);
+			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
+			pmap_pvh_free(&om->md, pmap, va);
+			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
-			if (invlva)
-				pmap_invalidate_page(pmap, va);
-		} else
-			pte_store(pte, newpte);
-	}
+		}
+		if (invlva)
+			pmap_invalidate_page(pmap, va);
+	} else
+		pte_store(pte, newpte);
+
+unchanged:
 
 	/*
 	 * If both the page table page and the reservation are fully
@@ -3321,9 +3610,11 @@
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
-		pmap_promote_pde(pmap, pde, va);
-
-	vm_page_unlock_queues();
+		pmap_promote_pde(pmap, pde, va, &lock);
+
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3334,14 +3625,15 @@
  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
  */
 static boolean_t
-pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+    struct rwlock **lockp)
 {
 	pd_entry_t *pde, newpde;
 	vm_page_t free, mpde;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
+	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
@@ -3364,7 +3656,8 @@
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
-		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
+		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
+		    lockp)) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
 				pmap_invalidate_page(pmap, va);
@@ -3390,7 +3683,7 @@
 	 */
 	pde_store(pde, newpde);
 
-	pmap_pde_mappings++;
+	atomic_add_long(&pmap_pde_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
@@ -3412,6 +3705,7 @@
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
+	struct rwlock *lock;
 	vm_offset_t va;
 	vm_page_t m, mpte;
 	vm_pindex_t diff, psize;
@@ -3420,21 +3714,24 @@
 	psize = atop(end - start);
 	mpte = NULL;
 	m = m_start;
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		va = start + ptoa(diff);
 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
 		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
-		    pmap_enter_pde(pmap, va, m, prot))
+		    pmap_enter_pde(pmap, va, m, prot, &lock))
 			m = &m[NBPDR / PAGE_SIZE - 1];
 		else
 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
-			    mpte);
+			    mpte, &lock);
 		m = TAILQ_NEXT(m, listq);
 	}
-	vm_page_unlock_queues();
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3450,17 +3747,21 @@
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
-
-	vm_page_lock_queues();
+	struct rwlock *lock;
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
-	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
-	vm_page_unlock_queues();
+	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
-    vm_prot_t prot, vm_page_t mpte)
+    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	vm_page_t free;
 	pt_entry_t *pte;
@@ -3469,7 +3770,7 @@
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
@@ -3494,7 +3795,9 @@
 
 			/*
 			 * If the page table page is mapped, we just increment
-			 * the hold count, and activate it.
+			 * the hold count, and activate it.  Otherwise, we
+			 * attempt to allocate a page table page.  If this
+			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (ptepa && (*ptepa & PG_V) != 0) {
 				if (*ptepa & PG_PS)
@@ -3502,8 +3805,11 @@
 				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
-				mpte = _pmap_allocpte(pmap, ptepindex,
-				    M_NOWAIT);
+				/*
+				 * Pass NULL instead of the PV list lock
+				 * pointer, because we don't intend to sleep.
+				 */
+				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
@@ -3526,7 +3832,7 @@
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
-	    !pmap_try_insert_pv_entry(pmap, va, m)) {
+	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
@@ -3629,7 +3935,7 @@
 		PMAP_LOCK(pmap);
 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
 		    size; pa += NBPDR) {
-			pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
+			pdpg = pmap_allocpde(pmap, addr, NULL);
 			if (pdpg == NULL) {
 				/*
 				 * The creation of mappings below is only an
@@ -3647,7 +3953,7 @@
 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
 				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
-				pmap_pde_mappings++;
+				atomic_add_long(&pmap_pde_mappings, 1);
 			} else {
 				/* Continue on if the PDE is already valid. */
 				pdpg->wire_count--;
@@ -3673,9 +3979,9 @@
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
-	boolean_t are_queues_locked;
-
-	are_queues_locked = FALSE;
+	boolean_t pv_lists_locked;
+
+	pv_lists_locked = FALSE;
 
 	/*
 	 * Wiring is not a hardware characteristic so there is no need to
@@ -3686,11 +3992,11 @@
 	pde = pmap_pde(pmap, va);
 	if ((*pde & PG_PS) != 0) {
 		if (!wired != ((*pde & PG_W) == 0)) {
-			if (!are_queues_locked) {
-				are_queues_locked = TRUE;
-				if (!mtx_trylock(&vm_page_queue_mtx)) {
+			if (!pv_lists_locked) {
+				pv_lists_locked = TRUE;
+				if (!rw_try_rlock(&pvh_global_lock)) {
 					PMAP_UNLOCK(pmap);
-					vm_page_lock_queues();
+					rw_rlock(&pvh_global_lock);
 					goto retry;
 				}
 			}
@@ -3708,8 +4014,8 @@
 		atomic_clear_long(pte, PG_W);
 	}
 out:
-	if (are_queues_locked)
-		vm_page_unlock_queues();
+	if (pv_lists_locked)
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3725,6 +4031,7 @@
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
+	struct rwlock *lock;
 	vm_page_t   free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
@@ -3733,7 +4040,8 @@
 	if (dst_addr != src_addr)
 		return;
 
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
@@ -3777,7 +4085,7 @@
 			continue;
 			
 		if (srcptepaddr & PG_PS) {
-			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
+			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
 			if (dstmpde == NULL)
 				break;
 			pde = (pd_entry_t *)
@@ -3785,7 +4093,7 @@
 			pde = &pde[pmap_pde_index(addr)];
 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
-			    PG_PS_FRAME))) {
+			    PG_PS_FRAME, &lock))) {
 				*pde = srcptepaddr & ~PG_W;
 				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
 			} else
@@ -3815,14 +4123,15 @@
 				    dstmpte->pindex == pmap_pde_pindex(addr))
 					dstmpte->wire_count++;
 				else if ((dstmpte = pmap_allocpte(dst_pmap,
-				    addr, M_NOWAIT)) == NULL)
+				    addr, NULL)) == NULL)
 					goto out;
 				dst_pte = (pt_entry_t *)
 				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 				dst_pte = &dst_pte[pmap_pte_index(addr)];
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
-				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
+				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
+				    &lock)) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
@@ -3849,7 +4158,9 @@
 		}
 	}
 out:
-	vm_page_unlock_queues();
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
@@ -3923,6 +4234,7 @@
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
+	struct rwlock *lock;
 	pv_entry_t pv;
 	int loops = 0;
 	boolean_t rv;
@@ -3930,7 +4242,9 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
@@ -3952,7 +4266,8 @@
 				break;
 		}
 	}
-	vm_page_unlock_queues();
+	rw_runlock(lock);
+	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -3970,13 +4285,13 @@
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	count = pmap_pvh_wired_mappings(&m->md, count);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 	        count);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -3992,7 +4307,7 @@
 	pt_entry_t *pte;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
@@ -4011,15 +4326,19 @@
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
+	struct rwlock *lock;
 	boolean_t rv;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
-	vm_page_unlock_queues();
+	rw_runlock(lock);
+	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4041,21 +4360,23 @@
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
-	int field, idx;
+	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
-	int allfree;
+	int allfree, field, freed, idx;
 
 	if (pmap != PCPU_GET(curpmap)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
+		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfq(inuse);
 				bitmask = 1UL << bit;
@@ -4109,10 +4430,9 @@
 						vm_page_dirty(m);
 				}
 
+				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+
 				/* Mark free */
-				PV_STAT(pv_entry_frees++);
-				PV_STAT(pv_entry_spare++);
-				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				if ((tpte & PG_PS) != 0) {
 					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
@@ -4120,7 +4440,8 @@
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
-							if (TAILQ_EMPTY(&mt->md.pv_list))
+							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
+							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
@@ -4136,7 +4457,8 @@
 				} else {
 					pmap_resident_count_dec(pmap, 1);
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-					if (TAILQ_EMPTY(&m->md.pv_list) &&
+					if ((m->aflags & PGA_WRITEABLE) != 0 &&
+					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
@@ -4144,21 +4466,21 @@
 					}
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
+				freed++;
 			}
 		}
+		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
-			dump_drop_page(m->phys_addr);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
+			free_pv_chunk(pc);
 		}
 	}
+	if (lock != NULL)
+		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -4186,11 +4508,11 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_modified_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4207,7 +4529,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4258,11 +4580,11 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_referenced_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4278,7 +4600,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4317,7 +4639,7 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4335,8 +4657,9 @@
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
-		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
-		    " a 2mpage in page %p's pv list", m));
+		KASSERT((*pde & PG_PS) == 0,
+		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
+		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 retry:
 		oldpte = *pte;
@@ -4351,7 +4674,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4379,7 +4702,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4437,7 +4760,7 @@
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 out:
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
@@ -4467,7 +4790,7 @@
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4516,7 +4839,7 @@
 		}
 		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4536,7 +4859,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4576,7 +4899,7 @@
 		}
 		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/ptrace_machdep.c
--- a/head/sys/amd64/amd64/ptrace_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/ptrace_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/ptrace_machdep.c 232520 2012-03-04 20:24:28Z tijl $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/ptrace_machdep.c 238669 2012-07-21 13:06:37Z kib $");
 
 #include "opt_compat.h"
 
@@ -50,6 +50,7 @@
 
 	switch (req) {
 	case PT_GETXSTATE:
+		fpugetregs(td);
 		savefpu = (char *)(get_pcb_user_save_td(td) + 1);
 		error = copyout(savefpu, addr,
 		    cpu_max_ext_state_size - sizeof(struct savefpu));
@@ -62,8 +63,10 @@
 		}
 		savefpu = malloc(data, M_TEMP, M_WAITOK);
 		error = copyin(addr, savefpu, data);
-		if (error == 0)
+		if (error == 0) {
+			fpugetregs(td);
 			error = fpusetxstate(td, savefpu, data);
+		}
 		free(savefpu, M_TEMP);
 		break;
 
@@ -89,11 +92,13 @@
 
 	switch (req) {
 	case PT_I386_GETXMMREGS:
+		fpugetregs(td);
 		error = copyout(get_pcb_user_save_td(td), addr,
 		    sizeof(*fpstate));
 		break;
 
 	case PT_I386_SETXMMREGS:
+		fpugetregs(td);
 		fpstate = get_pcb_user_save_td(td);
 		error = copyin(addr, fpstate, sizeof(*fpstate));
 		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/trap.c
--- a/head/sys/amd64/amd64/trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/trap.c 233781 2012-04-02 15:07:22Z jhb $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/trap.c 238623 2012-07-19 19:09:12Z kib $");
 
 /*
  * AMD64 Trap and System call handling
@@ -328,7 +328,7 @@
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
-			ucode = fputrap();
+			ucode = fputrap_x87();
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
@@ -442,7 +442,9 @@
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
-			ucode = 0; /* XXX */
+			ucode = fputrap_sse();
+			if (ucode == -1)
+				goto userout;
 			i = SIGFPE;
 			break;
 		}
@@ -518,9 +520,8 @@
 				frame->tf_rip = (long)fsbase_load_fault;
 				goto out;
 			}
-			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
-				frame->tf_rip =
-				    (long)PCPU_GET(curpcb)->pcb_onfault;
+			if (curpcb->pcb_onfault != NULL) {
+				frame->tf_rip = (long)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
@@ -706,7 +707,7 @@
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
-		    PCPU_GET(curpcb)->pcb_onfault == NULL)) {
+		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
@@ -762,8 +763,8 @@
 nogo:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
-		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
-			frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault;
+		    curpcb->pcb_onfault != NULL) {
+			frame->tf_rip = (long)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
@@ -972,4 +973,15 @@
 	     syscallname(td->td_proc, sa.code)));
 
 	syscallret(td, error, &sa);
+
+	/*
+	 * If the user-supplied value of %rip is not a canonical
+	 * address, then some CPUs will trigger a ring 0 #GP during
+	 * the sysret instruction.  However, the fault handler would
+	 * execute in ring 0 with the user's %gs and %rsp which would
+	 * not be safe.  Instead, use the full return path which
+	 * catches the problem safely.
+	 */
+	if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
+		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/vm_machdep.c
--- a/head/sys/amd64/amd64/vm_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/vm_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/vm_machdep.c 231441 2012-02-10 21:26:25Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/vm_machdep.c 238623 2012-07-19 19:09:12Z kib $");
 
 #include "opt_isa.h"
 #include "opt_cpu.h"
@@ -90,6 +90,10 @@
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
+CTASSERT((struct thread **)OFFSETOF_CURTHREAD ==
+    &((struct pcpu *)NULL)->pc_curthread);
+CTASSERT((struct pcb **)OFFSETOF_CURPCB == &((struct pcpu *)NULL)->pc_curpcb);
+
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/conf/GENERIC
--- a/head/sys/amd64/conf/GENERIC	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/conf/GENERIC	Wed Jul 25 16:40:53 2012 +0300
@@ -16,7 +16,7 @@
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
-# $FreeBSD: head/sys/amd64/conf/GENERIC 234504 2012-04-20 21:37:42Z brooks $
+# $FreeBSD: head/sys/amd64/conf/GENERIC 237901 2012-07-01 08:10:49Z delphij $
 
 cpu		HAMMER
 ident		GENERIC
@@ -28,6 +28,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
@@ -44,6 +45,7 @@
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_PART_GPT		# GUID Partition Tables.
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD32	# Compatible with i386 binaries
 options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
@@ -66,6 +68,7 @@
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_FRAME		# Ensure frames are compiled in
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
+options 	DDB_CTF			# Kernel ELF linker loads CTF data
 options 	INCLUDE_CONFIG_FILE     # Include this file in kernel
 
 # Debugging support.  Always need this:
@@ -75,7 +78,6 @@
 # For full debugger support use this instead:
 options 	DDB			# Support DDB.
 options 	GDB			# Support remote GDB.
-options 	DDB_CTF			# kernel ELF linker loads CTF data
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
@@ -150,6 +152,7 @@
 device		ips		# IBM (Adaptec) ServeRAID
 device		mly		# Mylex AcceleRAID/eXtremeRAID
 device		twa		# 3ware 9000 series PATA/SATA RAID
+device		tws		# LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
 
 # RAID controllers
 device		aac		# Adaptec FSA RAID
@@ -160,7 +163,6 @@
 #XXX pointer/int warnings
 #device		pst		# Promise Supertrak SX6000
 device		twe		# 3ware ATA RAID
-device		tws		# LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
 
 # atkbdc0 controls both the keyboard and the PS/2 mouse
 device		atkbdc		# AT keyboard controller
@@ -272,6 +274,8 @@
 device		ath_pci		# Atheros pci/cardbus glue
 device		ath_hal		# pci/cardbus chip support
 options 	AH_SUPPORT_AR5416	# enable AR5416 tx/rx descriptors
+options 	AH_AR5416_INTERRUPT_MITIGATION	# AR5416 interrupt mitigation
+options 	ATH_ENABLE_11N	# Enable 802.11n support for AR5416 and later
 device		ath_rate_sample	# SampleRate tx rate control for ath
 #device		bwi		# Broadcom BCM430x/BCM431x wireless NICs.
 #device		bwn		# Broadcom BCM43xx wireless NICs.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/atomic.h
--- a/head/sys/amd64/include/atomic.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/atomic.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/atomic.h 236456 2012-06-02 18:10:16Z kib $
  */
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
@@ -81,8 +81,9 @@
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 u_long	atomic_fetchadd_long(volatile u_long *p, u_long v);
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)			\
-u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p);	\
+#define	ATOMIC_LOAD(TYPE, LOP)					\
+u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
+#define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 #else /* !KLD_MODULE && __GNUCLIKE_ASM */
@@ -210,37 +211,43 @@
 	return (v);
 }
 
+/*
+ * We assume that a = b will do atomic loads and stores.  Due to the
+ * IA32 memory model, a simple store guarantees release semantics.
+ *
+ * However, loads may pass stores, so for atomic_load_acq we have to
+ * ensure a Store/Load barrier to do the load in SMP kernels.  We use
+ * "lock cmpxchg" as recommended by the AMD Software Optimization
+ * Guide, and not mfence.  For UP kernels, however, the cache of the
+ * single processor is always consistent, so we only need to take care
+ * of the compiler.
+ */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{							\
+	__asm __volatile("" : : : "memory");		\
+	*p = v;						\
+}							\
+struct __hack
+
 #if defined(_KERNEL) && !defined(SMP)
 
-/*
- * We assume that a = b will do atomic loads and stores.  However, on a
- * PentiumPro or higher, reads may pass writes, so for that case we have
- * to use a serializing instruction (i.e. with LOCK) to do the load in
- * SMP kernels.  For UP kernels, however, the cache of the single processor
- * is always consistent, so we only need to take care of compiler.
- */
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
 	u_##TYPE tmp;					\
 							\
 	tmp = *p;					\
-	__asm __volatile ("" : : : "memory");		\
+	__asm __volatile("" : : : "memory");		\
 	return (tmp);					\
 }							\
-							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile ("" : : : "memory");		\
-	*p = v;						\
-}							\
 struct __hack
 
 #else /* !(_KERNEL && !SMP) */
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -254,19 +261,6 @@
 							\
 	return (res);					\
 }							\
-							\
-/*							\
- * The XCHG instruction asserts LOCK automagically.	\
- */							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile(SOP				\
-	: "=m" (*p),			/* 0 */		\
-	  "+r" (v)			/* 1 */		\
-	: "m" (*p)			/* 2 */		\
-	: "memory");					\
-}							\
 struct __hack
 
 #endif /* _KERNEL && !SMP */
@@ -293,13 +287,19 @@
 ATOMIC_ASM(add,	     long,  "addq %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subq %1,%0",  "ir",  v);
 
-ATOMIC_STORE_LOAD(char,	"cmpxchgb %b0,%1", "xchgb %b1,%0");
-ATOMIC_STORE_LOAD(short,"cmpxchgw %w0,%1", "xchgw %w1,%0");
-ATOMIC_STORE_LOAD(int,	"cmpxchgl %0,%1",  "xchgl %1,%0");
-ATOMIC_STORE_LOAD(long,	"cmpxchgq %0,%1",  "xchgq %1,%0");
+ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
+ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
+ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
+ATOMIC_LOAD(long,  "cmpxchgq %0,%1");
+
+ATOMIC_STORE(char);
+ATOMIC_STORE(short);
+ATOMIC_STORE(int);
+ATOMIC_STORE(long);
 
 #undef ATOMIC_ASM
-#undef ATOMIC_STORE_LOAD
+#undef ATOMIC_LOAD
+#undef ATOMIC_STORE
 
 #ifndef WANT_FUNCTIONS
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/cpufunc.h
--- a/head/sys/amd64/include/cpufunc.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/cpufunc.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/cpufunc.h 232227 2012-02-27 17:28:47Z jhb $
+ * $FreeBSD: head/sys/amd64/include/cpufunc.h 238311 2012-07-09 20:55:39Z jhb $
  */
 
 /*
@@ -107,6 +107,13 @@
 }
 
 static __inline void
+clts(void)
+{
+
+	__asm __volatile("clts");
+}
+
+static __inline void
 disable_intr(void)
 {
 	__asm __volatile("cli" : : : "memory");
@@ -273,6 +280,15 @@
 	__asm __volatile("outw %0, %w1" : : "a" (data), "Nd" (port));
 }
 
+static __inline u_long
+popcntq(u_long mask)
+{
+	u_long result;
+
+	__asm __volatile("popcntq %1,%0" : "=r" (result) : "rm" (mask));
+	return (result);
+}
+
 static __inline void
 mfence(void)
 {
@@ -409,6 +425,25 @@
 	return (data);
 }
 
+static __inline u_long
+rxcr(u_int reg)
+{
+	u_int low, high;
+
+	__asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg));
+	return (low | ((uint64_t)high << 32));
+}
+
+static __inline void
+load_xcr(u_int reg, u_long val)
+{
+	u_int low, high;
+
+	low = val;
+	high = val >> 32;
+	__asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high));
+}
+
 /*
  * Global TLB flush (except for thise for pages marked PG_G)
  */
@@ -674,6 +709,9 @@
 int	breakpoint(void);
 u_int	bsfl(u_int mask);
 u_int	bsrl(u_int mask);
+void	clflush(u_long addr);
+void	clts(void);
+void	cpuid_count(u_int ax, u_int cx, u_int *p);
 void	disable_intr(void);
 void	do_cpuid(u_int ax, u_int *p);
 void	enable_intr(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/elf.h
--- a/head/sys/amd64/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -94,6 +94,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/fpu.h
--- a/head/sys/amd64/include/fpu.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/fpu.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.h	5.3 (Berkeley) 1/18/91
- * $FreeBSD: head/sys/amd64/include/fpu.h 233044 2012-03-16 20:24:30Z tijl $
+ * $FreeBSD: head/sys/amd64/include/fpu.h 238598 2012-07-18 15:43:47Z kib $
  */
 
 /*
@@ -62,7 +62,8 @@
 	    char *xfpustate, size_t xfpustate_size);
 int	fpusetxstate(struct thread *td, char *xfpustate,
 	    size_t xfpustate_size);
-int	fputrap(void);
+int	fputrap_sse(void);
+int	fputrap_x87(void);
 void	fpuuserinited(struct thread *td);
 struct fpu_kern_ctx *fpu_kern_alloc_ctx(u_int flags);
 void	fpu_kern_free_ctx(struct fpu_kern_ctx *ctx);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/in_cksum.h
--- a/head/sys/amd64/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -43,6 +43,7 @@
 
 #define in_cksum(m, len)	in_cksum_skip(m, len, 0)
 
+#if defined(IPVERSION) && (IPVERSION == 4)
 /*
  * It it useful to have an Internet checksum routine which is inlineable
  * and optimized specifically for the task of computing IP header checksums
@@ -69,9 +70,12 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short	in_addword(u_short sum, u_short b);
 u_short	in_pseudo(u_int sum, u_int b, u_int c);
 u_short	in_cksum_skip(struct mbuf *m, int len, int skip);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/intr_machdep.h
--- a/head/sys/amd64/include/intr_machdep.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/intr_machdep.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/intr_machdep.h 234207 2012-04-13 07:15:40Z avg $
+ * $FreeBSD: head/sys/amd64/include/intr_machdep.h 234989 2012-05-03 21:44:01Z attilio $
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
@@ -140,9 +140,7 @@
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
-#ifdef SMP
 void	intr_add_cpu(u_int cpu);
-#endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter, 
 			 driver_intr_t handler, void *arg, enum intr_type flags, 
 			 void **cookiep);    
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/md_var.h
--- a/head/sys/amd64/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/md_var.h 230426 2012-01-21 17:45:27Z kib $
+ * $FreeBSD: head/sys/amd64/include/md_var.h 238450 2012-07-14 15:48:30Z kib $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -57,6 +57,7 @@
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
+extern	char	ctx_switch_xsave[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pcb.h
--- a/head/sys/amd64/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)pcb.h	5.10 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/pcb.h 230426 2012-01-21 17:45:27Z kib $
+ * $FreeBSD: head/sys/amd64/include/pcb.h 237037 2012-06-13 22:53:56Z jkim $
  */
 
 #ifndef _AMD64_PCB_H_
@@ -91,9 +91,20 @@
 	/* local tss, with i/o bitmap; NULL for common */
 	struct amd64tss *pcb_tssp;
 
+	/* model specific registers */
+	register_t	pcb_efer;
+	register_t	pcb_star;
+	register_t	pcb_lstar;
+	register_t	pcb_cstar;
+	register_t	pcb_sfmask;
+	register_t	pcb_xsmask;
+
+	/* fpu context for suspend/resume */
+	void		*pcb_fpususpend;
+
 	struct savefpu	*pcb_save;
 
-	uint64_t	pcb_pad[2];
+	uint64_t	pcb_pad[3];
 };
 
 #ifdef _KERNEL
@@ -130,7 +141,8 @@
 }
 
 void	makectx(struct trapframe *, struct pcb *);
-int	savectx(struct pcb *);
+int	savectx(struct pcb *) __returns_twice;
+void	resumectx(struct pcb *);
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pcpu.h
--- a/head/sys/amd64/include/pcpu.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pcpu.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/pcpu.h 230260 2012-01-17 07:21:23Z kib $
+ * $FreeBSD: head/sys/amd64/include/pcpu.h 238723 2012-07-23 19:16:31Z kib $
  */
 
 #ifndef _MACHINE_PCPU_H_
@@ -216,16 +216,36 @@
 #define	PCPU_PTR(member)	__PCPU_PTR(pc_ ## member)
 #define	PCPU_SET(member, val)	__PCPU_SET(pc_ ## member, val)
 
+#define	OFFSETOF_CURTHREAD	0
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnull-dereference"
+#endif
 static __inline __pure2 struct thread *
 __curthread(void)
 {
 	struct thread *td;
 
-	__asm("movq %%gs:0,%0" : "=r" (td));
+	__asm("movq %%gs:%1,%0" : "=r" (td)
+	    : "m" (*(char *)OFFSETOF_CURTHREAD));
 	return (td);
 }
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
 #define	curthread		(__curthread())
 
+#define	OFFSETOF_CURPCB		32
+static __inline __pure2 struct pcb *
+__curpcb(void)
+{
+	struct pcb *pcb;
+
+	__asm("movq %%gs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB));
+	return (pcb);
+}
+#define	curpcb		(__curpcb())
+
 #define	IS_BSP()	(PCPU_GET(cpuid) == 0)
 
 #else /* !lint || defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pmap.h
--- a/head/sys/amd64/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/pmap.h 222813 2011-06-07 08:46:13Z attilio $
+ * $FreeBSD: head/sys/amd64/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -295,7 +295,7 @@
 	pmap_t			pc_pmap;
 	TAILQ_ENTRY(pv_chunk)	pc_list;
 	uint64_t		pc_map[_NPCM];	/* bitmap; 1 = free */
-	uint64_t		pc_spare[2];
+	TAILQ_ENTRY(pv_chunk)	pc_lru;
 	struct pv_entry		pc_pventry[_NPCPV];
 };
 
@@ -309,6 +309,7 @@
 extern vm_offset_t virtual_end;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
 void	pmap_bootstrap(vm_paddr_t *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/smp.h
--- a/head/sys/amd64/include/smp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/smp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -6,7 +6,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: head/sys/amd64/include/smp.h 222853 2011-06-08 08:12:15Z avg $
+ * $FreeBSD: head/sys/amd64/include/smp.h 236938 2012-06-12 00:14:54Z iwasaki $
  *
  */
 
@@ -59,6 +59,7 @@
 void	cpustop_handler(void);
 void	cpususpend_handler(void);
 void	init_secondary(void);
+void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 void 	ipi_bitmap_handler(struct trapframe frame);
 void	ipi_cpu(int cpu, u_int ipi);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/amd64/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/amd64/include/vdso.h 237433 2012-06-22 07:06:40Z kib $ */
+
+#include <x86/vdso.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/vmparam.h
--- a/head/sys/amd64/include/vmparam.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/vmparam.h	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)vmparam.h	5.9 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/vmparam.h 221855 2011-05-13 19:35:01Z mdf $
+ * $FreeBSD: head/sys/amd64/include/vmparam.h 234743 2012-04-27 22:27:21Z rmh $
  */
 
 
@@ -54,7 +54,7 @@
  */
 #define	MAXTSIZ		(128UL*1024*1024)	/* max text size */
 #ifndef DFLDSIZ
-#define	DFLDSIZ		(128UL*1024*1024)	/* initial data size limit */
+#define	DFLDSIZ		(32768UL*1024*1024)	/* initial data size limit */
 #endif
 #ifndef MAXDSIZ
 #define	MAXDSIZ		(32768UL*1024*1024)	/* max data size */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux.h
--- a/head/sys/amd64/linux32/linux.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/linux32/linux.h 230132 2012-01-15 13:23:18Z uqs $
+ * $FreeBSD: head/sys/amd64/linux32/linux.h 235063 2012-05-05 19:42:38Z netchild $
  */
 
 #ifndef _AMD64_LINUX_H_
@@ -42,6 +42,7 @@
 #define	ldebug(name)	isclr(linux_debug_map, LINUX_SYS_linux_ ## name)
 #define	ARGS(nm, fmt)	"linux(%ld): "#nm"("fmt")\n", (long)td->td_proc->p_pid
 #define	LMSG(fmt)	"linux(%ld): "fmt"\n", (long)td->td_proc->p_pid
+#define	LINUX_DTRACE	linuxulator32
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_LINUX);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_dummy.c
--- a/head/sys/amd64/linux32/linux32_dummy.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_dummy.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,16 +27,25 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_dummy.c 234352 2012-04-16 21:22:02Z jkim $");
+__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_dummy.c 235063 2012-05-05 19:42:38Z netchild $");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
+#include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
+/* DTrace init */
+LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
+
 DUMMY(stime);
 DUMMY(olduname);
 DUMMY(syslog);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_proto.h
--- a/head/sys/amd64/linux32/linux32_proto.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_proto.h	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_proto.h 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_proto.h 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #ifndef _LINUX_SYSPROTO_H_
@@ -60,8 +60,8 @@
 };
 struct linux_execve_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
-	char argp_l_[PADL_(u_int32_t *)]; u_int32_t * argp; char argp_r_[PADR_(u_int32_t *)];
-	char envp_l_[PADL_(u_int32_t *)]; u_int32_t * envp; char envp_r_[PADR_(u_int32_t *)];
+	char argp_l_[PADL_(uint32_t *)]; uint32_t * argp; char argp_r_[PADR_(uint32_t *)];
+	char envp_l_[PADL_(uint32_t *)]; uint32_t * envp; char envp_r_[PADR_(uint32_t *)];
 };
 struct linux_chdir_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_syscall.h
--- a/head/sys/amd64/linux32/linux32_syscall.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_syscall.h	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_syscall.h 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_syscall.h 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #define	LINUX_SYS_exit	1
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_syscalls.c
--- a/head/sys/amd64/linux32/linux32_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_syscalls.c 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_syscalls.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 const char *linux_syscallnames[] = {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_sysent.c
--- a/head/sys/amd64/linux32/linux32_sysent.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_sysent.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_sysent.c 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_sysent.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #include "opt_compat.h"
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_systrace_args.c
--- a/head/sys/amd64/linux32/linux32_systrace_args.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_systrace_args.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,7 +2,7 @@
  * System call argument to DTrace register array converstion.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_systrace_args.c 234360 2012-04-16 23:17:29Z jkim $
+ * $FreeBSD: head/sys/amd64/linux32/linux32_systrace_args.c 236027 2012-05-25 21:52:57Z ed $
  * This file is part of the DTrace syscall provider.
  */
 
@@ -94,8 +94,8 @@
 	case 11: {
 		struct linux_execve_args *p = params;
 		uarg[0] = (intptr_t) p->path; /* char * */
-		uarg[1] = (intptr_t) p->argp; /* u_int32_t * */
-		uarg[2] = (intptr_t) p->envp; /* u_int32_t * */
+		uarg[1] = (intptr_t) p->argp; /* uint32_t * */
+		uarg[2] = (intptr_t) p->envp; /* uint32_t * */
 		*n_args = 3;
 		break;
 	}
@@ -2401,10 +2401,10 @@
 			p = "char *";
 			break;
 		case 1:
-			p = "u_int32_t *";
+			p = "uint32_t *";
 			break;
 		case 2:
-			p = "u_int32_t *";
+			p = "uint32_t *";
 			break;
 		default:
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/syscalls.master
--- a/head/sys/amd64/linux32/syscalls.master	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/syscalls.master	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
- $FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim $
+ $FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed $
 
 ;	@(#)syscalls.master	8.1 (Berkeley) 7/19/93
 ; System call name/number master file (or rather, slave, from LINUX).
@@ -54,8 +54,8 @@
 				    l_int mode); }
 9	AUE_LINK	STD	{ int linux_link(char *path, char *to); }
 10	AUE_UNLINK	STD	{ int linux_unlink(char *path); }
-11	AUE_EXECVE	STD	{ int linux_execve(char *path, u_int32_t *argp, \
-				    u_int32_t *envp); }
+11	AUE_EXECVE	STD	{ int linux_execve(char *path, uint32_t *argp, \
+				    uint32_t *envp); }
 12	AUE_CHDIR	STD	{ int linux_chdir(char *path); }
 13	AUE_NULL	STD	{ int linux_time(l_time_t *tm); }
 14	AUE_MKNOD	STD	{ int linux_mknod(char *path, l_int mode, \
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/cd9660/cd9660_node.c
--- a/head/sys/fs/cd9660/cd9660_node.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/cd9660/cd9660_node.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_node.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -65,7 +65,6 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
-	struct thread *td = ap->a_td;
 	struct iso_node *ip = VTOI(vp);
 	int error = 0;
 
@@ -74,7 +73,7 @@
 	 * so that it can be reused immediately.
 	 */
 	if (ip->inode.iso_mode == 0)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return error;
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/cd9660/cd9660_vfsops.c
--- a/head/sys/fs/cd9660/cd9660_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/cd9660/cd9660_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_vfsops.c 232485 2012-03-04 09:48:58Z kevlo $");
+__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -133,7 +133,7 @@
 	int error;
 	accmode_t accmode;
 	struct nameidata ndp;
-	struct iso_mnt *imp = 0;
+	struct iso_mnt *imp = NULL;
 
 	td = curthread;
 
@@ -214,7 +214,7 @@
 	int iso_bsize;
 	int iso_blknum;
 	int joliet_level;
-	struct iso_volume_descriptor *vdp = 0;
+	struct iso_volume_descriptor *vdp = NULL;
 	struct iso_primary_descriptor *pri = NULL;
 	struct iso_sierra_primary_descriptor *pri_sierra = NULL;
 	struct iso_supplementary_descriptor *sup = NULL;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/devfs/devfs_vnops.c
--- a/head/sys/fs/devfs/devfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/devfs/devfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
- * $FreeBSD: head/sys/fs/devfs/devfs_vnops.c 231949 2012-02-21 01:05:12Z kib $
+ * $FreeBSD: head/sys/fs/devfs/devfs_vnops.c 238029 2012-07-02 21:01:03Z kib $
  */
 
 /*
@@ -1170,18 +1170,14 @@
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
-
+	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		vfs_timestamp(&dev->si_atime);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
-	fp->f_nextoff = uio->uio_offset;
+	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
@@ -1648,8 +1644,7 @@
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
+	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	resid = uio->uio_resid;
 
@@ -1661,9 +1656,7 @@
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
-	fp->f_nextoff = uio->uio_offset;
+	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_inode.c
--- a/head/sys/fs/ext2fs/ext2_inode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_inode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_inode.c	8.5 (Berkeley) 12/30/93
- * $FreeBSD: head/sys/fs/ext2fs/ext2_inode.c 228583 2011-12-16 15:47:43Z pfg $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_inode.c 234607 2012-04-23 14:10:34Z trasz $
  */
 
 #include <sys/param.h>
@@ -249,7 +249,7 @@
 	bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof(newblks));
 	bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof(oldblks));
 	oip->i_size = osize;
-	error = vtruncbuf(ovp, cred, td, length, (int)fs->e2fs_bsize);
+	error = vtruncbuf(ovp, cred, length, (int)fs->e2fs_bsize);
 	if (error && (allerror == 0))
 		allerror = error;
 	vnode_pager_setsize(ovp, length);
@@ -498,7 +498,7 @@
 	 * so that it can be reused immediately.
 	 */
 	if (ip->i_mode == 0)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_lookup.c
--- a/head/sys/fs/ext2fs/ext2_lookup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_lookup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.6 (Berkeley) 4/1/94
- * $FreeBSD: head/sys/fs/ext2fs/ext2_lookup.c 231949 2012-02-21 01:05:12Z kib $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_lookup.c 235508 2012-05-16 15:53:38Z pfg $
  */
 
 #include <sys/param.h>
@@ -115,6 +115,8 @@
 
 static int	ext2_dirbadentry(struct vnode *dp, struct ext2fs_direct_2 *de,
 		    int entryoffsetinblock);
+static int	ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp,
+		    struct componentname *cnp, ino_t *dd_ino);
 
 /*
  * Vnode op for reading directories.
@@ -285,7 +287,14 @@
 		struct componentname *a_cnp;
 	} */ *ap;
 {
-	struct vnode *vdp;		/* vnode for directory being searched */
+
+	return (ext2_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+}
+
+static int
+ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+    ino_t *dd_ino)
+{
 	struct inode *dp;		/* inode for directory being searched */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct ext2fs_direct_2 *ep;	/* the current directory entry */
@@ -305,22 +314,22 @@
 	doff_t enduseful;		/* pointer past last used dir slot */
 	u_long bmask;			/* block offset mask */
 	int namlen, error;
-	struct vnode **vpp = ap->a_vpp;
-	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
-	ino_t ino;
+	ino_t ino, ino1;
 	int ltype;
 
-	int	DIRBLKSIZ = VTOI(ap->a_dvp)->i_e2fs->e2fs_bsize;
+	int	DIRBLKSIZ = VTOI(vdp)->i_e2fs->e2fs_bsize;
 
+	if (vpp != NULL)
+		*vpp = NULL;
+
+	dp = VTOI(vdp);
+	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+restart:
 	bp = NULL;
 	slotoffset = -1;
-	*vpp = NULL;
-	vdp = ap->a_dvp;
-	dp = VTOI(vdp);
-	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
@@ -536,10 +545,12 @@
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
-		cache_enter(vdp, *vpp, cnp);
+		cache_enter(vdp, NULL, cnp);
 	return (ENOENT);
 
 found:
+	if (dd_ino != NULL)
+		*dd_ino = ino;
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
@@ -582,6 +593,8 @@
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
+		if (dd_ino != NULL)
+			return (0);
 		if (dp->i_number == ino) {
 			VREF(vdp);
 			*vpp = vdp;
@@ -622,6 +635,8 @@
 		 */
 		if (dp->i_number == ino)
 			return (EISDIR);
+		if (dd_ino != NULL)
+			return (0);
 		if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
@@ -629,6 +644,8 @@
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
+	if (dd_ino != NULL)
+		return (0);
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
@@ -655,8 +672,27 @@
 		VOP_UNLOCK(pdp, 0);	/* race to get the inode */
 		error = VFS_VGET(vdp->v_mount, ino, cnp->cn_lkflags, &tdp);
 		vn_lock(pdp, ltype | LK_RETRY);
-		if (error != 0)
+		if (pdp->v_iflag & VI_DOOMED) {
+			if (error == 0)
+				vput(tdp);
+			error = ENOENT;
+		}
+		if (error)
 			return (error);
+		/*
+		 * Recheck that ".." entry in the vdp directory points
+		 * to the inode we looked up before vdp lock was
+		 * dropped.
+		 */
+		error = ext2_lookup_ino(pdp, NULL, cnp, &ino1);
+		if (error) {
+			vput(tdp);
+			return (error);
+		}
+		if (ino1 != ino) {
+			vput(tdp);
+			goto restart;
+		}
 		*vpp = tdp;
 	} else if (dp->i_number == ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_vfsops.c
--- a/head/sys/fs/ext2fs/ext2_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.8 (Berkeley) 4/18/94
- * $FreeBSD: head/sys/fs/ext2fs/ext2_vfsops.c 234386 2012-04-17 16:28:22Z mckusick $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 #include <sys/param.h>
@@ -112,7 +112,7 @@
 	struct vfsoptlist *opts;
 	struct vnode *devvp;
 	struct thread *td;
-	struct ext2mount *ump = 0;
+	struct ext2mount *ump = NULL;
 	struct m_ext2fs *fs;
 	struct nameidata nd, *ndp = &nd;
 	accmode_t accmode;
@@ -767,7 +767,7 @@
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	if (fs->e2fs->e2fs_magic != E2FS_MAGIC)
-		panic("ext2fs_statvfs");
+		panic("ext2fs_statfs");
 
 	/*
 	 * Compute the overhead (FS structures)
@@ -830,7 +830,6 @@
 	/*
 	 * Write back each (modified) inode.
 	 */
-	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
@@ -847,7 +846,6 @@
 		}
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
 		if (error) {
-			MNT_ILOCK(mp);
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_vnops.c
--- a/head/sys/fs/ext2fs/ext2_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *
  *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
  *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
- * $FreeBSD: head/sys/fs/ext2fs/ext2_vnops.c 234203 2012-04-13 05:48:31Z jh $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_vnops.c 235508 2012-05-16 15:53:38Z pfg $
  */
 
 #include "opt_suiddir.h"
@@ -1336,7 +1336,11 @@
 	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
 	    cnp->cn_thread);
 	cache_purge(ITOV(ip));
-	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+		VOP_UNLOCK(vp, 0);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	}
 out:
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/hpfs/hpfs_vnops.c
--- a/head/sys/fs/hpfs/hpfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/hpfs/hpfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/fs/hpfs/hpfs_vnops.c 235984 2012-05-25 09:16:59Z gleb $
  */
 
 #include <sys/param.h>
@@ -528,7 +528,7 @@
 		}
 
 		if (vap->va_size < hp->h_fn.fn_size) {
-			error = vtruncbuf(vp, cred, td, vap->va_size, DEV_BSIZE);
+			error = vtruncbuf(vp, cred, vap->va_size, DEV_BSIZE);
 			if (error)
 				return (error);
 			error = hpfs_truncate(hp, vap->va_size);
@@ -576,7 +576,7 @@
 	}
 
 	if (hp->h_flag & H_INVAL) {
-		vrecycle(vp, ap->a_td);
+		vrecycle(vp);
 		return (0);
 	}
 
@@ -797,10 +797,21 @@
 }
 
 
-static struct dirent hpfs_de_dot =
-	{ 0, sizeof(struct dirent), DT_DIR, 1, "." };
-static struct dirent hpfs_de_dotdot =
-	{ 0, sizeof(struct dirent), DT_DIR, 2, ".." };
+static struct dirent hpfs_de_dot = {
+	.d_fileno = 0,
+	.d_reclen = sizeof(struct dirent),
+	.d_type = DT_DIR,
+	.d_namlen = 1,
+	.d_name = "."
+};
+static struct dirent hpfs_de_dotdot = {
+	.d_fileno = 0,
+	.d_reclen = sizeof(struct dirent),
+	.d_type = DT_DIR,
+	.d_namlen = 2,
+	.d_name = ".."
+};
+
 int
 hpfs_readdir(ap)
 	struct vop_readdir_args /* {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/denode.h
--- a/head/sys/fs/msdosfs/denode.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/denode.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD$ */
+/* $FreeBSD: head/sys/fs/msdosfs/denode.h 234605 2012-04-23 13:21:28Z trasz $ */
 /*	$NetBSD: denode.h,v 1.25 1997/11/17 15:36:28 ws Exp $	*/
 
 /*-
@@ -276,6 +276,6 @@
 int createde(struct denode *dep, struct denode *ddep, struct denode **depp, struct componentname *cnp);
 int deupdat(struct denode *dep, int waitfor);
 int removede(struct denode *pdep, struct denode *dep);
-int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred, struct thread *td);
+int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred);
 int doscheckpath( struct denode *source, struct denode *target);
 #endif	/* _KERNEL */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_denode.c
--- a/head/sys/fs/msdosfs/msdosfs_denode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_denode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_denode.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_denode.c 234607 2012-04-23 14:10:34Z trasz $ */
 /*	$NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $	*/
 
 /*-
@@ -326,12 +326,11 @@
  * Truncate the file described by dep to the length specified by length.
  */
 int
-detrunc(dep, length, flags, cred, td)
+detrunc(dep, length, flags, cred)
 	struct denode *dep;
 	u_long length;
 	int flags;
 	struct ucred *cred;
-	struct thread *td;
 {
 	int error;
 	int allerror;
@@ -426,7 +425,7 @@
 	dep->de_FileSize = length;
 	if (!isadir)
 		dep->de_flag |= DE_UPDATE | DE_MODIFIED;
-	allerror = vtruncbuf(DETOV(dep), cred, td, length, pmp->pm_bpcluster);
+	allerror = vtruncbuf(DETOV(dep), cred, length, pmp->pm_bpcluster);
 #ifdef MSDOSFS_DEBUG
 	if (allerror)
 		printf("detrunc(): vtruncbuf error %d\n", allerror);
@@ -504,7 +503,7 @@
 		error = extendfile(dep, count, NULL, NULL, DE_CLEAR);
 		if (error) {
 			/* truncate the added clusters away again */
-			(void) detrunc(dep, dep->de_FileSize, 0, cred, NULL);
+			(void) detrunc(dep, dep->de_FileSize, 0, cred);
 			return (error);
 		}
 	}
@@ -584,7 +583,6 @@
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
-	struct thread *td = ap->a_td;
 	int error = 0;
 
 #ifdef MSDOSFS_DEBUG
@@ -607,7 +605,7 @@
 	       dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY);
 #endif
 	if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
-		error = detrunc(dep, (u_long) 0, 0, NOCRED, td);
+		error = detrunc(dep, (u_long) 0, 0, NOCRED);
 		dep->de_flag |= DE_UPDATE;
 		dep->de_Name[0] = SLOT_DELETED;
 	}
@@ -623,6 +621,6 @@
 	       vrefcnt(vp), dep->de_Name[0]);
 #endif
 	if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_lookup.c
--- a/head/sys/fs/msdosfs/msdosfs_lookup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_lookup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_lookup.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_lookup.c 238697 2012-07-22 15:40:31Z kevlo $ */
 /*	$NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $	*/
 
 /*-
@@ -108,7 +108,7 @@
 	struct denode *dp;
 	struct denode *tdp;
 	struct msdosfsmount *pmp;
-	struct buf *bp = 0;
+	struct buf *bp = NULL;
 	struct direntry *dep = NULL;
 	u_char dosfilename[12];
 	int flags = cnp->cn_flags;
@@ -649,7 +649,7 @@
 		dirclust = de_clcount(pmp, diroffset);
 		error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR);
 		if (error) {
-			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED, NULL);
+			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED);
 			return error;
 		}
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_vnops.c
--- a/head/sys/fs/msdosfs/msdosfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_vnops.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_vnops.c 234605 2012-04-23 13:21:28Z trasz $ */
 /*	$NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $	*/
 
 /*-
@@ -476,7 +476,7 @@
 			 */
 			break;
 		}
-		error = detrunc(dep, vap->va_size, 0, cred, td);
+		error = detrunc(dep, vap->va_size, 0, cred);
 		if (error)
 			return error;
 	}
@@ -835,11 +835,11 @@
 errexit:
 	if (error) {
 		if (ioflag & IO_UNIT) {
-			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL);
+			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		} else {
-			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL);
+			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED);
 			if (uio->uio_resid != resid)
 				error = 0;
 		}
@@ -1429,7 +1429,6 @@
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *ip, *dp;
-	struct thread *td = cnp->cn_thread;
 	int error;
 
 	ip = VTODE(vp);
@@ -1467,7 +1466,7 @@
 	/*
 	 * Truncate the directory that is being deleted.
 	 */
-	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, td);
+	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred);
 	cache_purge(vp);
 
 out:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/bmap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/bmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,621 @@
+/*-
+ * Copyright (c) 2012 Semihalf
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/bmap.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/ktr.h>
+#include <sys/kdb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+#include "bmap.h"
+
+static int bmap_getlbns(struct nandfs_node *, nandfs_lbn_t,
+    struct nandfs_indir *, int *);
+
+int
+bmap_lookup(struct nandfs_node *node, nandfs_lbn_t lblk, nandfs_daddr_t *vblk)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR + 1], *ap;
+	nandfs_daddr_t daddr;
+	struct buf *bp;
+	int error;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lblk %jx enter\n", __func__, node, lblk));
+	ip = &node->nn_inode;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	if (num == 0) {
+		*vblk = ip->i_db[lblk];
+		return (0);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx trying ip->i_ib[%x]\n", __func__,
+	    node, lblk, ap->in_off));
+	daddr = ip->i_ib[ap->in_off];
+	for (bp = NULL, ++ap; --num; ap++) {
+		if (daddr == 0) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with "
+			    "vblk 0\n", __func__, node, lblk));
+			*vblk = 0;
+			return (0);
+		}
+		if (ap->in_lbn == lblk) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx ap->in_lbn=%jx "
+			    "returning address of indirect block (%jx)\n",
+			    __func__, node, lblk, ap->in_lbn, daddr));
+			*vblk = daddr;
+			return (0);
+		}
+
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx reading block "
+		    "ap->in_lbn=%jx\n", __func__, node, lblk, ap->in_lbn));
+
+		error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off];
+		brelse(bp);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with %jx\n", __func__,
+	    node, lblk, daddr));
+	*vblk = daddr;
+
+	return (0);
+}
+
+int
+bmap_dirty_meta(struct nandfs_node *node, nandfs_lbn_t lblk, int force)
+{
+	struct nandfs_indir a[NIADDR+1], *ap;
+#ifdef DEBUG
+	nandfs_daddr_t daddr;
+#endif
+	struct buf *bp;
+	int error;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx\n", __func__, node, lblk));
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	/*
+	 * Direct block, nothing to do
+	 */
+	if (num == 0)
+		return (0);
+
+	DPRINTF(BMAP, ("%s: node %p reading blocks\n", __func__, node));
+
+	for (bp = NULL, ++ap; --num; ap++) {
+		error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+#ifdef DEBUG
+		daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off];
+		MPASS(daddr != 0 || node->nn_ino == 3);
+#endif
+
+		error = nandfs_dirty_buf_meta(bp, force);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+int
+bmap_insert_block(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t vblk)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR+1], *ap;
+	struct buf *bp;
+	nandfs_daddr_t daddr;
+	int error;
+	int num, *nump, i;
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx\n", __func__, node, lblk,
+	    vblk));
+
+	ip = &node->nn_inode;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx got num=%d\n", __func__,
+	    node, lblk, vblk, num));
+
+	if (num == 0) {
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx direct block\n", __func__,
+		    node, lblk));
+		ip->i_db[lblk] = vblk;
+		return (0);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect block level %d\n",
+	    __func__, node, lblk, ap->in_off));
+
+	if (num == 1) {
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect block: inserting "
+		    "%jx as vblk for indirect block %d\n", __func__, node,
+		    lblk, vblk, ap->in_off));
+		ip->i_ib[ap->in_off] = vblk;
+		return (0);
+	}
+
+	bp = NULL;
+	daddr = ip->i_ib[a[0].in_off];
+	for (i = 1; i < num; i++) {
+		if (bp)
+			brelse(bp);
+		if (daddr == 0) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx create "
+			    "block %jx %d\n", __func__, node, lblk, vblk,
+			    a[i].in_lbn, a[i].in_off));
+			error = nandfs_bcreate_meta(node, a[i].in_lbn, NOCRED,
+			    0, &bp);
+			if (error)
+				return (error);
+		} else {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx read "
+			    "block %jx %d\n", __func__, node, daddr, vblk,
+			    a[i].in_lbn, a[i].in_off));
+			error = nandfs_bread_meta(node, a[i].in_lbn, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		}
+		daddr = ((nandfs_daddr_t *)bp->b_data)[a[i].in_off];
+	}
+	i--;
+
+	DPRINTF(BMAP,
+	    ("%s: bmap node %p lblk=%jx vblk=%jx inserting vblk level %d at "
+	    "offset %d at %jx\n", __func__, node, lblk, vblk, i, a[i].in_off,
+	    daddr));
+
+	if (!bp) {
+		nandfs_error("%s: cannot find indirect block\n", __func__);
+		return (-1);
+	}
+	((nandfs_daddr_t *)bp->b_data)[a[i].in_off] = vblk;
+
+	error = nandfs_dirty_buf_meta(bp, 0);
+	if (error) {
+		nandfs_warning("%s: dirty failed buf: %p\n", __func__, bp);
+		return (error);
+	}
+	DPRINTF(BMAP, ("%s: exiting node %p lblk=%jx vblk=%jx\n", __func__,
+	    node, lblk, vblk));
+
+	return (error);
+}
+
+CTASSERT(NIADDR <= 3);
+#define SINGLE	0	/* index of single indirect block */
+#define DOUBLE	1	/* index of double indirect block */
+#define TRIPLE	2	/* index of triple indirect block */
+
+static __inline nandfs_lbn_t
+lbn_offset(struct nandfs_device *fsdev, int level)
+{
+	nandfs_lbn_t res;
+
+	for (res = 1; level > 0; level--)
+		res *= MNINDIR(fsdev);
+	return (res);
+}
+
+static nandfs_lbn_t
+blocks_inside(struct nandfs_device *fsdev, int level, struct nandfs_indir *nip)
+{
+	nandfs_lbn_t blocks;
+
+	for (blocks = 1; level >= SINGLE; level--, nip++) {
+		MPASS(nip->in_off >= 0 && nip->in_off < MNINDIR(fsdev));
+		blocks += nip->in_off * lbn_offset(fsdev, level);
+	}
+
+	return (blocks);
+}
+
+static int
+bmap_truncate_indirect(struct nandfs_node *node, int level, nandfs_lbn_t *left,
+    int *cleaned, struct nandfs_indir *ap, struct nandfs_indir *fp,
+    nandfs_daddr_t *copy)
+{
+	struct buf *bp;
+	nandfs_lbn_t i, lbn, nlbn, factor, tosub;
+	struct nandfs_device *fsdev;
+	int error, lcleaned, modified;
+
+	DPRINTF(BMAP, ("%s: node %p level %d left %jx\n", __func__,
+	    node, level, *left));
+
+	fsdev = node->nn_nandfsdev;
+
+	MPASS(ap->in_off >= 0 && ap->in_off < MNINDIR(fsdev));
+
+	factor = lbn_offset(fsdev, level);
+	lbn = ap->in_lbn;
+
+	error = nandfs_bread_meta(node, lbn, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	bcopy(bp->b_data, copy, fsdev->nd_blocksize);
+	bqrelse(bp);
+
+	modified = 0;
+
+	i = ap->in_off;
+
+	if (ap != fp)
+		ap++;
+	for (nlbn = lbn + 1 - i * factor; i >= 0 && *left > 0; i--,
+	    nlbn += factor) {
+		lcleaned = 0;
+
+		DPRINTF(BMAP,
+		    ("%s: node %p i=%jx nlbn=%jx left=%jx ap=%p vblk %jx\n",
+		    __func__, node, i, nlbn, *left, ap, copy[i]));
+
+		if (copy[i] == 0) {
+			tosub = blocks_inside(fsdev, level - 1, ap);
+			if (tosub > *left)
+				tosub = 0;
+
+			*left -= tosub;
+		} else {
+			if (level > SINGLE) {
+				if (ap == fp)
+					ap->in_lbn = nlbn;
+
+				error = bmap_truncate_indirect(node, level - 1,
+				    left, &lcleaned, ap, fp,
+				    copy + MNINDIR(fsdev));
+				if (error)
+					return (error);
+			} else {
+				error = nandfs_bdestroy(node, copy[i]);
+				if (error)
+					return (error);
+				lcleaned = 1;
+				*left -= 1;
+			}
+		}
+
+		if (lcleaned) {
+			if (level > SINGLE) {
+				error = nandfs_vblock_end(fsdev, copy[i]);
+				if (error)
+					return (error);
+			}
+			copy[i] = 0;
+			modified++;
+		}
+
+		ap = fp;
+	}
+
+	if (i == -1)
+		*cleaned = 1;
+
+	error = nandfs_bread_meta(node, lbn, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	if (modified)
+		bcopy(copy, bp->b_data, fsdev->nd_blocksize);
+
+	error = nandfs_dirty_buf_meta(bp, 0);
+	if (error)
+		return (error);
+
+	return (error);
+}
+
+int
+bmap_truncate_mapping(struct nandfs_node *node, nandfs_lbn_t lastblk,
+    nandfs_lbn_t todo)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR + 1], f[NIADDR], *ap;
+	nandfs_daddr_t indir_lbn[NIADDR];
+	nandfs_daddr_t *copy;
+	int error, level;
+	nandfs_lbn_t left, tosub;
+	struct nandfs_device *fsdev;
+	int cleaned, i;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lastblk %jx truncating by %jx\n", __func__,
+	    node, lastblk, todo));
+
+	ip = &node->nn_inode;
+	fsdev = node->nn_nandfsdev;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lastblk, ap, nump);
+	if (error)
+		return (error);
+
+	indir_lbn[SINGLE] = -NDADDR;
+	indir_lbn[DOUBLE] = indir_lbn[SINGLE] - MNINDIR(fsdev) - 1;
+	indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - MNINDIR(fsdev)
+	    * MNINDIR(fsdev) - 1;
+
+	for (i = 0; i < NIADDR; i++) {
+		f[i].in_off = MNINDIR(fsdev) - 1;
+		f[i].in_lbn = 0xdeadbeef;
+	}
+
+	left = todo;
+
+#ifdef DEBUG
+	a[num].in_off = -1;
+#endif
+
+	ap++;
+	num -= 2;
+
+	if (num < 0)
+		goto direct;
+
+	copy = malloc(MNINDIR(fsdev) * sizeof(nandfs_daddr_t) * (num + 1),
+	    M_NANDFSTEMP, M_WAITOK);
+
+	for (level = num; level >= SINGLE && left > 0; level--) {
+		cleaned = 0;
+
+		if (ip->i_ib[level] == 0) {
+			tosub = blocks_inside(fsdev, level, ap);
+			if (tosub > left)
+				left = 0;
+			else
+				left -= tosub;
+		} else {
+			if (ap == f)
+				ap->in_lbn = indir_lbn[level];
+			error = bmap_truncate_indirect(node, level, &left,
+			    &cleaned, ap, f, copy);
+			if (error) {
+				nandfs_error("%s: error %d when truncate "
+				    "at level %d\n", __func__, error, level);
+				return (error);
+			}
+		}
+
+		if (cleaned) {
+			nandfs_vblock_end(fsdev, ip->i_ib[level]);
+			ip->i_ib[level] = 0;
+		}
+
+		ap = f;
+	}
+
+	free(copy, M_NANDFSTEMP);
+
+direct:
+	if (num < 0)
+		i = lastblk;
+	else
+		i = NDADDR - 1;
+
+	for (; i >= 0 && left > 0; i--) {
+		if (ip->i_db[i] != 0) {
+			error = nandfs_bdestroy(node, ip->i_db[i]);
+			if (error) {
+				nandfs_error("%s: cannot destroy "
+				    "block %jx, error %d\n", __func__,
+				    (uintmax_t)ip->i_db[i], error);
+				return (error);
+			}
+			ip->i_db[i] = 0;
+		}
+
+		left--;
+	}
+
+	KASSERT(left == 0,
+	    ("truncated wrong number of blocks (%jd should be 0)", left));
+
+	return (error);
+}
+
+nandfs_lbn_t
+get_maxfilesize(struct nandfs_device *fsdev)
+{
+	struct nandfs_indir f[NIADDR];
+	nandfs_lbn_t max;
+	int i;
+
+	max = NDADDR;
+
+	for (i = 0; i < NIADDR; i++) {
+		f[i].in_off = MNINDIR(fsdev) - 1;
+		max += blocks_inside(fsdev, i, f);
+	}
+
+	max *= fsdev->nd_blocksize;
+
+	return (max);
+}
+
+/*
+ * This is ufs_getlbns with minor modifications.
+ */
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block.  The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ib and
+ * once with the offset into the page itself.
+ */
+static int
+bmap_getlbns(struct nandfs_node *node, nandfs_lbn_t bn, struct nandfs_indir *ap, int *nump)
+{
+	nandfs_daddr_t blockcnt;
+	nandfs_lbn_t metalbn, realbn;
+	struct nandfs_device *fsdev;
+	int i, numlevels, off;
+
+	fsdev = node->nn_nandfsdev;
+
+	DPRINTF(BMAP, ("%s: node %p bn=%jx mnindir=%zd enter\n", __func__,
+	    node, bn, MNINDIR(fsdev)));
+
+	*nump = 0;
+	numlevels = 0;
+	realbn = bn;
+
+	if (bn < 0)
+		bn = -bn;
+
+	/* The first NDADDR blocks are direct blocks. */
+	if (bn < NDADDR)
+		return (0);
+
+	/*
+	 * Determine the number of levels of indirection.  After this loop
+	 * is done, blockcnt indicates the number of data blocks possible
+	 * at the previous level of indirection, and NIADDR - i is the number
+	 * of levels of indirection needed to locate the requested block.
+	 */
+	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
+		DPRINTF(BMAP, ("%s: blockcnt=%jd i=%d bn=%jd\n", __func__,
+		    blockcnt, i, bn));
+		if (i == 0)
+			return (EFBIG);
+		blockcnt *= MNINDIR(fsdev);
+		if (bn < blockcnt)
+			break;
+	}
+
+	/* Calculate the address of the first meta-block. */
+	if (realbn >= 0)
+		metalbn = -(realbn - bn + NIADDR - i);
+	else
+		metalbn = -(-realbn - bn + NIADDR - i);
+
+	/*
+	 * At each iteration, off is the offset into the bap array which is
+	 * an array of disk addresses at the current level of indirection.
+	 * The logical block number and the offset in that block are stored
+	 * into the argument array.
+	 */
+	ap->in_lbn = metalbn;
+	ap->in_off = off = NIADDR - i;
+
+	DPRINTF(BMAP, ("%s: initial: ap->in_lbn=%jx ap->in_off=%d\n", __func__,
+	    metalbn, off));
+
+	ap++;
+	for (++numlevels; i <= NIADDR; i++) {
+		/* If searching for a meta-data block, quit when found. */
+		if (metalbn == realbn)
+			break;
+
+		blockcnt /= MNINDIR(fsdev);
+		off = (bn / blockcnt) % MNINDIR(fsdev);
+
+		++numlevels;
+		ap->in_lbn = metalbn;
+		ap->in_off = off;
+
+		DPRINTF(BMAP, ("%s: in_lbn=%jx in_off=%d\n", __func__,
+		    ap->in_lbn, ap->in_off));
+		++ap;
+
+		metalbn -= -1 + off * blockcnt;
+	}
+	if (nump)
+		*nump = numlevels;
+
+	DPRINTF(BMAP, ("%s: numlevels=%d\n", __func__, numlevels));
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/bmap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/bmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2012 Semihalf
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/fs/nandfs/bmap.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _BMAP_H
+#define _BMAP_H
+
+#include "nandfs_fs.h"
+
+int bmap_lookup(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t *);
+int bmap_insert_block(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t);
+int bmap_truncate_mapping(struct nandfs_node *, nandfs_lbn_t, nandfs_lbn_t);
+int bmap_dirty_meta(struct nandfs_node *, nandfs_lbn_t, int);
+
+nandfs_lbn_t get_maxfilesize(struct nandfs_device *);
+
+#endif /* _BMAP_H */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,310 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_H_
+#define _FS_NANDFS_NANDFS_H_
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <sys/queue.h>
+#include <sys/uio.h>
+#include <sys/mutex.h>
+
+#include <sys/disk.h>
+#include <sys/kthread.h>
+#include "nandfs_fs.h"
+
+MALLOC_DECLARE(M_NANDFSTEMP);
+
+/* Debug categories */
+#define	NANDFS_DEBUG_VOLUMES		0x000001
+#define	NANDFS_DEBUG_BLOCK		0x000004
+#define	NANDFS_DEBUG_LOCKING		0x000008
+#define	NANDFS_DEBUG_NODE		0x000010
+#define	NANDFS_DEBUG_LOOKUP		0x000020
+#define	NANDFS_DEBUG_READDIR		0x000040
+#define	NANDFS_DEBUG_TRANSLATE		0x000080
+#define	NANDFS_DEBUG_STRATEGY		0x000100
+#define	NANDFS_DEBUG_READ		0x000200
+#define	NANDFS_DEBUG_WRITE		0x000400
+#define	NANDFS_DEBUG_IFILE		0x000800
+#define	NANDFS_DEBUG_ATTR		0x001000
+#define	NANDFS_DEBUG_EXTATTR		0x002000
+#define	NANDFS_DEBUG_ALLOC		0x004000
+#define	NANDFS_DEBUG_CPFILE		0x008000
+#define	NANDFS_DEBUG_DIRHASH		0x010000
+#define	NANDFS_DEBUG_NOTIMPL		0x020000
+#define	NANDFS_DEBUG_SHEDULE		0x040000
+#define	NANDFS_DEBUG_SEG		0x080000
+#define	NANDFS_DEBUG_SYNC		0x100000
+#define	NANDFS_DEBUG_PARANOIA		0x200000
+#define	NANDFS_DEBUG_VNCALL		0x400000
+#define	NANDFS_DEBUG_BUF		0x1000000
+#define	NANDFS_DEBUG_BMAP		0x2000000
+#define	NANDFS_DEBUG_DAT		0x4000000
+#define	NANDFS_DEBUG_GENERIC		0x8000000
+#define	NANDFS_DEBUG_CLEAN		0x10000000
+
+extern int nandfs_verbose;
+
+#define	DPRINTF(name, arg) { \
+		if (nandfs_verbose & NANDFS_DEBUG_##name) {\
+			printf arg;\
+		};\
+	}
+#define	DPRINTFIF(name, cond, arg) { \
+		if (nandfs_verbose & NANDFS_DEBUG_##name) { \
+			if (cond) printf arg;\
+		};\
+	}
+
+#define	VFSTONANDFS(mp)    ((struct nandfsmount *)((mp)->mnt_data))
+#define	VTON(vp) ((struct nandfs_node *)(vp)->v_data)
+#define	NTOV(xp) ((xp)->nn_vnode)
+
+int nandfs_init(struct vfsconf *);
+int nandfs_uninit(struct vfsconf *);
+
+extern struct vop_vector nandfs_vnodeops;
+extern struct vop_vector nandfs_system_vnodeops;
+
+struct nandfs_node;
+
+/* Structure and derivatives */
+struct nandfs_mdt {
+	uint32_t	entries_per_block;
+	uint32_t	entries_per_group;
+	uint32_t	blocks_per_group;
+	uint32_t	groups_per_desc_block;	/* desc is super group */
+	uint32_t	blocks_per_desc_block;	/* desc is super group */
+};
+
+struct nandfs_segment {
+	LIST_ENTRY(nandfs_segment) seg_link;
+
+	struct nandfs_device	*fsdev;
+
+	TAILQ_HEAD(, buf)	 segsum;
+	TAILQ_HEAD(, buf)	 data;
+
+	uint64_t		 seg_num;
+	uint64_t		 seg_next;
+	uint64_t		 start_block;
+	uint32_t		 num_blocks;
+
+	uint32_t		 nblocks;
+	uint32_t		 nbinfos;
+	uint32_t		 segsum_blocks;
+	uint32_t		 segsum_bytes;
+	uint32_t		 bytes_left;
+	char			*current_off;
+};
+
+struct nandfs_seginfo {
+	LIST_HEAD( ,nandfs_segment)	seg_list;
+	struct nandfs_segment		*curseg;
+	struct nandfs_device		*fsdev;
+	uint32_t			blocks;
+	uint8_t				reiterate;
+};
+
+#define	NANDFS_FSSTOR_FAILED	1
+struct nandfs_fsarea {
+	int	offset;
+	int	flags;
+	int	last_used;
+};
+
+extern int nandfs_cleaner_enable;
+extern int nandfs_cleaner_interval;
+extern int nandfs_cleaner_segments;
+
+struct nandfs_device {
+	struct vnode		*nd_devvp;
+	struct g_consumer	*nd_gconsumer;
+
+	struct thread		*nd_syncer;
+	struct thread		*nd_cleaner;
+	int			nd_syncer_exit;
+	int			nd_cleaner_exit;
+
+	int			nd_is_nand;
+
+	struct nandfs_fsarea	nd_fsarea[NANDFS_NFSAREAS];
+	int			nd_last_fsarea;
+
+	STAILQ_HEAD(nandfs_mnts, nandfsmount)	nd_mounts;
+	SLIST_ENTRY(nandfs_device)		nd_next_device;
+
+	/* FS structures */
+	struct nandfs_fsdata		nd_fsdata;
+	struct nandfs_super_block	nd_super;
+	struct nandfs_segment_summary	nd_last_segsum;
+	struct nandfs_super_root	nd_super_root;
+	struct nandfs_node	*nd_dat_node;
+	struct nandfs_node	*nd_cp_node;
+	struct nandfs_node	*nd_su_node;
+	struct nandfs_node	*nd_gc_node;
+
+	struct nandfs_mdt	nd_dat_mdt;
+	struct nandfs_mdt	nd_ifile_mdt;
+
+	struct timespec		nd_ts;
+
+	/* Synchronization */
+	struct mtx		nd_mutex;
+	struct mtx		nd_sync_mtx;
+	struct cv		nd_sync_cv;
+	struct mtx		nd_clean_mtx;
+	struct cv		nd_clean_cv;
+	struct lock		nd_seg_const;
+
+	struct nandfs_seginfo	*nd_seginfo;
+
+	/* FS geometry */
+	uint64_t		nd_devsize;
+	uint64_t		nd_maxfilesize;
+	uint32_t		nd_blocksize;
+	uint32_t		nd_erasesize;
+
+	uint32_t		nd_devblocksize;
+
+	/* Segment usage */
+	uint64_t		nd_clean_segs;
+	uint64_t		*nd_free_base;
+	uint64_t		nd_free_count;
+	uint64_t		nd_dirty_bufs;
+
+	/* Running values */
+	uint64_t		nd_seg_sequence;
+	uint64_t		nd_seg_num;
+	uint64_t		nd_next_seg_num;
+	uint64_t		nd_last_pseg;
+	uint64_t		nd_last_cno;
+	uint64_t		nd_last_ino;
+	uint64_t		nd_fakevblk;
+
+	int			nd_mount_state;
+	int			nd_refcnt;
+	int			nd_syncing;
+	int			nd_cleaning;
+};
+
+extern SLIST_HEAD(_nandfs_devices, nandfs_device) nandfs_devices;
+
+#define	NANDFS_FORCE_SYNCER	0x1
+#define	NANDFS_UMOUNT		0x2
+
+#define	SYNCER_UMOUNT		0x0
+#define	SYNCER_VFS_SYNC		0x1
+#define	SYNCER_BDFLUSH		0x2
+#define	SYNCER_FFORCE		0x3
+#define	SYNCER_FSYNC		0x4
+#define	SYNCER_ROUPD		0x5
+
+static __inline int
+nandfs_writelockflags(struct nandfs_device *fsdev, int flags)
+{
+	int error = 0;
+
+	if (lockstatus(&fsdev->nd_seg_const) != LK_EXCLUSIVE)
+		error = lockmgr(&fsdev->nd_seg_const, flags | LK_SHARED, NULL);
+
+	return (error);
+}
+
+static __inline void
+nandfs_writeunlock(struct nandfs_device *fsdev)
+{
+
+	if (lockstatus(&fsdev->nd_seg_const) != LK_EXCLUSIVE)
+		lockmgr(&(fsdev)->nd_seg_const, LK_RELEASE, NULL);
+}
+
+#define NANDFS_WRITELOCKFLAGS(fsdev, flags)	nandfs_writelockflags(fsdev, flags)
+
+#define NANDFS_WRITELOCK(fsdev) NANDFS_WRITELOCKFLAGS(fsdev, 0)
+
+#define NANDFS_WRITEUNLOCK(fsdev) nandfs_writeunlock(fsdev)
+
+#define NANDFS_WRITEASSERT(fsdev) lockmgr_assert(&(fsdev)->nd_seg_const, KA_LOCKED)
+
+/* Specific mountpoint; head or a checkpoint/snapshot */
+struct nandfsmount {
+	STAILQ_ENTRY(nandfsmount) nm_next_mount;
+
+	struct mount		*nm_vfs_mountp;
+	struct nandfs_device	*nm_nandfsdev;
+	struct nandfs_args	nm_mount_args;
+	struct nandfs_node	*nm_ifile_node;
+
+	uint8_t			nm_flags;
+	int8_t			nm_ronly;
+};
+
+struct nandfs_node {
+	struct vnode			*nn_vnode;
+	struct nandfsmount		*nn_nmp;
+	struct nandfs_device		*nn_nandfsdev;
+	struct lockf			*nn_lockf;
+
+	uint64_t			nn_ino;
+	struct nandfs_inode		nn_inode;
+
+	uint64_t			nn_diroff;
+	uint32_t			nn_flags;
+};
+
+#define	IN_ACCESS	0x0001	/* Inode access time update request  */
+#define	IN_CHANGE	0x0002	/* Inode change time update request  */
+#define	IN_UPDATE	0x0004	/* Inode was written to; update mtime*/
+#define	IN_MODIFIED	0x0008	/* node has been modified */
+#define	IN_RENAME	0x0010	/* node is being renamed. */
+
+/* File permissions. */
+#define	IEXEC		0000100	/* Executable. */
+#define	IWRITE		0000200	/* Writeable. */
+#define	IREAD		0000400	/* Readable. */
+#define	ISVTX		0001000	/* Sticky bit. */
+#define	ISGID		0002000	/* Set-gid. */
+#define	ISUID		0004000	/* Set-uid. */
+
+#define	PRINT_NODE_FLAGS \
+	"\10\1IN_ACCESS\2IN_CHANGE\3IN_UPDATE\4IN_MODIFIED\5IN_RENAME"
+
+#define	NANDFS_GATHER(x) ((x)->b_flags |= B_00800000)
+#define	NANDFS_UNGATHER(x) ((x)->b_flags &= ~B_00800000)
+#define	NANDFS_ISGATHERED(x) ((x)->b_flags & B_00800000)
+
+#endif /* !_FS_NANDFS_NANDFS_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_alloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_alloc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,364 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_alloc.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Compute the file-relative block number of descriptor block 'desc'
+ * (each descriptor block governs mdt->blocks_per_desc_block blocks).
+ */
+static void
+nandfs_get_desc_block_nr(struct nandfs_mdt *mdt, uint64_t desc,
+    uint64_t *desc_block)
+{
+
+	*desc_block = desc * mdt->blocks_per_desc_block;
+}
+
+/*
+ * Compute the file-relative block number of the bitmap block for 'group':
+ * skip to the containing descriptor block, step over it (+1), then over
+ * the groups that precede this one inside it.
+ */
+static void
+nandfs_get_group_block_nr(struct nandfs_mdt *mdt, uint64_t group,
+    uint64_t *group_block)
+{
+	uint64_t desc, group_off;
+
+	desc = group / mdt->groups_per_desc_block;
+	group_off = group % mdt->groups_per_desc_block;
+	*group_block = desc * mdt->blocks_per_desc_block +
+	    1 + group_off * mdt->blocks_per_group;
+}
+
+/*
+ * Initialize a freshly created descriptor block: every group in it starts
+ * with all of its entries free.
+ */
+static void
+init_desc_block(struct nandfs_mdt *mdt, uint8_t *block_data)
+{
+	struct nandfs_block_group_desc *desc;
+	uint32_t i;
+
+	desc = (struct nandfs_block_group_desc *) block_data;
+	for (i = 0; i < mdt->groups_per_desc_block; i++)
+		desc[i].bg_nfrees = mdt->entries_per_group;
+}
+
+/*
+ * Scan the allocation bitmaps of 'node' for a free entry, starting near
+ * req->entrynum and wrapping around once.  On success, req->entrynum is
+ * set to the free entry's number and the descriptor and bitmap buffers
+ * are returned held in req->bp_desc / req->bp_bitmap.  Descriptor and
+ * bitmap blocks beyond EOF are created on the fly.  Returns 0, ENOENT if
+ * no free entry exists, or an I/O error.
+ */
+int
+nandfs_find_free_entry(struct nandfs_mdt *mdt, struct nandfs_node *node,
+    struct nandfs_alloc_request *req)
+{
+	nandfs_daddr_t desc, group, maxgroup, maxdesc, pos = 0;
+	nandfs_daddr_t start_group, start_desc;
+	nandfs_daddr_t desc_block, group_block;
+	nandfs_daddr_t file_blocks;
+	struct nandfs_block_group_desc *descriptors;
+	struct buf *bp, *bp2;
+	uint32_t *mask, i, mcount, msize;
+	int error;
+
+	file_blocks = node->nn_inode.i_blocks;
+	/* Entry numbers are 32-bit, hence the 2^32 cap. */
+	maxgroup = 0x100000000ull / mdt->entries_per_group;
+	maxdesc = maxgroup / mdt->groups_per_desc_block;
+	start_group = req->entrynum / mdt->entries_per_group;
+	start_desc = start_group / mdt->groups_per_desc_block;
+
+	bp = bp2 = NULL;
+restart:
+	for (desc = start_desc; desc < maxdesc; desc++) {
+		nandfs_get_desc_block_nr(mdt, desc, &desc_block);
+
+		/* Drop the previous iteration's descriptor buffer. */
+		if (bp)
+			brelse(bp);
+		if (desc_block < file_blocks) {
+			error = nandfs_bread(node, desc_block, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		} else {
+			error = nandfs_bcreate(node, desc_block, NOCRED, 0,
+			    &bp);
+			if (error)
+				return (error);
+			file_blocks++;
+			init_desc_block(mdt, bp->b_data);
+		}
+
+		descriptors = (struct nandfs_block_group_desc *) bp->b_data;
+		for (group = start_group; group < mdt->groups_per_desc_block;
+		    group++) {
+			if (descriptors[group].bg_nfrees > 0) {
+				nandfs_get_group_block_nr(mdt, group,
+				    &group_block);
+
+				/* Drop the previous group's bitmap buffer. */
+				if (bp2)
+					brelse(bp2);
+				if (group_block < file_blocks) {
+					error = nandfs_bread(node, group_block,
+					    NOCRED, 0, &bp2);
+					if (error) {
+						brelse(bp2);
+						brelse(bp);
+						return (error);
+					}
+				} else {
+					error = nandfs_bcreate(node,
+					    group_block, NOCRED, 0, &bp2);
+					if (error) {
+						/*
+						 * Fix: release the held
+						 * descriptor buffer; it was
+						 * leaked on this path.
+						 */
+						brelse(bp);
+						return (error);
+					}
+					file_blocks++;
+				}
+				mask = (uint32_t *)bp2->b_data;
+				msize = (sizeof(uint32_t) * __CHAR_BIT);
+				mcount = mdt->entries_per_group / msize;
+				for (i = 0; i < mcount; i++) {
+					if (mask[i] == UINT32_MAX)
+						continue;
+
+					/* First clear bit in this word. */
+					pos = ffs(~mask[i]) - 1;
+					pos += (msize * i);
+					pos += (group * mdt->entries_per_group);
+					/*
+					 * Entries preceding this descriptor
+					 * block.  Fix: the original also
+					 * multiplied by 'group', dropping
+					 * the desc contribution entirely
+					 * whenever group == 0.
+					 */
+					pos += desc *
+					    mdt->groups_per_desc_block *
+					    mdt->entries_per_group;
+					goto found;
+				}
+			}
+		}
+		start_group = 0;
+	}
+
+	if (start_desc != 0) {
+		/* Wrap around: rescan from the start, up to where we began. */
+		maxdesc = start_desc;
+		start_desc = 0;
+		req->entrynum = 0;
+		goto restart;
+	}
+
+	/* Fix: drop buffers still held from the last iteration. */
+	if (bp)
+		brelse(bp);
+	if (bp2)
+		brelse(bp2);
+
+	return (ENOENT);
+
+found:
+	req->entrynum = pos;
+	req->bp_desc = bp;
+	req->bp_bitmap = bp2;
+	DPRINTF(ALLOC, ("%s: desc: %p bitmap: %p entry: %#jx\n",
+	    __func__, req->bp_desc, req->bp_bitmap, (uintmax_t)pos));
+
+	return (0);
+}
+
+/*
+ * Load the descriptor, bitmap and entry blocks covering req->entrynum
+ * into req->bp_desc / req->bp_bitmap / req->bp_entry (all returned held).
+ * On error every buffer read so far is released.
+ */
+int
+nandfs_find_entry(struct nandfs_mdt* mdt, struct nandfs_node *nnode,
+    struct nandfs_alloc_request *req)
+{
+	uint64_t dblock, bblock, eblock;
+	uint32_t offset;
+	int error;
+
+	/* Translate the entry number into its three backing block numbers. */
+	nandfs_mdt_trans_blk(mdt, req->entrynum, &dblock, &bblock, &eblock,
+	    &offset);
+
+	error = nandfs_bread(nnode, dblock, NOCRED, 0, &req->bp_desc);
+	if (error) {
+		brelse(req->bp_desc);
+		return (error);
+	}
+
+	error = nandfs_bread(nnode, bblock, NOCRED, 0, &req->bp_bitmap);
+	if (error) {
+		brelse(req->bp_desc);
+		brelse(req->bp_bitmap);
+		return (error);
+	}
+
+	error = nandfs_bread(nnode, eblock, NOCRED, 0, &req->bp_entry);
+	if (error) {
+		brelse(req->bp_desc);
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (error);
+	}
+
+	DPRINTF(ALLOC,
+	    ("%s: desc_buf: %p bitmap_buf %p entry_buf %p offset %x\n",
+	    __func__, req->bp_desc, req->bp_bitmap, req->bp_entry, offset));
+
+	return (0);
+}
+
+/*
+ * Break an entry number down into its group within the descriptor block,
+ * the 32-bit word index inside the group bitmap, and the bit offset
+ * inside that word.
+ */
+static __inline void
+nandfs_calc_idx_entry(struct nandfs_mdt* mdt, uint32_t entrynum,
+    uint64_t *group, uint64_t *bitmap_idx, uint64_t *bitmap_off)
+{
+
+	/* Find group_desc index */
+	entrynum = entrynum %
+	    (mdt->entries_per_group * mdt->groups_per_desc_block);
+	*group = entrynum / mdt->entries_per_group;
+	/* Find bitmap index and bit offset */
+	entrynum = entrynum % mdt->entries_per_group;
+	*bitmap_idx = entrynum / (sizeof(uint32_t) * __CHAR_BIT);
+	*bitmap_off = entrynum % (sizeof(uint32_t) * __CHAR_BIT);
+}
+
+/*
+ * Release entry req->entrynum: bump the group's free-entry counter and
+ * clear the entry's bit in the bitmap (a set bit means allocated), then
+ * dirty the descriptor, bitmap and entry buffers.  Returns 0, or -1 when
+ * the descriptor buffer could not be dirtied (remaining buffers released).
+ */
+int
+nandfs_free_entry(struct nandfs_mdt* mdt, struct nandfs_alloc_request *req)
+{
+	struct nandfs_block_group_desc *descriptors;
+	uint64_t bitmap_idx, bitmap_off;
+	uint64_t group;
+	uint32_t *mask, maskrw;
+
+	nandfs_calc_idx_entry(mdt, req->entrynum, &group, &bitmap_idx,
+	    &bitmap_off);
+
+	DPRINTF(ALLOC, ("nandfs_free_entry: req->entrynum=%jx bitmap_idx=%jx"
+	   " bitmap_off=%jx group=%jx\n", (uintmax_t)req->entrynum,
+	   (uintmax_t)bitmap_idx, (uintmax_t)bitmap_off, (uintmax_t)group));
+
+	/* Update counter of free entries for group */
+	descriptors = (struct nandfs_block_group_desc *) req->bp_desc->b_data;
+	descriptors[group].bg_nfrees++;
+
+	/* Clear bit to indicate that entry is free again */
+	mask = (uint32_t *)req->bp_bitmap->b_data;
+	maskrw = mask[bitmap_idx];
+	KASSERT(maskrw & (1 << bitmap_off), ("freeing unallocated vblock"));
+	maskrw &= ~(1 << bitmap_off);
+	mask[bitmap_idx] = maskrw;
+
+	/* Make descriptor, bitmap and entry buffer dirty */
+	if (nandfs_dirty_buf(req->bp_desc, 0) == 0) {
+		nandfs_dirty_buf(req->bp_bitmap, 1);
+		nandfs_dirty_buf(req->bp_entry, 1);
+	} else {
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Claim entry req->entrynum: decrement the group's free-entry counter
+ * and set the entry's bit in the bitmap (a set bit means allocated),
+ * then dirty the descriptor, bitmap and entry buffers.  Returns 0, or
+ * -1 when the descriptor buffer could not be dirtied.
+ */
+int
+nandfs_alloc_entry(struct nandfs_mdt* mdt, struct nandfs_alloc_request *req)
+{
+	struct nandfs_block_group_desc *descriptors;
+	uint64_t bitmap_idx, bitmap_off;
+	uint64_t group;
+	uint32_t *mask, maskrw;
+
+	nandfs_calc_idx_entry(mdt, req->entrynum, &group, &bitmap_idx,
+	    &bitmap_off);
+
+	DPRINTF(ALLOC, ("nandfs_alloc_entry: req->entrynum=%jx bitmap_idx=%jx"
+	    " bitmap_off=%jx group=%jx\n", (uintmax_t)req->entrynum,
+	    (uintmax_t)bitmap_idx, (uintmax_t)bitmap_off, (uintmax_t)group));
+
+	/* Update counter of free entries for group */
+	descriptors = (struct nandfs_block_group_desc *) req->bp_desc->b_data;
+	descriptors[group].bg_nfrees--;
+
+	/* Set bit to indicate that entry is now allocated */
+	mask = (uint32_t *)req->bp_bitmap->b_data;
+	maskrw = mask[bitmap_idx];
+	maskrw |= 1 << bitmap_off;
+	mask[bitmap_idx] = maskrw;
+
+	/* Make descriptor, bitmap and entry buffer dirty */
+	if (nandfs_dirty_buf(req->bp_desc, 0) == 0) {
+		nandfs_dirty_buf(req->bp_bitmap, 1);
+		nandfs_dirty_buf(req->bp_entry, 1);
+	} else {
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/* Back out of an allocation request: release every buffer held in 'req'. */
+void
+nandfs_abort_entry(struct nandfs_alloc_request *req)
+{
+
+	brelse(req->bp_desc);
+	brelse(req->bp_bitmap);
+	brelse(req->bp_entry);
+}
+
+/*
+ * Read (or, if 'create' is non-zero and the block lies beyond EOF,
+ * create) the block holding req->entrynum.  On success the held buffer
+ * is stored in req->bp_entry and *entry receives the entry's offset
+ * inside the block.  Returns 0, E2BIG when the block is beyond EOF and
+ * creation was not requested, or an I/O error.
+ */
+int
+nandfs_get_entry_block(struct nandfs_mdt *mdt, struct nandfs_node *node,
+    struct nandfs_alloc_request *req, uint32_t *entry, int create)
+{
+	struct buf *bp;
+	nandfs_lbn_t blocknr;
+	int	error;
+
+	/* Find buffer number for given entry */
+	nandfs_mdt_trans(mdt, req->entrynum, &blocknr, entry);
+	DPRINTF(ALLOC, ("%s: ino %#jx entrynum:%#jx block:%#jx entry:%x\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)req->entrynum,
+	    (uintmax_t)blocknr, *entry));
+
+	/* Read entry block or create if 'create' parameter is not zero */
+	bp = NULL;
+
+	if (blocknr < node->nn_inode.i_blocks)
+		error = nandfs_bread(node, blocknr, NOCRED, 0, &bp);
+	else if (create)
+		error = nandfs_bcreate(node, blocknr, NOCRED, 0, &bp);
+	else
+		error = E2BIG;
+
+	if (error) {
+		DPRINTF(ALLOC, ("%s: ino %#jx block %#jx entry %x error %d\n",
+		    __func__, (uintmax_t)node->nn_ino, (uintmax_t)blocknr,
+		    *entry, error));
+		if (bp)
+			brelse(bp);
+		return (error);
+	}
+
+	/* Only the DAT itself may legitimately lack a virtual block number. */
+	MPASS(nandfs_vblk_get(bp) != 0 || node->nn_ino == NANDFS_DAT_INO);
+
+	req->bp_entry = bp;
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_bmap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_bmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_bmap.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/ktr.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+#include "bmap.h"
+
+/* Thin wrapper: maximum file size supported by the bmap layer. */
+nandfs_lbn_t
+nandfs_get_maxfilesize(struct nandfs_device *fsdev)
+{
+
+	return (get_maxfilesize(fsdev));
+}
+
+/*
+ * Translate logical block 'lblk' of 'node' to its virtual block number.
+ * The GC inode is identity-mapped (lblk == vblk); everything else goes
+ * through the bmap.  Errors are logged and returned.
+ */
+int
+nandfs_bmap_lookup(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t *vblk)
+{
+	int error = 0;
+
+	if (node->nn_ino == NANDFS_GC_INO && lblk >= 0)
+		*vblk = lblk;
+	else
+		error = bmap_lookup(node, lblk, vblk);
+
+	DPRINTF(TRANSLATE, ("%s: error %d ino %#jx lblocknr %#jx -> %#jx\n",
+	    __func__, error, (uintmax_t)node->nn_ino, (uintmax_t)lblk,
+	    (uintmax_t)*vblk));
+
+	if (error)
+		nandfs_error("%s: returned %d", __func__, error);
+
+	return (error);
+}
+
+/*
+ * Bind buffer 'bp' at logical block 'lblk' of 'node' into the bmap.
+ * A fresh virtual block is allocated for every node except the DAT
+ * itself (which uses vblk 0).  On bmap failure the virtual block is
+ * returned to the allocator.
+ */
+int
+nandfs_bmap_insert_block(struct nandfs_node *node, nandfs_lbn_t lblk,
+    struct buf *bp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk;
+	int error;
+
+	fsdev = node->nn_nandfsdev;
+
+	vblk = 0;
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		error = nandfs_vblock_alloc(fsdev, &vblk);
+		if (error)
+			return (error);
+	}
+
+	nandfs_buf_set(bp, NANDFS_VBLK_ASSIGNED);
+	nandfs_vblk_set(bp, vblk);
+
+	error = bmap_insert_block(node, lblk, vblk);
+	if (error) {
+		nandfs_vblock_free(fsdev, vblk);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Dirty the bmap metadata blocks covering bp's logical block; errors are
+ * logged and propagated.
+ */
+int
+nandfs_bmap_dirty_blocks(struct nandfs_node *node, struct buf *bp, int force)
+{
+	int error;
+
+	error = bmap_dirty_meta(node, bp->b_lblkno, force);
+	if (error)
+		nandfs_error("%s: cannot dirty buffer %p\n",
+		    __func__, bp);
+
+	return (error);
+}
+
+/*
+ * Point logical block 'lblk' of 'node' at 'blknr' in the bmap
+ * (bmap_insert_block overwrites an existing mapping).
+ */
+static int
+nandfs_bmap_update_mapping(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t blknr)
+{
+	int error;
+
+	DPRINTF(BMAP,
+	    ("%s: node: %p ino: %#jx lblk: %#jx vblk: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino, (uintmax_t)lblk,
+	    (uintmax_t)blknr));
+
+	error = bmap_insert_block(node, lblk, blknr);
+
+	return (error);
+}
+
+/*
+ * Record 'blknr' as the (virtual) block backing buffer 'bp' and update
+ * the bmap mapping for bp's logical block accordingly.
+ */
+int
+nandfs_bmap_update_block(struct nandfs_node *node, struct buf *bp,
+    nandfs_lbn_t blknr)
+{
+	nandfs_lbn_t lblk;
+	int error;
+
+	lblk = bp->b_lblkno;
+	nandfs_vblk_set(bp, blknr);
+
+	DPRINTF(BMAP, ("%s: node: %p ino: %#jx bp: %p lblk: %#jx blk: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino, bp,
+	    (uintmax_t)lblk, (uintmax_t)blknr));
+
+	error = nandfs_bmap_update_mapping(node, lblk, blknr);
+	if (error) {
+		nandfs_error("%s: cannot update lblk:%jx to blk:%jx for "
+		    "node:%p, error:%d\n", __func__, (uintmax_t)lblk,
+		    (uintmax_t)blknr, node, error);
+		return (error);
+	}
+
+	return (error);
+}
+
+/*
+ * Re-home buffer 'bp' onto a newly allocated virtual block and retire
+ * 'oldblk'.  No-ops for the DAT inode and for buffers that already had a
+ * virtual block assigned this cycle (NANDFS_VBLK_ASSIGNED is consumed).
+ */
+int
+nandfs_bmap_update_dat(struct nandfs_node *node, nandfs_daddr_t oldblk,
+    struct buf *bp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk = 0;
+	int error;
+
+	if (node->nn_ino == NANDFS_DAT_INO)
+		return (0);
+
+	if (nandfs_buf_check(bp, NANDFS_VBLK_ASSIGNED)) {
+		nandfs_buf_clear(bp, NANDFS_VBLK_ASSIGNED);
+		return (0);
+	}
+
+	fsdev = node->nn_nandfsdev;
+
+	/* First alloc new virtual block.... */
+	error = nandfs_vblock_alloc(fsdev, &vblk);
+	if (error)
+		return (error);
+
+	error = nandfs_bmap_update_block(node, bp, vblk);
+	if (error)
+		return (error);
+
+	/* Then we can end up with old one */
+	nandfs_vblock_end(fsdev, oldblk);
+
+	DPRINTF(BMAP,
+	    ("%s: ino %#jx block %#jx: update vblk %#jx to %#jx\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)bp->b_lblkno,
+	    (uintmax_t)oldblk, (uintmax_t)vblk));
+	return (error);
+}
+
+/*
+ * Shrink node's bmap from old last block 'oblk' down to new last block
+ * 'nblk', removing (oblk - nblk) mappings.
+ */
+int
+nandfs_bmap_truncate_mapping(struct nandfs_node *node, nandfs_lbn_t oblk,
+    nandfs_lbn_t nblk)
+{
+	nandfs_lbn_t todo;
+	int error;
+
+	todo = oblk - nblk;
+
+	DPRINTF(BMAP, ("%s: node %p oblk %jx nblk %jx truncate by %jx\n",
+	    __func__, node, oblk, nblk, todo));
+
+	error = bmap_truncate_mapping(node, oblk, todo);
+	if (error)
+		return (error);
+
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_buffer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_buffer.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_buffer.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/buf.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate an empty buffer of 'size' bytes; never returns NULL (panics
+ * instead, see the XXX below).
+ */
+struct buf *
+nandfs_geteblk(int size, int flags)
+{
+	struct buf *bp;
+
+	/*
+	 * XXX
+	 * Right now we can call geteblk with GB_NOWAIT_BD flag, which means
+	 * it can return NULL. But we cannot afford to get NULL, hence this panic.
+	 */
+	bp = geteblk(size, flags);
+	if (bp == NULL)
+		panic("geteblk returned NULL");
+
+	return (bp);
+}
+
+/* Bump the device-wide dirty-buffer count under nd_mutex. */
+void
+nandfs_dirty_bufs_increment(struct nandfs_device *fsdev)
+{
+
+	mtx_lock(&fsdev->nd_mutex);
+	KASSERT(fsdev->nd_dirty_bufs >= 0, ("negative nd_dirty_bufs"));
+	fsdev->nd_dirty_bufs++;
+	mtx_unlock(&fsdev->nd_mutex);
+}
+
+/* Drop the device-wide dirty-buffer count under nd_mutex. */
+void
+nandfs_dirty_bufs_decrement(struct nandfs_device *fsdev)
+{
+
+	mtx_lock(&fsdev->nd_mutex);
+	KASSERT(fsdev->nd_dirty_bufs > 0,
+	    ("decrementing not-positive nd_dirty_bufs"));
+	fsdev->nd_dirty_bufs--;
+	mtx_unlock(&fsdev->nd_mutex);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_cleaner.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_cleaner.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,620 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_cleaner.c 236188 2012-05-28 16:33:58Z marcel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/buf.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+#define	NANDFS_CLEANER_KILL	1
+
+static void nandfs_cleaner(struct nandfs_device *);
+static int nandfs_cleaner_clean_segments(struct nandfs_device *,
+    struct nandfs_vinfo *, uint32_t, struct nandfs_period *, uint32_t,
+    struct nandfs_bdesc *, uint32_t, uint64_t *, uint32_t);
+
+static int
+nandfs_process_bdesc(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint64_t nmembs);
+
+/*
+ * Kick the cleaner thread and block until it signals nd_clean_cv.  With
+ * reason == NANDFS_CLEANER_KILL the cleaner is asked to exit instead of
+ * run another pass.
+ */
+static void
+nandfs_wakeup_wait_cleaner(struct nandfs_device *fsdev, int reason)
+{
+
+	mtx_lock(&fsdev->nd_clean_mtx);
+	if (reason == NANDFS_CLEANER_KILL)
+		fsdev->nd_cleaner_exit = 1;
+	if (fsdev->nd_cleaning == 0) {
+		fsdev->nd_cleaning = 1;
+		wakeup(&fsdev->nd_cleaning);
+	}
+	cv_wait(&fsdev->nd_clean_cv, &fsdev->nd_clean_mtx);
+	mtx_unlock(&fsdev->nd_clean_mtx);
+}
+
+/* Spawn the per-device cleaner kthread.  Returns kthread_add()'s error. */
+int
+nandfs_start_cleaner(struct nandfs_device *fsdev)
+{
+	int error;
+
+	MPASS(fsdev->nd_cleaner == NULL);
+
+	fsdev->nd_cleaner_exit = 0;
+
+	error = kthread_add((void(*)(void *))nandfs_cleaner, fsdev, NULL,
+	    &fsdev->nd_cleaner, 0, 0, "nandfs_cleaner");
+	if (error)
+		printf("nandfs: could not start cleaner: %d\n", error);
+
+	return (error);
+}
+
+/*
+ * Ask the cleaner kthread to exit and wait for its acknowledgement
+ * (via nandfs_wakeup_wait_cleaner), then forget the thread pointer.
+ */
+int
+nandfs_stop_cleaner(struct nandfs_device *fsdev)
+{
+
+	MPASS(fsdev->nd_cleaner != NULL);
+	nandfs_wakeup_wait_cleaner(fsdev, NANDFS_CLEANER_KILL);
+	fsdev->nd_cleaner = NULL;
+
+	DPRINTF(CLEAN, ("cleaner stopped\n"));
+	return (0);
+}
+
+/*
+ * End-of-pass bookkeeping for the cleaner thread: sleep until the next
+ * interval (or an explicit wakeup), signal any waiters, and report
+ * whether the thread has been told to exit (non-zero == exit).
+ */
+static int
+nandfs_cleaner_finished(struct nandfs_device *fsdev)
+{
+	int exit;
+
+	mtx_lock(&fsdev->nd_clean_mtx);
+	fsdev->nd_cleaning = 0;
+	if (!fsdev->nd_cleaner_exit) {
+		DPRINTF(CLEAN, ("%s: sleep\n", __func__));
+		msleep(&fsdev->nd_cleaning, &fsdev->nd_clean_mtx, PRIBIO, "-",
+		    hz * nandfs_cleaner_interval);
+	}
+	exit = fsdev->nd_cleaner_exit;
+	cv_broadcast(&fsdev->nd_clean_cv);
+	mtx_unlock(&fsdev->nd_clean_mtx);
+	if (exit) {
+		DPRINTF(CLEAN, ("%s: no longer active\n", __func__));
+		return (1);
+	}
+
+	return (0);
+}
+
+/* Debug dump of segment-usage records (number, mtime, a/d/e flags, blocks). */
+static void
+print_suinfo(struct nandfs_suinfo *suinfo, int nsegs)
+{
+	int i;
+
+	for (i = 0; i < nsegs; i++) {
+		DPRINTF(CLEAN, ("%jx  %jd  %c%c%c  %10u\n",
+		    suinfo[i].nsi_num, suinfo[i].nsi_lastmod,
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_ACTIVE) ? 'a' : '-'),
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_DIRTY) ? 'd' : '-'),
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_ERROR) ? 'e' : '-'),
+		    suinfo[i].nsi_blocks));
+	}
+}
+
+/*
+ * Decide whether a virtual block is still live: it is alive if its
+ * lifetime reaches the current checkpoint, or if a snapshot checkpoint
+ * (binary-searched in the sorted 'cp' array) falls inside its
+ * [nvi_start, nvi_end] lifetime.  Returns non-zero when alive.
+ */
+static int
+nandfs_cleaner_vblock_is_alive(struct nandfs_device *fsdev,
+    struct nandfs_vinfo *vinfo, struct nandfs_cpinfo *cp, uint32_t ncps)
+{
+	int64_t idx, min, max;
+
+	if (vinfo->nvi_end >= fsdev->nd_last_cno)
+		return (1);
+
+	if (ncps == 0)
+		return (0);
+
+	if (vinfo->nvi_end < cp[0].nci_cno ||
+	    vinfo->nvi_start > cp[ncps - 1].nci_cno)
+		return (0);
+
+	idx = min = 0;
+	max = ncps - 1;
+	while (min <= max) {
+		idx = (min + max) / 2;
+		if (vinfo->nvi_start == cp[idx].nci_cno)
+			return (1);
+		if (vinfo->nvi_start < cp[idx].nci_cno)
+			max = idx - 1;
+		else
+			min = idx + 1;
+	}
+
+	/* Not an exact start match; alive iff the span covers cp[idx]. */
+	return (vinfo->nvi_end >= cp[idx].nci_cno);
+}
+
+/* Set nvi_alive on each vinfo record per nandfs_cleaner_vblock_is_alive(). */
+static void
+nandfs_cleaner_vinfo_mark_alive(struct nandfs_device *fsdev,
+    struct nandfs_vinfo *vinfo, uint32_t nmembs, struct nandfs_cpinfo *cp,
+    uint32_t ncps)
+{
+	uint32_t i;
+
+	for (i = 0; i < nmembs; i++)
+		vinfo[i].nvi_alive =
+		    nandfs_cleaner_vblock_is_alive(fsdev, &vinfo[i], cp, ncps);
+}
+
+/*
+ * A DAT block descriptor is alive when its original block number still
+ * matches its current one (i.e. it has not been relocated).
+ */
+static int
+nandfs_cleaner_bdesc_is_alive(struct nandfs_device *fsdev,
+    struct nandfs_bdesc *bdesc)
+{
+	int alive;
+
+	alive = bdesc->bd_oblocknr == bdesc->bd_blocknr;
+	if (!alive)
+		MPASS(abs(bdesc->bd_oblocknr - bdesc->bd_blocknr) > 2);
+
+	return (alive);
+}
+
+/* Set bd_alive on each bdesc record per nandfs_cleaner_bdesc_is_alive(). */
+static void
+nandfs_cleaner_bdesc_mark_alive(struct nandfs_device *fsdev,
+    struct nandfs_bdesc *bdesc, uint32_t nmembs)
+{
+	uint32_t i;
+
+	for (i = 0; i < nmembs; i++)
+		bdesc[i].bd_alive = nandfs_cleaner_bdesc_is_alive(fsdev,
+		    &bdesc[i]);
+}
+
+/*
+ * Walk the binfo entries of one partial segment starting at 'blk' and
+ * append each one to the appropriate output array: DAT-owned blocks go
+ * into *bdpp (with their on-media block number reconstructed), all other
+ * blocks into *vipp.  Both cursors are advanced in place.
+ */
+static void
+nandfs_cleaner_iterate_psegment(struct nandfs_device *fsdev,
+    struct nandfs_segment_summary *segsum, union nandfs_binfo *binfo,
+    nandfs_daddr_t blk, struct nandfs_vinfo **vipp, struct nandfs_bdesc **bdpp)
+{
+	int i;
+
+	DPRINTF(CLEAN, ("%s nbinfos %x\n", __func__, segsum->ss_nbinfos));
+	for (i = 0; i < segsum->ss_nbinfos; i++) {
+		if (binfo[i].bi_v.bi_ino == NANDFS_DAT_INO) {
+			(*bdpp)->bd_oblocknr = blk + segsum->ss_nblocks -
+			    segsum->ss_nbinfos + i;
+			/*
+			 * XXX Hack
+			 */
+			if (segsum->ss_flags & NANDFS_SS_SR)
+				(*bdpp)->bd_oblocknr--;
+			(*bdpp)->bd_level = binfo[i].bi_dat.bi_level;
+			(*bdpp)->bd_offset = binfo[i].bi_dat.bi_blkoff;
+			(*bdpp)++;
+		} else {
+			(*vipp)->nvi_ino = binfo[i].bi_v.bi_ino;
+			(*vipp)->nvi_vblocknr = binfo[i].bi_v.bi_vblocknr;
+			(*vipp)++;
+		}
+	}
+}
+
+/*
+ * Walk all partial segments of 'segno', collecting their binfo entries
+ * into *vipp / *bdpp via nandfs_cleaner_iterate_psegment().  *select is
+ * set non-zero only when the whole segment was read successfully.
+ */
+static int
+nandfs_cleaner_iterate_segment(struct nandfs_device *fsdev, uint64_t segno,
+    struct nandfs_vinfo **vipp, struct nandfs_bdesc **bdpp, int *select)
+{
+	struct nandfs_segment_summary *segsum;
+	union nandfs_binfo *binfo;
+	struct buf *bp;
+	uint32_t nblocks;
+	nandfs_daddr_t curr, start, end;
+	int error = 0;
+
+	nandfs_get_segment_range(fsdev, segno, &start, &end);
+
+	DPRINTF(CLEAN, ("%s: segno %jx start %jx end %jx\n", __func__, segno,
+	    start, end));
+
+	*select = 0;
+
+	for (curr = start; curr < end; curr += nblocks) {
+		error = nandfs_dev_bread(fsdev, curr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			nandfs_error("%s: couldn't load segment summary of %jx: %d\n",
+			    __func__, segno, error);
+			return (error);
+		}
+
+		segsum = (struct nandfs_segment_summary *)bp->b_data;
+		binfo = (union nandfs_binfo *)(bp->b_data + segsum->ss_bytes);
+
+		if (!nandfs_segsum_valid(segsum)) {
+			brelse(bp);
+			nandfs_error("nandfs: invalid summary of segment %jx\n", segno);
+			/*
+			 * NOTE(review): 'error' is still 0 here, so this
+			 * returns success with *select == 0 (segment is
+			 * skipped, not failed) -- confirm whether EINVAL
+			 * was intended instead.
+			 */
+			return (error);
+		}
+
+		DPRINTF(CLEAN, ("%s: %jx magic %x bytes %x nblocks %x nbinfos "
+		    "%x\n", __func__, segno, segsum->ss_magic, segsum->ss_bytes,
+		    segsum->ss_nblocks, segsum->ss_nbinfos));
+
+		nandfs_cleaner_iterate_psegment(fsdev, segsum, binfo, curr,
+		    vipp, bdpp);
+		nblocks = segsum->ss_nblocks;
+		brelse(bp);
+	}
+
+	if (error == 0)
+		*select = 1;
+
+	return (error);
+}
+
+/*
+ * Pick up to 'nsegs' dirty, non-active, non-error, non-GC segments
+ * starting the scan at *rseg (wrapping to 0 once if nothing is found).
+ * Selected segment numbers are appended through *segpp and *rseg is
+ * advanced past the last one chosen.
+ */
+static int
+nandfs_cleaner_choose_segment(struct nandfs_device *fsdev, uint64_t **segpp,
+    uint64_t nsegs, uint64_t *rseg)
+{
+	struct nandfs_suinfo *suinfo;
+	uint64_t i, ssegs;
+	int error;
+
+	suinfo = malloc(sizeof(*suinfo) * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+
+	if (*rseg >= fsdev->nd_fsdata.f_nsegments)
+		*rseg = 0;
+
+retry:
+	error = nandfs_get_segment_info_filter(fsdev, suinfo, nsegs, *rseg,
+	    &ssegs, NANDFS_SEGMENT_USAGE_DIRTY,
+	    NANDFS_SEGMENT_USAGE_ACTIVE | NANDFS_SEGMENT_USAGE_ERROR |
+	    NANDFS_SEGMENT_USAGE_GC);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+	if (ssegs == 0 && *rseg != 0) {
+		/* Nothing past *rseg; wrap to the start once. */
+		*rseg = 0;
+		goto retry;
+	}
+	if (ssegs > 0) {
+		print_suinfo(suinfo, ssegs);
+
+		for (i = 0; i < ssegs; i++) {
+			(**segpp) = suinfo[i].nsi_num;
+			(*segpp)++;
+		}
+		*rseg = suinfo[i - 1].nsi_num + 1;
+	}
+
+out:
+	free(suinfo, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * One cleaner pass: choose candidate segments, gather the virtual-block
+ * and DAT-block references they contain, mark which of those are still
+ * alive (consulting the snapshot checkpoints), and hand the results to
+ * nandfs_cleaner_clean_segments() under the device write lock.  '*rseg'
+ * carries the scan position between passes.
+ */
+static int
+nandfs_cleaner_body(struct nandfs_device *fsdev, uint64_t *rseg)
+{
+	struct nandfs_vinfo *vinfo, *vip, *vipi;
+	struct nandfs_bdesc *bdesc, *bdp, *bdpi;
+	struct nandfs_cpstat cpstat;
+	struct nandfs_cpinfo *cpinfo = NULL;
+	uint64_t *segnums, *segp;
+	int select, selected;
+	int error = 0;
+	int nsegs;
+	int i;
+
+	nsegs = nandfs_cleaner_segments;
+
+	/* Worst case: every block of every chosen segment has a binfo. */
+	vip = vinfo = malloc(sizeof(*vinfo) *
+	    fsdev->nd_fsdata.f_blocks_per_segment * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+	bdp = bdesc = malloc(sizeof(*bdesc) *
+	    fsdev->nd_fsdata.f_blocks_per_segment * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+	segp = segnums = malloc(sizeof(*segnums) * nsegs, M_NANDFSTEMP,
+	    M_WAITOK);
+
+	error = nandfs_cleaner_choose_segment(fsdev, &segp, nsegs, rseg);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+
+	/* No candidates this pass. */
+	if (segnums == segp)
+		goto out;
+
+	selected = 0;
+	for (i = 0; i < segp - segnums; i++) {
+		error = nandfs_cleaner_iterate_segment(fsdev, segnums[i], &vip,
+		    &bdp, &select);
+		if (error) {
+			/*
+			 * XXX deselect (see below)?
+			 */
+			goto out;
+		}
+		if (!select)
+			segnums[i] = NANDFS_NOSEGMENT;
+		else {
+			error = nandfs_markgc_segment(fsdev, segnums[i]);
+			if (error) {
+				nandfs_error("%s:%d\n", __FILE__, __LINE__);
+				goto out;
+			}
+			selected++;
+		}
+	}
+
+	if (selected == 0) {
+		MPASS(vinfo == vip);
+		MPASS(bdesc == bdp);
+		goto out;
+	}
+
+	error = nandfs_get_cpstat(fsdev->nd_cp_node, &cpstat);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out;
+	}
+
+	if (cpstat.ncp_nss != 0) {
+		cpinfo = malloc(sizeof(struct nandfs_cpinfo) * cpstat.ncp_nss,
+		    M_NANDFSTEMP, M_WAITOK);
+		error = nandfs_get_cpinfo(fsdev->nd_cp_node, 1, NANDFS_SNAPSHOT,
+		    cpinfo, cpstat.ncp_nss, NULL);
+		if (error) {
+			nandfs_error("%s:%d\n", __FILE__, __LINE__);
+			/*
+			 * Fix: the write lock is not held yet; jumping to
+			 * 'out_locked' here released an unheld lock.
+			 */
+			goto out;
+		}
+	}
+
+	NANDFS_WRITELOCK(fsdev);
+	DPRINTF(CLEAN, ("%s: got lock\n", __func__));
+
+	error = nandfs_get_dat_vinfo(fsdev, vinfo, vip - vinfo);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out_locked;
+	}
+
+	nandfs_cleaner_vinfo_mark_alive(fsdev, vinfo, vip - vinfo, cpinfo,
+	    cpstat.ncp_nss);
+
+	error = nandfs_get_dat_bdescs(fsdev, bdesc, bdp - bdesc);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out_locked;
+	}
+
+	nandfs_cleaner_bdesc_mark_alive(fsdev, bdesc, bdp - bdesc);
+
+	DPRINTF(CLEAN, ("got:\n"));
+	for (vipi = vinfo; vipi < vip; vipi++) {
+		DPRINTF(CLEAN, ("v ino %jx vblocknr %jx start %jx end %jx "
+		    "alive %d\n", vipi->nvi_ino, vipi->nvi_vblocknr,
+		    vipi->nvi_start, vipi->nvi_end, vipi->nvi_alive));
+	}
+	for (bdpi = bdesc; bdpi < bdp; bdpi++) {
+		DPRINTF(CLEAN, ("b oblocknr %jx blocknr %jx offset %jx "
+		    "alive %d\n", bdpi->bd_oblocknr, bdpi->bd_blocknr,
+		    bdpi->bd_offset, bdpi->bd_alive));
+	}
+	DPRINTF(CLEAN, ("end list\n"));
+
+	error = nandfs_cleaner_clean_segments(fsdev, vinfo, vip - vinfo, NULL,
+	    0, bdesc, bdp - bdesc, segnums, segp - segnums);
+	if (error)
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+
+out_locked:
+	NANDFS_WRITEUNLOCK(fsdev);
+out:
+	free(cpinfo, M_NANDFSTEMP);
+	free(segnums, M_NANDFSTEMP);
+	free(bdesc, M_NANDFSTEMP);
+	free(vinfo, M_NANDFSTEMP);
+
+	return (error);
+}
+
+/*
+ * Main loop of the garbage-collector kthread.  Runs cleaner passes until
+ * nandfs_cleaner_finished() reports shutdown, then exits the thread.
+ * 'checked_seg' carries the segment-scan position between passes.
+ */
+static void
+nandfs_cleaner(struct nandfs_device *fsdev)
+{
+	uint64_t checked_seg = 0;
+	int error;
+
+	while (!nandfs_cleaner_finished(fsdev)) {
+		/*
+		 * Skip the pass when cleaning is administratively disabled
+		 * or the system is shutting down.  NOTE(review): this relies
+		 * on nandfs_cleaner_finished() sleeping/yielding between
+		 * iterations, otherwise the loop would busy-spin -- confirm.
+		 */
+		if (!nandfs_cleaner_enable || rebooting)
+			continue;
+
+		DPRINTF(CLEAN, ("%s: run started\n", __func__));
+
+		fsdev->nd_cleaning = 1;
+
+		error = nandfs_cleaner_body(fsdev, &checked_seg);
+
+		DPRINTF(CLEAN, ("%s: run finished error %d\n", __func__,
+		    error));
+	}
+
+	DPRINTF(CLEAN, ("%s: exiting\n", __func__));
+	kthread_exit();
+}
+
+/*
+ * Final phase of a cleaner pass: route still-live blocks through the GC
+ * node, drop dead checkpoints and virtual blocks, and queue the reclaimed
+ * segments for reuse.
+ *
+ *   vinfo/nvinfo     - virtual block info, nvi_alive already computed
+ *   pd/npd           - checkpoint ranges to delete (may be NULL/0)
+ *   bdesc/nbdesc     - DAT-internal block descriptors
+ *   segments/nsegs   - segment numbers being reclaimed
+ */
+static int
+nandfs_cleaner_clean_segments(struct nandfs_device *nffsdev,
+    struct nandfs_vinfo *vinfo, uint32_t nvinfo,
+    struct nandfs_period *pd, uint32_t npd,
+    struct nandfs_bdesc *bdesc, uint32_t nbdesc,
+    uint64_t *segments, uint32_t nsegs)
+{
+	struct nandfs_node *gc;
+	struct buf *bp;
+	uint32_t i;
+	int error = 0;
+
+	gc = nffsdev->nd_gc_node;
+
+	DPRINTF(CLEAN, ("%s: enter\n", __func__));
+
+	/*
+	 * Read every live virtual block through the GC node, tag the buffer
+	 * with its vblock number and dirty it -- presumably so the segment
+	 * constructor relocates it into a fresh segment (confirm).
+	 */
+	VOP_LOCK(NTOV(gc), LK_EXCLUSIVE);
+	for (i = 0; i < nvinfo; i++) {
+		if (!vinfo[i].nvi_alive)
+			continue;
+		DPRINTF(CLEAN, ("%s: read vblknr:%#jx blk:%#jx\n",
+		    __func__, (uintmax_t)vinfo[i].nvi_vblocknr,
+		    (uintmax_t)vinfo[i].nvi_blocknr));
+		error = nandfs_bread(nffsdev->nd_gc_node, vinfo[i].nvi_blocknr,
+		    NULL, 0, &bp);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			VOP_UNLOCK(NTOV(gc), 0);
+			goto out;
+		}
+		nandfs_vblk_set(bp, vinfo[i].nvi_vblocknr);
+		nandfs_buf_set(bp, NANDFS_VBLK_ASSIGNED);
+		nandfs_dirty_buf(bp, 1);
+	}
+	VOP_UNLOCK(NTOV(gc), 0);
+
+	/* Delete checkpoints */
+	for (i = 0; i < npd; i++) {
+		DPRINTF(CLEAN, ("delete checkpoint: %jx\n",
+		    (uintmax_t)pd[i].p_start));
+		error = nandfs_delete_cp(nffsdev->nd_cp_node, pd[i].p_start,
+		    pd[i].p_end);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			goto out;
+		}
+	}
+
+	/* Update vblocks */
+	for (i = 0; i < nvinfo; i++) {
+		if (vinfo[i].nvi_alive)
+			continue;
+		DPRINTF(CLEAN, ("freeing vblknr: %jx\n", vinfo[i].nvi_vblocknr));
+		error = nandfs_vblock_free(nffsdev, vinfo[i].nvi_vblocknr);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			goto out;
+		}
+	}
+
+	error = nandfs_process_bdesc(nffsdev, bdesc, nbdesc);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+
+	/* Add segments to clean */
+	if (nffsdev->nd_free_count) {
+		/* M_WAITOK allocations cannot fail, so overwriting the
+		 * pointer with realloc()'s result is safe here. */
+		nffsdev->nd_free_base = realloc(nffsdev->nd_free_base,
+		    (nffsdev->nd_free_count + nsegs) * sizeof(uint64_t),
+		    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+		memcpy(&nffsdev->nd_free_base[nffsdev->nd_free_count], segments,
+		    nsegs * sizeof(uint64_t));
+		nffsdev->nd_free_count += nsegs;
+	} else {
+		nffsdev->nd_free_base = malloc(nsegs * sizeof(uint64_t),
+		    M_NANDFSTEMP, M_WAITOK|M_ZERO);
+		memcpy(nffsdev->nd_free_base, segments,
+		    nsegs * sizeof(uint64_t));
+		nffsdev->nd_free_count = nsegs;
+	}
+
+out:
+
+	DPRINTF(CLEAN, ("%s: exit error %d\n", __func__, error));
+
+	return (error);
+}
+
+/*
+ * Dirty the DAT-file blocks listed in 'bd' so the next segment write
+ * relocates them: indirect blocks (bd_level != 0) go through the meta-data
+ * buffer path, leaf blocks through the regular one.  Only entries with
+ * bd_alive set are processed.  Returns 0 or the read error.
+ */
+static int
+nandfs_process_bdesc(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint64_t nmembs)
+{
+	struct nandfs_node *dat_node;
+	struct buf *bp;
+	uint64_t i;
+	int error;
+
+	dat_node = nffsdev->nd_dat_node;
+
+	VOP_LOCK(NTOV(dat_node), LK_EXCLUSIVE);
+
+	for (i = 0; i < nmembs; i++) {
+		if (!bd[i].bd_alive)
+			continue;
+		DPRINTF(CLEAN, ("%s: idx %jx offset %jx\n",
+		    __func__, i, bd[i].bd_offset));
+		if (bd[i].bd_level) {
+			error = nandfs_bread_meta(dat_node, bd[i].bd_offset,
+			    NULL, 0, &bp);
+			if (error) {
+				nandfs_error("%s: cannot read dat node "
+				    "level:%d\n", __func__, bd[i].bd_level);
+				/* NOTE(review): assumes nandfs_bread_meta
+				 * returns a buffer even on error -- confirm */
+				brelse(bp);
+				VOP_UNLOCK(NTOV(dat_node), 0);
+				return (error);
+			}
+			nandfs_dirty_buf_meta(bp, 1);
+			nandfs_bmap_dirty_blocks(VTON(bp->b_vp), bp, 1);
+		} else {
+			error = nandfs_bread(dat_node, bd[i].bd_offset, NULL,
+			    0, &bp);
+			if (error) {
+				nandfs_error("%s: cannot read dat node\n",
+				    __func__);
+				brelse(bp);
+				VOP_UNLOCK(NTOV(dat_node), 0);
+				return (error);
+			}
+			nandfs_dirty_buf(bp, 1);
+		}
+		DPRINTF(CLEAN, ("%s: bp: %p\n", __func__, bp));
+	}
+
+	VOP_UNLOCK(NTOV(dat_node), 0);
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_cpfile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_cpfile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,776 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_cpfile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+
+/* Size in bytes of one on-media checkpoint entry for this device. */
+static int
+nandfs_checkpoint_size(struct nandfs_device *fsdev)
+{
+	int entry_size = fsdev->nd_fsdata.f_checkpoint_size;
+
+	return (entry_size);
+}
+
+/*
+ * Translate checkpoint number 'cn' (1-based) into the cpfile logical block
+ * number '*blk' and the byte offset '*offset' of its checkpoint entry
+ * within that block.  Always returns 0.
+ */
+static int
+nandfs_checkpoint_blk_offset(struct nandfs_device *fsdev, uint64_t cn,
+    uint64_t *blk, uint64_t *offset)
+{
+	uint64_t off;
+	uint16_t cp_size, cp_per_blk;
+
+	/* Typo fix in the panic message: "checkpoing" -> "checkpoint". */
+	KASSERT((cn), ("checkpoint cannot be zero"));
+
+	cp_size = fsdev->nd_fsdata.f_checkpoint_size;
+	cp_per_blk = fsdev->nd_blocksize / cp_size;
+	/* Entries start after the cpfile header, rounded up to entry size. */
+	off = roundup(sizeof(struct nandfs_cpfile_header), cp_size) / cp_size;
+	off += (cn - 1);
+
+	*blk = off / cp_per_blk;
+	*offset = (off % cp_per_blk) * cp_size;
+
+	return (0);
+}
+
+/*
+ * Number of whole checkpoint entries that fit in the cpfile block at and
+ * after byte 'offset'.  'cn' and 'blk' are unused; kept for signature
+ * symmetry with nandfs_checkpoint_blk_offset().
+ */
+static int
+nandfs_checkpoint_blk_remaining(struct nandfs_device *fsdev, uint64_t cn,
+    uint64_t blk, uint64_t offset)
+{
+	uint16_t entry_size, left;
+
+	entry_size = fsdev->nd_fsdata.f_checkpoint_size;
+	left = (fsdev->nd_blocksize - offset) / entry_size;
+
+	return (left);
+}
+
+/*
+ * Prepare the cpfile for checkpoint 'cn': ensure the header block and the
+ * block that will hold the checkpoint entry exist and are marked dirty.
+ * Only the current checkpoint (nd_last_cno) or its direct successor is
+ * accepted.  Returns 0 on success, -1 on any failure (module convention).
+ */
+int
+nandfs_get_checkpoint(struct nandfs_device *fsdev, struct nandfs_node *cp_node,
+    uint64_t cn)
+{
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	if (cn != fsdev->nd_last_cno && cn != (fsdev->nd_last_cno + 1)) {
+		return (-1);
+	}
+
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (-1);
+	}
+
+	/* NOTE(review): no brelse on this path -- presumably
+	 * nandfs_dirty_buf() disposes of bp on failure; confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (-1);
+
+
+	nandfs_checkpoint_blk_offset(fsdev, cn, &blk, &offset);
+
+	if (blk != 0) {
+		/* Entry lives outside the header block: read it if it is
+		 * already allocated, otherwise create it. */
+		if (blk < cp_node->nn_inode.i_blocks)
+			error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			if (bp)
+				brelse(bp);
+			return (-1);
+		}
+
+		nandfs_dirty_buf(bp, 1);
+	}
+
+	DPRINTF(CPFILE, ("%s: cn:%#jx entry block:%#jx offset:%#jx\n",
+	    __func__, (uintmax_t)cn, (uintmax_t)blk, (uintmax_t)offset));
+
+	return (0);
+}
+
+/*
+ * Write the checkpoint entry for 'cn' into the cpfile and bump the header
+ * checkpoint count.  Only the current checkpoint (nd_last_cno) or its
+ * direct successor may be set.  Returns 0, -1 for an invalid 'cn', or the
+ * read error.
+ */
+int
+nandfs_set_checkpoint(struct nandfs_device *fsdev, struct nandfs_node *cp_node,
+    uint64_t cn, struct nandfs_inode *ifile_inode, uint64_t nblocks)
+{
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	if (cn != fsdev->nd_last_cno && cn != (fsdev->nd_last_cno + 1)) {
+		/* Typo fix: "chekpoint" -> "checkpoint". */
+		nandfs_error("%s: trying to set invalid checkpoint %jx - %jx\n",
+		    __func__, cn, fsdev->nd_last_cno);
+		return (-1);
+	}
+
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	cnh->ch_ncheckpoints++;
+	/* NOTE(review): the header change is not marked dirty here --
+	 * presumably the segment constructor flushes cpfile buffers;
+	 * confirm the increment cannot be lost. */
+
+	nandfs_checkpoint_blk_offset(fsdev, cn, &blk, &offset);
+
+	if (blk != 0) {
+		/* Entry lives in a different block; swap buffers. */
+		brelse(bp);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+	}
+
+	cnp = (struct nandfs_checkpoint *)((uint8_t *)bp->b_data + offset);
+	cnp->cp_flags = 0;
+	cnp->cp_checkpoints_count = 1;
+	memset(&cnp->cp_snapshot_list, 0, sizeof(struct nandfs_snapshot_list));
+	cnp->cp_cno = cn;
+	cnp->cp_create = fsdev->nd_ts.tv_sec;
+	cnp->cp_nblk_inc = nblocks;
+	cnp->cp_blocks_count = 0;
+	memcpy(&cnp->cp_ifile_inode, ifile_inode, sizeof(cnp->cp_ifile_inode));
+
+	DPRINTF(CPFILE, ("%s: cn:%#jx ctime:%#jx nblk:%#jx\n",
+	    __func__, (uintmax_t)cn, (uintmax_t)cnp->cp_create,
+	    (uintmax_t)nblocks));
+
+	brelse(bp);
+	return (0);
+}
+
+/* Report whether any mount of this device already uses checkpoint 'cno'. */
+static int
+nandfs_cp_mounted(struct nandfs_device *nandfsdev, uint64_t cno)
+{
+	struct nandfsmount *mp;
+
+	mtx_lock(&nandfsdev->nd_mutex);
+	/* No double-mounting of the same checkpoint */
+	STAILQ_FOREACH(mp, &nandfsdev->nd_mounts, nm_next_mount) {
+		if (mp->nm_mount_args.cpno == cno) {
+			mtx_unlock(&nandfsdev->nd_mutex);
+			return (1);
+		}
+	}
+	mtx_unlock(&nandfsdev->nd_mutex);
+
+	return (0);
+}
+
+/*
+ * Insert checkpoint 'cno' into the on-media snapshot list and set its
+ * snapshot flag.  The list is doubly linked, sorted by checkpoint number,
+ * threaded through the cpfile entries with its head in the cpfile header.
+ * Returns 0, ENOENT (invalid checkpoint), EINVAL (already a snapshot) or
+ * an I/O error.
+ */
+static int
+nandfs_cp_set_snapshot(struct nandfs_node *cp_node, uint64_t cno)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_snapshot_list *list;
+	struct buf *bp;
+	uint64_t blk, prev_blk, offset;
+	uint64_t curr, prev;
+	int error;
+
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get snapshot data */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	if (cnp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		brelse(bp);
+		return (ENOENT);
+	}
+	if ((cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT)) {
+		brelse(bp);
+		return (EINVAL);
+	}
+
+	brelse(bp);
+	/* Get list from header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	list = &cnh->ch_snapshot_list;
+	prev = list->ssl_prev;
+	brelse(bp);
+	prev_blk = ~(0);
+	curr = 0;
+	/*
+	 * Walk backwards from the newest snapshot until the predecessor
+	 * (first element with number <= cno) is found; 'curr' tracks its
+	 * successor, 0 meaning the list head in the header.
+	 */
+	while (prev > cno) {
+		curr = prev;
+		nandfs_checkpoint_blk_offset(fsdev, prev, &prev_blk, &offset);
+		error = nandfs_bread(cp_node, prev_blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+		prev = list->ssl_prev;
+		brelse(bp);
+	}
+
+	/* Re-read the successor element to patch its back pointer. */
+	if (curr == 0) {
+		/* NOTE(review): bread return value unchecked here -- confirm
+		 * the header block read cannot fail at this point. */
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	} else {
+		nandfs_checkpoint_blk_offset(fsdev, curr, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	}
+
+	list->ssl_prev = cno;
+	/* NOTE(review): on failure bp is not brelse'd here -- presumably
+	 * nandfs_dirty_buf() disposes of it; confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+
+	/* Update snapshot for cno */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	list = &cnp->cp_snapshot_list;
+	list->ssl_prev = prev;
+	list->ssl_next = curr;
+	cnp->cp_flags |= NANDFS_CHECKPOINT_SNAPSHOT;
+	nandfs_dirty_buf(bp, 1);
+
+	if (prev == 0) {
+		/* NOTE(review): bread return value unchecked -- see above. */
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	} else {
+		/* Update snapshot list for prev */
+		nandfs_checkpoint_blk_offset(fsdev, prev, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	}
+	list->ssl_next = cno;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	cnh->ch_nsnapshots++;
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Remove checkpoint 'cno' from the on-media snapshot list and clear its
+ * snapshot flag.  The list is doubly linked, threaded through the cpfile
+ * entries with its head in the cpfile header.  Returns 0, ENOENT (invalid
+ * checkpoint), EINVAL (not a snapshot) or an I/O error.
+ */
+static int
+nandfs_cp_clr_snapshot(struct nandfs_node *cp_node, uint64_t cno)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_snapshot_list *list;
+	struct buf *bp;
+	uint64_t blk, offset, snapshot_cnt;
+	uint64_t next, prev;
+	int error;
+
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get snapshot data */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	if (cnp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		brelse(bp);
+		return (ENOENT);
+	}
+	if (!(cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT)) {
+		brelse(bp);
+		return (EINVAL);
+	}
+
+	list = &cnp->cp_snapshot_list;
+	next = list->ssl_next;
+	prev = list->ssl_prev;
+	brelse(bp);
+
+	/* Get previous snapshot (prev == 0 means the head in the header) */
+	if (prev != 0) {
+		nandfs_checkpoint_blk_offset(fsdev, prev, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	} else {
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	}
+
+	/* Unlink 'cno': fix the forward pointer of the previous element. */
+	list->ssl_next = next;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	/* Get next snapshot */
+	if (next != 0) {
+		nandfs_checkpoint_blk_offset(fsdev, next, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	} else {
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	}
+	/* ... and the back pointer of the next element. */
+	list->ssl_prev = prev;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update snapshot list for cno */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	list = &cnp->cp_snapshot_list;
+	list->ssl_prev = 0;
+	list->ssl_next = 0;
+	/*
+	 * Clear only the snapshot bit.  The original used logical negation
+	 * ('&= !FLAG'), i.e. '&= 0', which wiped every flag in cp_flags
+	 * including NANDFS_CHECKPOINT_INVALID.
+	 */
+	cnp->cp_flags &= ~NANDFS_CHECKPOINT_SNAPSHOT;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	snapshot_cnt = cnh->ch_nsnapshots;
+	snapshot_cnt--;
+	cnh->ch_nsnapshots = snapshot_cnt;
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Switch checkpoint 'ncpm->ncpm_cno' between plain-checkpoint and snapshot
+ * mode, under the cpfile vnode lock.
+ */
+int
+nandfs_chng_cpmode(struct nandfs_node *node, struct nandfs_cpmode *ncpm)
+{
+	uint64_t cno;
+	int error;
+
+	cno = ncpm->ncpm_cno;
+
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	switch (ncpm->ncpm_mode) {
+	case NANDFS_CHECKPOINT:
+		/* Refuse to demote a snapshot that is mounted somewhere. */
+		if (nandfs_cp_mounted(node->nn_nandfsdev, cno))
+			error = EBUSY;
+		else
+			error = nandfs_cp_clr_snapshot(node, cno);
+		break;
+	case NANDFS_SNAPSHOT:
+		error = nandfs_cp_set_snapshot(node, cno);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (error);
+}
+
+/* Convert an on-media checkpoint entry into the user-visible cpinfo form. */
+static void
+nandfs_cpinfo_fill(struct nandfs_checkpoint *cnp, struct nandfs_cpinfo *nci)
+{
+
+	nci->nci_cno = cnp->cp_cno;
+	nci->nci_create = cnp->cp_create;
+	nci->nci_nblk_inc = cnp->cp_nblk_inc;
+	nci->nci_blocks_count = cnp->cp_blocks_count;
+	nci->nci_next = cnp->cp_snapshot_list.ssl_next;
+	nci->nci_flags = cnp->cp_flags;
+	nci->nci_pad = 0;
+	DPRINTF(CPFILE, ("%s: cn:%#jx ctime:%#jx\n",
+	    __func__, (uintmax_t)cnp->cp_cno,
+	    (uintmax_t)cnp->cp_create));
+}
+
+/*
+ * Copy descriptors of consecutive checkpoints starting at 'cno' into 'nci':
+ * at most 'mnmembs' entries and never past the filesystem's last checkpoint.
+ * '*nmembs' receives the number filled in; 'nmembs' must not be NULL (it is
+ * dereferenced unconditionally below).
+ */
+static int
+nandfs_get_cpinfo_cp(struct nandfs_node *node, uint64_t cno,
+    struct nandfs_cpinfo *nci, uint32_t mnmembs, uint32_t *nmembs)
+{
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint64_t blk, offset, last_cno, i;
+	uint16_t remaining;
+	int error;
+#ifdef INVARIANTS
+	uint64_t testblk, testoffset;
+#endif
+
+	if (cno == 0) {
+		return (ENOENT);
+	}
+
+	if (mnmembs < 1) {
+		return (EINVAL);
+	}
+
+	fsdev = node->nn_nandfsdev;
+	last_cno = fsdev->nd_last_cno;
+	DPRINTF(CPFILE, ("%s: cno:%#jx mnmembs: %#jx last:%#jx\n", __func__,
+	    (uintmax_t)cno, (uintmax_t)mnmembs,
+	    (uintmax_t)fsdev->nd_last_cno));
+
+	/*
+	 * do {
+	 * 	get block
+	 * 	read checkpoints until we hit last checkpoint, end of block or
+	 * 	requested number
+	 * } while (last read checkpoint <= last checkpoint on fs &&
+	 * 		read checkpoints < request number);
+	 */
+	*nmembs = i = 0;
+	do {
+		nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+		remaining = nandfs_checkpoint_blk_remaining(fsdev, cno,
+		    blk, offset);
+		error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		/* Consume entries until the block, the request or the
+		 * filesystem runs out of checkpoints. */
+		while (cno <= last_cno && i < mnmembs && remaining) {
+#ifdef INVARIANTS
+			nandfs_checkpoint_blk_offset(fsdev, cno, &testblk,
+			    &testoffset);
+			KASSERT(testblk == blk, ("testblk != blk"));
+			KASSERT(testoffset == offset, ("testoffset != offset"));
+#endif
+			DPRINTF(CPFILE, ("%s: cno %#jx\n", __func__,
+			    (uintmax_t)cno));
+
+			nandfs_cpinfo_fill((struct nandfs_checkpoint *)
+			    (bp->b_data + offset), nci);
+			offset += nandfs_checkpoint_size(fsdev);
+			i++;
+			nci++;
+			cno++;
+			(*nmembs)++;
+			remaining--;
+		}
+		brelse(bp);
+	} while (cno <= last_cno && i < mnmembs);
+
+	return (0);
+}
+
+/*
+ * Walk the snapshot list starting at checkpoint 'cno' (cno == 1 means
+ * "start from the first snapshot recorded in the cpfile header") and copy
+ * up to 'mnmembs' snapshot descriptors into 'nci'.  '*nmembs', when not
+ * NULL, receives the number of entries filled in.
+ */
+static int
+nandfs_get_cpinfo_sp(struct nandfs_node *node, uint64_t cno,
+    struct nandfs_cpinfo *nci, uint32_t mnmembs, uint32_t *nmembs)
+{
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_device *fsdev;
+	struct buf *bp = NULL;
+	uint64_t curr = 0;
+	uint64_t blk, offset, curr_cno;
+	uint32_t flag;
+	uint32_t i;	/* unsigned: avoids signed/unsigned compare below */
+	int error;
+
+	if (cno == 0 || cno == ~(0))
+		return (ENOENT);
+
+	fsdev = node->nn_nandfsdev;
+	curr_cno = cno;
+
+	if (nmembs)
+		*nmembs = 0;
+	if (curr_cno == 1) {
+		/* Get list from header */
+		error = nandfs_bread(node, 0, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		curr_cno = cnh->ch_snapshot_list.ssl_next;
+		brelse(bp);
+		bp = NULL;
+
+		/* No snapshots */
+		if (curr_cno == 0)
+			return (0);
+	}
+
+	for (i = 0; i < mnmembs; i++, nci++) {
+		nandfs_checkpoint_blk_offset(fsdev, curr_cno, &blk, &offset);
+		if (i == 0 || curr != blk) {
+			/* Crossed into a different cpfile block. */
+			if (bp)
+				brelse(bp);
+			error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (ENOENT);
+			}
+			curr = blk;
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		flag = cnp->cp_flags;
+		if (!(flag & NANDFS_CHECKPOINT_SNAPSHOT) ||
+		    (flag & NANDFS_CHECKPOINT_INVALID))
+			break;
+
+		/* Use the shared helper instead of open-coding the copy. */
+		nandfs_cpinfo_fill(cnp, nci);
+		if (nmembs)
+			(*nmembs)++;
+
+		curr_cno = nci->nci_next;
+		if (!curr_cno)
+			break;
+	}
+
+	/*
+	 * bp is still NULL when mnmembs == 0 (loop never ran); the original
+	 * called brelse(NULL) here, which dereferences a NULL buf.
+	 */
+	if (bp != NULL)
+		brelse(bp);
+
+	return (0);
+}
+
+/*
+ * Fetch checkpoint (NANDFS_CHECKPOINT) or snapshot (NANDFS_SNAPSHOT)
+ * descriptors starting at 'cno' into 'nci', under the cpfile vnode lock.
+ */
+int
+nandfs_get_cpinfo(struct nandfs_node *node, uint64_t cno, uint16_t flags,
+    struct nandfs_cpinfo *nci, uint32_t nmembs, uint32_t *nnmembs)
+{
+	int error;
+
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	if (flags == NANDFS_CHECKPOINT)
+		error = nandfs_get_cpinfo_cp(node, cno, nci, nmembs, nnmembs);
+	else if (flags == NANDFS_SNAPSHOT)
+		error = nandfs_get_cpinfo_sp(node, cno, nci, nmembs, nnmembs);
+	else
+		error = EINVAL;
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (error);
+}
+
+/*
+ * GET_CPINFO ioctl backend: bounce the user buffer through a kernel array,
+ * query the cpfile and copy the results back out.
+ */
+int
+nandfs_get_cpinfo_ioctl(struct nandfs_node *node, struct nandfs_argv *nargv)
+{
+	struct nandfs_cpinfo *kbuf;
+	uint32_t filled = 0;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_CPINFO_MAX)
+		return (EINVAL);
+
+	kbuf = malloc(sizeof(struct nandfs_cpinfo) * nargv->nv_nmembs,
+	    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+
+	error = nandfs_get_cpinfo(node, nargv->nv_index, nargv->nv_flags,
+	    kbuf, nargv->nv_nmembs, &filled);
+	if (error == 0) {
+		nargv->nv_nmembs = filled;
+		error = copyout(kbuf, (void *)((uintptr_t)nargv->nv_base),
+		    sizeof(struct nandfs_cpinfo) * filled);
+	}
+
+	free(kbuf, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Invalidate checkpoints in the range [start, end].  A snapshot terminates
+ * the scan (returns 0 without touching it); checkpoint number 0 is
+ * skipped.  Runs under the cpfile vnode lock.
+ */
+int
+nandfs_delete_cp(struct nandfs_node *node, uint64_t start, uint64_t end)
+{
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint64_t cno = start, blk, offset;
+	int error;
+
+	DPRINTF(CPFILE, ("%s: delete cno %jx-%jx\n", __func__, start, end));
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	fsdev = node->nn_nandfsdev;
+	for (cno = start; cno <= end; cno++) {
+		if (!cno)
+			continue;
+
+		nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+		error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+		if (error) {
+			VOP_UNLOCK(NTOV(node), 0);
+			brelse(bp);
+			return (error);
+		}
+
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		if (cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) {
+			brelse(bp);
+			VOP_UNLOCK(NTOV(node), 0);
+			return (0);
+		}
+
+		cnp->cp_flags |= NANDFS_CHECKPOINT_INVALID;
+
+		error = nandfs_dirty_buf(bp, 0);
+		if (error) {
+			/*
+			 * Don't leak the vnode lock: the original returned
+			 * here with NTOV(node) still locked.  bp is assumed
+			 * to be disposed of by nandfs_dirty_buf() on failure
+			 * (as on the other dirty_buf error paths here).
+			 */
+			VOP_UNLOCK(NTOV(node), 0);
+			return (error);
+		}
+	}
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (0);
+}
+
+/* Promote the most recent checkpoint to a snapshot; '*cno' gets its number. */
+int
+nandfs_make_snap(struct nandfs_device *fsdev, uint64_t *cno)
+{
+	struct nandfs_cpmode cpm;
+
+	cpm.ncpm_cno = fsdev->nd_last_cno;
+	cpm.ncpm_mode = NANDFS_SNAPSHOT;
+	*cno = cpm.ncpm_cno;
+
+	return (nandfs_chng_cpmode(fsdev->nd_cp_node, &cpm));
+}
+
+/* Demote snapshot 'cno' back to a regular checkpoint. */
+int
+nandfs_delete_snap(struct nandfs_device *fsdev, uint64_t cno)
+{
+	struct nandfs_cpmode cpm;
+
+	cpm.ncpm_mode = NANDFS_CHECKPOINT;
+	cpm.ncpm_cno = cno;
+
+	return (nandfs_chng_cpmode(fsdev->nd_cp_node, &cpm));
+}
+
+/*
+ * Report checkpoint statistics: the last checkpoint number plus the
+ * checkpoint and snapshot counts from the cpfile header.
+ * (Definition split onto two lines for consistency with the KNF style
+ * used by every other function in this file.)
+ */
+int
+nandfs_get_cpstat(struct nandfs_node *cp_node, struct nandfs_cpstat *ncp)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct buf *bp;
+	int error;
+
+	VOP_LOCK(NTOV(cp_node), LK_EXCLUSIVE);
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	ncp->ncp_cno = fsdev->nd_last_cno;
+	ncp->ncp_ncps = cnh->ch_ncheckpoints;
+	ncp->ncp_nss = cnh->ch_nsnapshots;
+	DPRINTF(CPFILE, ("%s: cno:%#jx ncps:%#jx nss:%#jx\n",
+	    __func__, ncp->ncp_cno, ncp->ncp_ncps, ncp->ncp_nss));
+	brelse(bp);
+	VOP_UNLOCK(NTOV(cp_node), 0);
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_dat.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_dat.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,344 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_dat.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate a fresh virtual block number from the DAT file.  The new
+ * entry's lifetime starts at the upcoming checkpoint (nd_last_cno + 1),
+ * its end is left open (UINTMAX_MAX) and no physical block is bound yet.
+ * On success '*vblock' holds the allocated number.  Takes the DAT vnode
+ * lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_alloc(struct nandfs_device *nandfsdev, nandfs_daddr_t *vblock)
+{
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *dat_entry;
+	uint64_t start;
+	uint32_t entry;
+	int locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+	start = nandfsdev->nd_last_cno + 1;
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	req.entrynum = 0;
+
+	/* Alloc vblock number */
+	error = nandfs_find_free_entry(mdt, dat, &req);
+	if (error) {
+		nandfs_error("%s: cannot find free vblk entry\n",
+		    __func__);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Read/create buffer */
+	error = nandfs_get_entry_block(mdt, dat, &req, &entry, 1);
+	if (error) {
+		nandfs_error("%s: cannot get free vblk entry\n",
+		    __func__);
+		nandfs_abort_entry(&req);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Fill out vblock data */
+	dat_entry = (struct nandfs_dat_entry *) req.bp_entry->b_data;
+	dat_entry[entry].de_start = start;
+	dat_entry[entry].de_end = UINTMAX_MAX;
+	dat_entry[entry].de_blocknr = 0;
+
+	/* Commit allocation */
+	error = nandfs_alloc_entry(mdt, &req);
+	if (error) {
+		nandfs_error("%s: cannot get free vblk entry\n",
+		    __func__);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Return allocated vblock */
+	*vblock = req.entrynum;
+	DPRINTF(DAT, ("%s: allocated vblock %#jx\n",
+	    __func__, (uintmax_t)*vblock));
+
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+	return (error);
+}
+
+/*
+ * Bind virtual block 'vblock' to physical block 'block' in its DAT entry.
+ * Takes the DAT vnode lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_assign(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock,
+    nandfs_lbn_t block)
+{
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *dat_entry;
+	uint32_t entry;
+	int locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	req.entrynum = vblock;
+
+	error = nandfs_get_entry_block(mdt, dat, &req, &entry, 0);
+	if (!error) {
+		dat_entry = (struct nandfs_dat_entry *) req.bp_entry->b_data;
+		dat_entry[entry].de_blocknr = block;
+
+		/* Debug-message typo fixed: "assing" -> "assign". */
+		DPRINTF(DAT, ("%s: assign vblock %jx->%jx\n",
+		    __func__, (uintmax_t)vblock, (uintmax_t)block));
+
+		/*
+		 * It is mostly called from syncer() so
+		 * we want to force making buf dirty
+		 */
+		error = nandfs_dirty_buf(req.bp_entry, 1);
+	}
+
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * Close the lifetime of virtual block 'vblock' by storing the last
+ * checkpoint number in its DAT entry's de_end field.  Takes the DAT vnode
+ * lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_end(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *entries;
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	uint64_t last_cno;
+	uint32_t idx;
+	int was_locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+	last_cno = nandfsdev->nd_last_cno;
+
+	was_locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!was_locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+
+	req.entrynum = vblock;
+	error = nandfs_get_entry_block(mdt, dat, &req, &idx, 0);
+	if (error == 0) {
+		entries = (struct nandfs_dat_entry *)req.bp_entry->b_data;
+		entries[idx].de_end = last_cno;
+		DPRINTF(DAT, ("%s: end vblock %#jx at checkpoint %#jx\n",
+		    __func__, (uintmax_t)vblock, (uintmax_t)last_cno));
+
+		/*
+		 * It is mostly called from syncer() so
+		 * we want to force making buf dirty
+		 */
+		error = nandfs_dirty_buf(req.bp_entry, 1);
+	}
+
+	if (!was_locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/* Release the DAT entry of virtual block 'vblock'. */
+int
+nandfs_vblock_free(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_node *dat;
+	int error;
+
+	dat = nandfsdev->nd_dat_node;
+	req.entrynum = vblock;
+
+	VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	error = nandfs_find_entry(&nandfsdev->nd_dat_mdt, dat, &req);
+	if (error == 0) {
+		DPRINTF(DAT, ("%s: vblk %#jx\n", __func__, (uintmax_t)vblock));
+		nandfs_free_entry(&nandfsdev->nd_dat_mdt, &req);
+	}
+	VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * GET_VINFO ioctl backend: copy the vinfo request array in, resolve the
+ * virtual block mappings and copy the results back to userland.
+ */
+int
+nandfs_get_dat_vinfo_ioctl(struct nandfs_device *nandfsdev, struct nandfs_argv *nargv)
+{
+	struct nandfs_vinfo *kbuf;
+	void *ubase;
+	size_t bytes;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_VINFO_MAX)
+		return (EINVAL);
+
+	ubase = (void *)(uintptr_t)nargv->nv_base;
+	bytes = sizeof(struct nandfs_vinfo) * nargv->nv_nmembs;
+	kbuf = malloc(bytes, M_NANDFSTEMP, M_WAITOK|M_ZERO);
+
+	error = copyin(ubase, kbuf, bytes);
+	if (error == 0)
+		error = nandfs_get_dat_vinfo(nandfsdev, kbuf, nargv->nv_nmembs);
+	if (error == 0)
+		error = copyout(kbuf, ubase, bytes);
+
+	free(kbuf, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * For each entry in 'vinfo' look up its DAT entry and fill in the block's
+ * checkpoint lifetime ([start, end]) and current physical block number.
+ */
+int
+nandfs_get_dat_vinfo(struct nandfs_device *nandfsdev, struct nandfs_vinfo *vinfo,
+    uint32_t nmembs)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *entries;
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	uint32_t n, idx;
+	int error = 0;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+
+	DPRINTF(DAT, ("%s: nmembs %#x\n", __func__, nmembs));
+
+	VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	for (n = 0; n < nmembs; n++) {
+		req.entrynum = vinfo[n].nvi_vblocknr;
+
+		error = nandfs_get_entry_block(mdt, dat, &req, &idx, 0);
+		if (error)
+			break;
+
+		entries = (struct nandfs_dat_entry *)req.bp_entry->b_data;
+		vinfo[n].nvi_start = entries[idx].de_start;
+		vinfo[n].nvi_end = entries[idx].de_end;
+		vinfo[n].nvi_blocknr = entries[idx].de_blocknr;
+
+		DPRINTF(DAT, ("%s: vinfo: %jx[%jx-%jx]->%jx\n",
+		    __func__, vinfo[n].nvi_vblocknr, vinfo[n].nvi_start,
+		    vinfo[n].nvi_end, vinfo[n].nvi_blocknr));
+
+		brelse(req.bp_entry);
+	}
+	VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * GET_BDESCS ioctl backend: copy the block-descriptor array in, resolve
+ * the current physical locations and copy the results back out.
+ */
+int
+nandfs_get_dat_bdescs_ioctl(struct nandfs_device *nffsdev,
+    struct nandfs_argv *nargv)
+{
+	struct nandfs_bdesc *bd;
+	size_t size;
+	int error;
+
+	/*
+	 * nv_nmembs comes straight from userland; unlike the vinfo/cpinfo
+	 * ioctls it was previously unbounded, letting a caller overflow the
+	 * size computation or force an arbitrarily large kernel allocation.
+	 * Reuse the vinfo bound here; a dedicated NANDFS_BDESC_MAX would be
+	 * cleaner if one is added to the headers.
+	 */
+	if (nargv->nv_nmembs > NANDFS_VINFO_MAX)
+		return (EINVAL);
+
+	size = nargv->nv_nmembs * sizeof(struct nandfs_bdesc);
+	bd = malloc(size, M_NANDFSTEMP, M_WAITOK);
+	error = copyin((void *)(uintptr_t)nargv->nv_base, bd, size);
+	if (error) {
+		free(bd, M_NANDFSTEMP);
+		return (error);
+	}
+
+	error = nandfs_get_dat_bdescs(nffsdev, bd, nargv->nv_nmembs);
+
+	if (error == 0)
+		error =	copyout(bd, (void *)(uintptr_t)nargv->nv_base, size);
+
+	free(bd, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Resolve the current physical block number of each DAT-file block listed
+ * in 'bd' via the DAT node's bmap.
+ */
+int
+nandfs_get_dat_bdescs(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint32_t nmembs)
+{
+	struct nandfs_node *dat_node;
+	uint64_t blocknr;
+	uint32_t n;
+	int error = 0;
+
+	dat_node = nffsdev->nd_dat_node;
+	VOP_LOCK(NTOV(dat_node), LK_EXCLUSIVE);
+
+	for (n = 0; n < nmembs; n++) {
+		DPRINTF(CLEAN,
+		    ("%s: bd ino:%#jx oblk:%#jx blocknr:%#jx off:%#jx\n",
+		    __func__,  (uintmax_t)bd[n].bd_ino,
+		    (uintmax_t)bd[n].bd_oblocknr, (uintmax_t)bd[n].bd_blocknr,
+		    (uintmax_t)bd[n].bd_offset));
+
+		error = nandfs_bmap_lookup(dat_node, bd[n].bd_offset, &blocknr);
+		if (error)
+			break;
+		bd[n].bd_blocknr = blocknr;
+	}
+
+	VOP_UNLOCK(NTOV(dat_node), 0);
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_dir.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_dir.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,314 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_dir.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+/*
+ * Append a directory entry (ino, nameptr[0..namelen-1], type) to
+ * directory dvp.  The entry is packed into the slack of the last
+ * existing directory block when it fits; otherwise a fresh block is
+ * allocated and the old block's slack is folded into i_size.  The new
+ * entry always claims the remainder of its block (rec_len =
+ * blocksize - off), i.e. the last entry owns the block tail.  On
+ * success the directory's i_size is grown and the vnode pager resized.
+ * Returns 0 or an error from the buffer-cache helpers.
+ */
+int
+nandfs_add_dirent(struct vnode *dvp, uint64_t ino, char *nameptr, long namelen,
+    uint8_t type)
+{
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_dir_entry *dirent, *pdirent;
+	uint32_t blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	uint64_t filesize = dir_node->nn_inode.i_size;
+	uint64_t inode_blks = dir_node->nn_inode.i_blocks;
+	uint32_t off, rest;
+	uint8_t *pos;
+	struct buf *bp;
+	int error;
+
+	pdirent = NULL;
+	bp = NULL;
+	if (inode_blks) {
+		error = nandfs_bread(dir_node, inode_blks - 1, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		/* Walk to the final entry of the last block; its rec_len
+		 * covers the tail, so that is where the slack lives. */
+		pos = bp->b_data;
+		off = 0;
+		while (off < blocksize) {
+			pdirent = (struct nandfs_dir_entry *) (pos + off);
+			if (!pdirent->rec_len) {
+				pdirent = NULL;
+				break;
+			}
+			off += pdirent->rec_len;
+		}
+
+		/* Slack after the last entry's real payload, or the whole
+		 * block when it is empty. */
+		if (pdirent)
+			rest = pdirent->rec_len -
+			    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		else
+			rest = blocksize;
+
+		if (rest < NANDFS_DIR_REC_LEN(namelen)) {
+			/* Do not update pdirent as new block is created */
+			pdirent = NULL;
+			brelse(bp);
+			/* Set to NULL to create new */
+			bp = NULL;
+			/* Unused tail of the old block still counts into
+			 * the directory size. */
+			filesize += rest;
+		}
+	}
+
+	/* If no bp found create new */
+	if (!bp) {
+		error = nandfs_bcreate(dir_node, inode_blks, NOCRED, 0, &bp);
+		if (error)
+			return (error);
+		off = 0;
+		pos = bp->b_data;
+	}
+
+	/* Modify pdirent if exists */
+	if (pdirent) {
+		DPRINTF(LOOKUP, ("modify pdirent %p\n", pdirent));
+		/* Shrink the old last entry to its real length; the new
+		 * entry starts right after it. */
+		off -= pdirent->rec_len;
+		pdirent->rec_len =
+		    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		off += pdirent->rec_len;
+	}
+
+	/* Create new dirent */
+	dirent = (struct nandfs_dir_entry *) (pos + off);
+	dirent->rec_len = blocksize - off;
+	dirent->inode = ino;
+	dirent->name_len = namelen;
+	/* Zero the padded name area before copying so on-media bytes are
+	 * deterministic. */
+	memset(dirent->name, 0, NANDFS_DIR_NAME_LEN(namelen));
+	memcpy(dirent->name, nameptr, namelen);
+	dirent->file_type = type;
+
+	filesize += NANDFS_DIR_REC_LEN(dirent->name_len);
+
+	DPRINTF(LOOKUP, ("create dir_entry '%.*s' at %p with size %x "
+	    "new filesize: %jx\n",
+	    (int)namelen, dirent->name, dirent, dirent->rec_len,
+	    (uintmax_t)filesize));
+
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes bp -- confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	dir_node->nn_inode.i_size = filesize;
+	dir_node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	vnode_pager_setsize(dvp, filesize);
+
+	return (0);
+}
+
+/*
+ * Remove a directory entry from directory dvp.  With a non-NULL 'node'
+ * the entry for node->nn_ino recorded at node->nn_diroff is removed;
+ * otherwise the whiteout entry (NANDFS_WHT_INO) recorded in the
+ * directory node itself is removed.  Removal folds the victim's record
+ * length into the preceding live entry and clears its inode number; if
+ * the victim was the directory's final entry, i_size is trimmed back.
+ *
+ * Returns 0 on success, ENOENT when no matching entry exists at the
+ * recorded offset, or an error from the buffer-cache helpers.
+ */
+int
+nandfs_remove_dirent(struct vnode *dvp, struct nandfs_node *node,
+    struct componentname *cnp)
+{
+	struct nandfs_node *dir_node;
+	struct nandfs_dir_entry *dirent, *pdirent;
+	struct buf *bp;
+	uint64_t filesize, blocknr, ino, offset;
+	uint32_t blocksize, limit, off;
+	uint16_t newsize;
+	uint8_t *pos;
+	int error, found;
+
+	dir_node = VTON(dvp);
+	filesize = dir_node->nn_inode.i_size;
+	if (!filesize)
+		return (0);
+
+	if (node) {
+		offset = node->nn_diroff;
+		ino = node->nn_ino;
+	} else {
+		offset = dir_node->nn_diroff;
+		ino = NANDFS_WHT_INO;
+	}
+
+	dirent = pdirent = NULL;
+	blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	blocknr = offset / blocksize;
+
+	DPRINTF(LOOKUP, ("rm direntry dvp %p node %p ino %#jx at off %#jx\n",
+	    dvp, node, (uintmax_t)ino, (uintmax_t)offset));
+
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/*
+	 * Walk the block up to the victim's offset, remembering the last
+	 * preceding live entry so its rec_len can absorb the victim.
+	 */
+	pos = bp->b_data;
+	off = 0;
+	found = 0;
+	limit = offset % blocksize;
+	pdirent = (struct nandfs_dir_entry *) bp->b_data;
+	while (off <= limit) {
+		dirent = (struct nandfs_dir_entry *) (pos + off);
+
+		if ((off == limit) &&
+		    (dirent->inode == ino)) {
+			found = 1;
+			break;
+		}
+		/*
+		 * Guard against corrupted media: a zero rec_len would
+		 * otherwise spin this loop forever.
+		 */
+		if (dirent->rec_len == 0)
+			break;
+		if (dirent->inode != 0)
+			pdirent = dirent;
+		off += dirent->rec_len;
+	}
+
+	if (!found) {
+		nandfs_error("cannot find entry to remove");
+		brelse(bp);
+		/*
+		 * Bug fix: this used to return the stale 'error' value,
+		 * which is always 0 here, silently reporting success.
+		 */
+		return (ENOENT);
+	}
+	DPRINTF(LOOKUP,
+	    ("rm dirent ino %#jx at %#x with size %#x\n",
+	    (uintmax_t)dirent->inode, off, dirent->rec_len));
+
+	/* Fold the victim's record into the preceding live entry. */
+	newsize = (uintptr_t)dirent - (uintptr_t)pdirent;
+	newsize += dirent->rec_len;
+	pdirent->rec_len = newsize;
+	dirent->inode = 0;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	dir_node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	/* If last one modify filesize */
+	if ((offset + NANDFS_DIR_REC_LEN(dirent->name_len)) == filesize) {
+		filesize = blocknr * blocksize +
+		    ((uintptr_t)pdirent - (uintptr_t)pos) +
+		    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		dir_node->nn_inode.i_size = filesize;
+	}
+
+	return (0);
+}
+
+/*
+ * Rewrite the inode number of the first directory entry in block 0 of
+ * dvp to 'newparent'.  nandfs_init_dir() creates ".." as the first
+ * entry, so this effectively re-parents the directory.  Returns 0 or
+ * an error from the buffer-cache helpers.
+ */
+int
+nandfs_update_parent_dir(struct vnode *dvp, uint64_t newparent)
+{
+	struct nandfs_dir_entry *dirent;
+	struct nandfs_node *dir_node;
+	struct buf *bp;
+	int error;
+
+	dir_node = VTON(dvp);
+	error = nandfs_bread(dir_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	dirent = (struct nandfs_dir_entry *)bp->b_data;
+	dirent->inode = newparent;
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes bp -- confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Retarget an existing directory entry: the entry for tnode (inside
+ * directory dvp, at byte offset tnode->nn_diroff) has its inode number
+ * replaced by fnode's.  The entry's name and length are untouched.
+ * Returns 0 (also for an empty directory) or an error from the
+ * buffer-cache helpers.
+ */
+int
+nandfs_update_dirent(struct vnode *dvp, struct nandfs_node *fnode,
+    struct nandfs_node *tnode)
+{
+	struct nandfs_node *dir_node;
+	struct nandfs_dir_entry *dirent;
+	struct buf *bp;
+	uint64_t file_size, blocknr;
+	uint32_t blocksize, off;
+	uint8_t *pos;
+	int error;
+
+	dir_node = VTON(dvp);
+	file_size = dir_node->nn_inode.i_size;
+	if (!file_size)
+		return (0);
+
+	DPRINTF(LOOKUP,
+	    ("chg direntry dvp %p ino %#jx  to in %#jx at off %#jx\n",
+	    dvp, (uintmax_t)tnode->nn_ino, (uintmax_t)fnode->nn_ino,
+	    (uintmax_t)tnode->nn_diroff));
+
+	/* nn_diroff is a byte offset within the directory file. */
+	blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	blocknr = tnode->nn_diroff / blocksize;
+	off = tnode->nn_diroff % blocksize;
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	pos = bp->b_data;
+	dirent = (struct nandfs_dir_entry *) (pos + off);
+	/* The recorded offset must still name tnode's entry. */
+	KASSERT((dirent->inode == tnode->nn_ino),
+	    ("direntry mismatch"));
+
+	dirent->inode = fnode->nn_ino;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Create the initial ".." and "." entries of a freshly-made directory.
+ * 'ino' is the directory's own inode number, 'parent_ino' its parent's.
+ * ".." is created first so it occupies the first slot (relied upon by
+ * nandfs_update_parent_dir()).  Returns 0 on success, -1 on failure.
+ */
+int
+nandfs_init_dir(struct vnode *dvp, uint64_t ino, uint64_t parent_ino)
+{
+
+	if (nandfs_add_dirent(dvp, parent_ino, "..", 2, DT_DIR) ||
+	    nandfs_add_dirent(dvp, ino, ".", 1, DT_DIR)) {
+		/*
+		 * Fix: %jd with a bare uint64_t argument is a varargs type
+		 * mismatch (undefined behavior); use %ju with explicit
+		 * uintmax_t casts.
+		 */
+		nandfs_error("%s: cannot initialize dir ino:%ju(pino:%ju)\n",
+		    __func__, (uintmax_t)ino, (uintmax_t)parent_ino);
+		return (-1);
+	}
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_fs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_fs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,565 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Original definitions written by Koji Sato <koji at osrg.net>
+ *                    and Ryusuke Konishi <ryusuke at osrg.net>
+ * From: NetBSD: nandfs_fs.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_fs.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _NANDFS_FS_H
+#define _NANDFS_FS_H
+
+#include <sys/uuid.h>
+
+#define	MNINDIR(fsdev)	((fsdev)->nd_blocksize / sizeof(nandfs_daddr_t))
+
+/*
+ * Inode structure. There are a few dedicated inode numbers that are
+ * defined here first.
+ */
+#define	NANDFS_WHT_INO		1	/* Whiteout ino			*/
+#define	NANDFS_ROOT_INO		2	/* Root file inode		*/
+#define	NANDFS_DAT_INO		3	/* DAT file			*/
+#define	NANDFS_CPFILE_INO	4	/* checkpoint file		*/
+#define	NANDFS_SUFILE_INO	5	/* segment usage file		*/
+#define	NANDFS_IFILE_INO	6	/* ifile			*/
+#define	NANDFS_GC_INO		7	/* Cleanerd node		*/
+#define	NANDFS_ATIME_INO	8	/* Atime file (reserved)	*/
+#define	NANDFS_XATTR_INO	9	/* Xattribute file (reserved)	*/
+#define	NANDFS_SKETCH_INO	10	/* Sketch file (obsolete)	*/
+#define	NANDFS_USER_INO		11	/* First user's file inode number */
+
+#define	NANDFS_SYS_NODE(ino) \
+	(((ino) >= NANDFS_DAT_INO) && ((ino) <= NANDFS_GC_INO))
+
+#define	NDADDR		12		/* Direct addresses in inode. */
+#define	NIADDR		3		/* Indirect addresses in inode. */
+
+typedef	int64_t		nandfs_daddr_t;
+typedef	int64_t		nandfs_lbn_t;
+
+struct nandfs_inode {
+	uint64_t	i_blocks;	/* 0: size in device blocks		*/
+	uint64_t	i_size;		/* 8: size in bytes			*/
+	uint64_t	i_ctime;	/* 16: creation time in seconds		*/
+	uint64_t	i_mtime;	/* 24: modification time in seconds part*/
+	uint32_t	i_ctime_nsec;	/* 32: creation time nanoseconds part	*/
+	uint32_t	i_mtime_nsec;	/* 36: modification time in nanoseconds	*/
+	uint32_t	i_uid;		/* 40: user id				*/
+	uint32_t	i_gid;		/* 44: group id				*/
+	uint16_t	i_mode;		/* 48: file mode			*/
+	uint16_t	i_links_count;	/* 50: number of references to the inode*/
+	uint32_t	i_flags;	/* 52: NANDFS_*_FL flags		*/
+	nandfs_daddr_t	i_special;	/* 56: special				*/
+	nandfs_daddr_t	i_db[NDADDR];	/* 64: Direct disk blocks.		*/
+	nandfs_daddr_t	i_ib[NIADDR];	/* 160: Indirect disk blocks.		*/
+	uint64_t	i_xattr;	/* 184: reserved for extended attributes*/
+	uint32_t	i_generation;	/* 192: file generation for NFS		*/
+	uint32_t	i_pad[15];	/* 196: make it 64 bits aligned		*/
+};
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_inode) == 256);
+#endif
+
+/*
+ * Each checkpoint/snapshot has a super root.
+ *
+ * The super root holds the inodes of the three system files: `dat', `cp' and
+ * 'su' files. All other FS state is defined by those.
+ *
+ * It is CRC checksum'ed and time stamped.
+ */
+
+struct nandfs_super_root {
+	uint32_t	sr_sum;		/* check-sum				*/
+	uint16_t	sr_bytes;	/* byte count of this structure		*/
+	uint16_t	sr_flags;	/* reserved for flags			*/
+	uint64_t	sr_nongc_ctime;	/* timestamp, not for cleaner(?)	*/
+	struct nandfs_inode sr_dat;	/* DAT, virt->phys translation inode	*/
+	struct nandfs_inode sr_cpfile;	/* CP, checkpoints inode		*/
+	struct nandfs_inode sr_sufile;	/* SU, segment usage inode		*/
+};
+
+/*
+ * Byte offset of the i'th system-file inode (DAT, cpfile, sufile)
+ * inside a super root, given the on-media inode size.  Use offsetof()
+ * (already relied upon by NANDFS_FSDATA_CRC_BYTES below) instead of the
+ * null-pointer-dereference idiom: the old form cast a pointer to
+ * uint32_t, which is undefined behavior and truncation-prone on LP64.
+ */
+#define	NANDFS_SR_MDT_OFFSET(inode_size, i)			\
+	(offsetof(struct nandfs_super_root, sr_dat) +		\
+	(inode_size) * (i))
+
+#define	NANDFS_SR_DAT_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 0)
+#define	NANDFS_SR_CPFILE_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 1)
+#define	NANDFS_SR_SUFILE_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 2)
+#define	NANDFS_SR_BYTES			(sizeof(struct nandfs_super_root))
+
+/*
+ * The superblock describes the basic structure and mount history. It also
+ * records some sizes of structures found on the disc for sanity checks.
+ *
+ * The superblock is stored at two places: NANDFS_SB_OFFSET_BYTES and
+ * NANDFS_SB2_OFFSET_BYTES.
+ */
+
+/* File system states stored on media in superblock's sbp->s_state */
+#define	NANDFS_VALID_FS		0x0001	/* cleanly unmounted and all is ok  */
+#define	NANDFS_ERROR_FS		0x0002	/* there were errors detected, fsck */
+#define	NANDFS_RESIZE_FS	0x0004	/* resize required, XXX unknown flag*/
+#define	NANDFS_MOUNT_STATE_BITS	"\20\1VALID_FS\2ERROR_FS\3RESIZE_FS"
+
+/*
+ * Brief description of control structures:
+ *
+ * NANDFS_NFSAREAS first blocks contain fsdata and some amount of super blocks.
+ * Simple round-robin policy is used in order to choose which block will
+ * contain new super block.
+ *
+ * Simple case with 2 blocks:
+ * 1: fsdata sblock1 [sblock3 [sblock5 ..]]
+ * 2: fsdata sblock2 [sblock4 [sblock6 ..]]
+ */
+struct nandfs_fsdata {
+	uint16_t	f_magic;
+	uint16_t	f_bytes;
+
+	uint32_t	f_sum;		/* checksum of fsdata		*/
+	uint32_t	f_rev_level;	/* major disk format revision	*/
+
+	uint64_t	f_ctime;	/* creation time (execution time
+					   of newfs)			*/
+	/* Block size represented as: blocksize = 1 << (f_log_block_size + 10)	*/
+	uint32_t	f_log_block_size;
+
+	uint16_t	f_inode_size;		/* size of an inode		*/
+	uint16_t	f_dat_entry_size;	/* size of a dat entry		*/
+	uint16_t	f_checkpoint_size;	/* size of a checkpoint		*/
+	uint16_t	f_segment_usage_size;	/* size of a segment usage	*/
+
+	uint16_t	f_sbbytes;		/* byte count of CRC calculation
+						   for super blocks. s_reserved
+						   is excluded!			*/
+
+	uint16_t	f_errors;		/* behaviour on detecting errors	*/
+
+	uint32_t	f_erasesize;
+	uint64_t	f_nsegments;		/* number of segm. in filesystem	*/
+	nandfs_daddr_t	f_first_data_block;	/* 1st seg disk block number		*/
+	uint32_t	f_blocks_per_segment;	/* number of blocks per segment		*/
+	uint32_t	f_r_segments_percentage;	/* reserved segments percentage		*/
+
+	struct uuid	f_uuid;			/* 128-bit uuid for volume		*/
+	char		f_volume_name[16];	/* volume name				*/
+	uint32_t	f_pad[104];
+} __packed;
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_fsdata) == 512);
+#endif
+
+struct nandfs_super_block {
+	uint16_t	s_magic;		/* magic value for identification */
+
+	uint32_t	s_sum;			/* check sum of super block       */
+
+	uint64_t	s_last_cno;		/* last checkpoint number         */
+	uint64_t	s_last_pseg;		/* addr part. segm. written last  */
+	uint64_t	s_last_seq;		/* seq.number of seg written last */
+	uint64_t	s_free_blocks_count;	/* free blocks count              */
+
+	uint64_t	s_mtime;		/* mount time                     */
+	uint64_t	s_wtime;		/* write time                     */
+	uint16_t	s_state;		/* file system state              */
+
+	char		s_last_mounted[64];	/* directory where last mounted   */
+
+	uint32_t	s_c_interval;		/* commit interval of segment     */
+	uint32_t	s_c_block_max;		/* threshold of data amount for
+						   the segment construction */
+	uint32_t	s_reserved[32];		/* padding to end of the block    */
+} __packed;
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_super_block) == 256);
+#endif
+
+#define	NANDFS_FSDATA_MAGIC	0xf8da
+#define	NANDFS_SUPER_MAGIC	0x8008
+
+#define	NANDFS_NFSAREAS		4
+#define	NANDFS_DATA_OFFSET_BYTES(esize)	(NANDFS_NFSAREAS * (esize))
+
+#define	NANDFS_SBLOCK_OFFSET_BYTES (sizeof(struct nandfs_fsdata))
+
+#define	NANDFS_DEF_BLOCKSIZE	4096
+#define	NANDFS_MIN_BLOCKSIZE	512
+
+#define	NANDFS_DEF_ERASESIZE	(2 << 16)
+
+#define	NANDFS_MIN_SEGSIZE	NANDFS_DEF_ERASESIZE
+
+#define	NANDFS_CURRENT_REV	9	/* current major revision */
+
+#define	NANDFS_FSDATA_CRC_BYTES offsetof(struct nandfs_fsdata, f_pad)
+/* Bytes count of super_block for CRC-calculation */
+#define	NANDFS_SB_BYTES  offsetof(struct nandfs_super_block, s_reserved)
+
+/* Maximal count of links to a file */
+#define	NANDFS_LINK_MAX		32000
+
+/*
+ * Structure of a directory entry.
+ *
+ * Note that they can't span blocks; the rec_len fills out.
+ */
+
+#define	NANDFS_NAME_LEN 255
+struct nandfs_dir_entry {
+	uint64_t	inode;			/* inode number */
+	uint16_t	rec_len;		/* directory entry length */
+	uint8_t		name_len;		/* name length */
+	uint8_t		file_type;
+	char		name[NANDFS_NAME_LEN];	/* file name */
+	char		pad;
+};
+
+/*
+ * NANDFS_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 8
+ */
+#define	NANDFS_DIR_PAD			8
+#define	NANDFS_DIR_ROUND		(NANDFS_DIR_PAD - 1)
+#define	NANDFS_DIR_NAME_OFFSET		(offsetof(struct nandfs_dir_entry, name))
+#define	NANDFS_DIR_REC_LEN(name_len)					\
+	(((name_len) + NANDFS_DIR_NAME_OFFSET + NANDFS_DIR_ROUND)	\
+	& ~NANDFS_DIR_ROUND)
+#define	NANDFS_DIR_NAME_LEN(name_len)	\
+	(NANDFS_DIR_REC_LEN(name_len) - NANDFS_DIR_NAME_OFFSET)
+
+/*
+ * NiLFS/NANDFS divides the disk into fixed-length segments. Each segment is
+ * filled with one or more partial segments of variable lengths.
+ *
+ * Each partial segment has a segment summary header followed by updates of
+ * files and optionally a super root.
+ */
+
+/*
+ * Virtual to physical block translation information. For data blocks it maps
+ * logical block number bi_blkoff to virtual block nr bi_vblocknr. For non
+ * datablocks it is the virtual block number assigned to an indirect block
+ * and has no bi_blkoff. The physical block number is the next
+ * available data block in the partial segment after all the binfo's.
+ */
+struct nandfs_binfo_v {
+	uint64_t	bi_ino;		/* file's inode			     */
+	uint64_t	bi_vblocknr;	/* assigned virtual block number     */
+	uint64_t	bi_blkoff;	/* for file's logical block number   */
+};
+
+/*
+ * DAT allocation. For data blocks just the logical block number that maps on
+ * the next available data block in the partial segment after the binfo's.
+ */
+struct nandfs_binfo_dat {
+	uint64_t	bi_ino;
+	uint64_t	bi_blkoff;	/* DAT file's logical block number */
+	uint8_t		bi_level;	/* whether this is meta block */
+	uint8_t		bi_pad[7];
+};
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_binfo_v) == sizeof(struct nandfs_binfo_dat));
+#endif
+
+/* Convenience union for both types of binfo's */
+union nandfs_binfo {
+	struct nandfs_binfo_v bi_v;
+	struct nandfs_binfo_dat bi_dat;
+};
+
+/* Indirect buffers path */
+struct nandfs_indir {
+	nandfs_daddr_t	in_lbn;
+	int		in_off;
+};
+
+/* The (partial) segment summary */
+struct nandfs_segment_summary {
+	uint32_t	ss_datasum;	/* CRC of complete data block        */
+	uint32_t	ss_sumsum;	/* CRC of segment summary only       */
+	uint32_t	ss_magic;	/* magic to identify segment summary */
+	uint16_t	ss_bytes;	/* size of segment summary structure */
+	uint16_t	ss_flags;	/* NANDFS_SS_* flags                  */
+	uint64_t	ss_seq;		/* sequence number of this segm. sum */
+	uint64_t	ss_create;	/* creation timestamp in seconds     */
+	uint64_t	ss_next;	/* blocknumber of next segment       */
+	uint32_t	ss_nblocks;	/* number of blocks used by summary  */
+	uint32_t	ss_nbinfos;	/* number of binfo structures	     */
+	uint32_t	ss_sumbytes;	/* total size of segment summary     */
+	uint32_t	ss_pad;
+	/* stream of binfo structures */
+};
+
+#define	NANDFS_SEGSUM_MAGIC	0x8e680011	/* segment summary magic number */
+
+/* Segment summary flags */
+#define	NANDFS_SS_LOGBGN	0x0001	/* begins a logical segment */
+#define	NANDFS_SS_LOGEND	0x0002	/* ends a logical segment */
+#define	NANDFS_SS_SR		0x0004	/* has super root */
+#define	NANDFS_SS_SYNDT		0x0008	/* includes data only updates */
+#define	NANDFS_SS_GC		0x0010	/* segment written for cleaner operation */
+#define	NANDFS_SS_FLAG_BITS	"\20\1LOGBGN\2LOGEND\3SR\4SYNDT\5GC"
+
+/* Segment summary constraints */
+#define	NANDFS_SEG_MIN_BLOCKS	16	/* minimum number of blocks in a
+					   full segment */
+#define	NANDFS_PSEG_MIN_BLOCKS	2	/* minimum number of blocks in a
+					   partial segment */
+#define	NANDFS_MIN_NRSVSEGS	8	/* minimum number of reserved
+					   segments */
+
+/*
+ * Structure of DAT/inode file.
+ *
+ * A DAT file is divided into groups. The maximum number of groups is the
+ * number of block group descriptors that fit into one block; this descriptor
+ * only gives the number of free entries in the associated group.
+ *
+ * Each group has a block sized bitmap indicating if an entry is taken or
+ * empty. Each bit stands for a DAT entry.
+ *
+ * The inode file has exactly the same format only the entries are inode
+ * entries.
+ */
+
+struct nandfs_block_group_desc {
+	uint32_t	bg_nfrees;	/* num. free entries in block group  */
+};
+
+/* DAT entry in a super root's DAT file */
+struct nandfs_dat_entry {
+	uint64_t	de_blocknr;	/* block number                      */
+	uint64_t	de_start;	/* valid from checkpoint             */
+	uint64_t	de_end;		/* valid till checkpoint             */
+	uint64_t	de_rsv;		/* reserved for future use           */
+};
+
+/*
+ * Structure of CP file.
+ *
+ * A snapshot is just a checkpoint only it's protected against removal by the
+ * cleaner. The snapshots are kept on a double linked list of checkpoints.
+ */
+struct nandfs_snapshot_list {
+	uint64_t	ssl_next;	/* checkpoint nr. forward */
+	uint64_t	ssl_prev;	/* checkpoint nr. back    */
+};
+
+/* Checkpoint entry structure */
+struct nandfs_checkpoint {
+	uint32_t	cp_flags;		/* NANDFS_CHECKPOINT_* flags          */
+	uint32_t	cp_checkpoints_count;	/* ZERO, not used anymore?           */
+	struct nandfs_snapshot_list cp_snapshot_list; /* list of snapshots   */
+	uint64_t	cp_cno;			/* checkpoint number                 */
+	uint64_t	cp_create;		/* creation timestamp                */
+	uint64_t	cp_nblk_inc;		/* number of blocks incremented      */
+	uint64_t	cp_blocks_count;	/* reserved (might be deleted)       */
+	struct nandfs_inode cp_ifile_inode;	/* inode file inode          */
+};
+
+/* Checkpoint flags */
+#define	NANDFS_CHECKPOINT_SNAPSHOT	1
+#define	NANDFS_CHECKPOINT_INVALID	2
+#define	NANDFS_CHECKPOINT_SKETCH	4
+#define	NANDFS_CHECKPOINT_MINOR		8
+#define	NANDFS_CHECKPOINT_BITS		"\20\1SNAPSHOT\2INVALID\3SKETCH\4MINOR"
+
+/* Header of the checkpoint file */
+struct nandfs_cpfile_header {
+	uint64_t	ch_ncheckpoints;	/* number of checkpoints             */
+	uint64_t	ch_nsnapshots;	/* number of snapshots               */
+	struct nandfs_snapshot_list ch_snapshot_list;	/* snapshot list     */
+};
+
+#define	NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET		\
+	((sizeof(struct nandfs_cpfile_header) +		\
+	sizeof(struct nandfs_checkpoint) - 1) /		\
+	sizeof(struct nandfs_checkpoint))
+
+
+#define NANDFS_NOSEGMENT        0xffffffff
+
+/*
+ * Structure of SU file.
+ *
+ * The segment usage file sums up how each of the segments are used. They are
+ * indexed by their segment number.
+ */
+
+/* Segment usage entry */
+struct nandfs_segment_usage {
+	uint64_t	su_lastmod;	/* last modified timestamp           */
+	uint32_t	su_nblocks;	/* number of blocks in segment       */
+	uint32_t	su_flags;	/* NANDFS_SEGMENT_USAGE_* flags       */
+};
+
+/* Segment usage flag */
+#define	NANDFS_SEGMENT_USAGE_ACTIVE	1
+#define	NANDFS_SEGMENT_USAGE_DIRTY	2
+#define	NANDFS_SEGMENT_USAGE_ERROR	4
+#define	NANDFS_SEGMENT_USAGE_GC		8
+#define	NANDFS_SEGMENT_USAGE_BITS	"\20\1ACTIVE\2DIRTY\3ERROR"
+
+/* Header of the segment usage file */
+struct nandfs_sufile_header {
+	uint64_t	sh_ncleansegs;	/* number of segments marked clean   */
+	uint64_t	sh_ndirtysegs;	/* number of segments marked dirty   */
+	uint64_t	sh_last_alloc;	/* last allocated segment number     */
+};
+
+#define	NANDFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET	\
+	((sizeof(struct nandfs_sufile_header) +		\
+	sizeof(struct nandfs_segment_usage) - 1) /	\
+	sizeof(struct nandfs_segment_usage))
+
+struct nandfs_seg_stat {
+	uint64_t	nss_nsegs;
+	uint64_t	nss_ncleansegs;
+	uint64_t	nss_ndirtysegs;
+	uint64_t	nss_ctime;
+	uint64_t	nss_nongc_ctime;
+	uint64_t	nss_prot_seq;
+};
+
+enum {
+	NANDFS_CHECKPOINT,
+	NANDFS_SNAPSHOT
+};
+
+#define	NANDFS_CPINFO_MAX		512
+
+struct nandfs_cpinfo {
+	uint32_t	nci_flags;
+	uint32_t	nci_pad;
+	uint64_t	nci_cno;
+	uint64_t	nci_create;
+	uint64_t	nci_nblk_inc;
+	uint64_t	nci_blocks_count;
+	uint64_t	nci_next;
+};
+
+#define	NANDFS_SEGMENTS_MAX	512
+
+struct nandfs_suinfo {
+	uint64_t	nsi_num;
+	uint64_t	nsi_lastmod;
+	uint32_t	nsi_blocks;
+	uint32_t	nsi_flags;
+};
+
+#define	NANDFS_VINFO_MAX	512
+
+struct nandfs_vinfo {
+	uint64_t	nvi_ino;
+	uint64_t	nvi_vblocknr;
+	uint64_t	nvi_start;
+	uint64_t	nvi_end;
+	uint64_t	nvi_blocknr;
+	int		nvi_alive;
+};
+
+struct nandfs_cpmode {
+	uint64_t	ncpm_cno;
+	uint32_t	ncpm_mode;
+	uint32_t	ncpm_pad;
+};
+
+struct nandfs_argv {
+	uint64_t	nv_base;
+	uint32_t	nv_nmembs;
+	uint16_t	nv_size;
+	uint16_t	nv_flags;
+	uint64_t	nv_index;
+};
+
+struct nandfs_cpstat {
+	uint64_t	ncp_cno;
+	uint64_t	ncp_ncps;
+	uint64_t	ncp_nss;
+};
+
+struct nandfs_period {
+	uint64_t	p_start;
+	uint64_t	p_end;
+};
+
+struct nandfs_vdesc {
+	uint64_t	vd_ino;
+	uint64_t	vd_cno;
+	uint64_t	vd_vblocknr;
+	struct nandfs_period	vd_period;
+	uint64_t	vd_blocknr;
+	uint64_t	vd_offset;
+	uint32_t	vd_flags;
+	uint32_t	vd_pad;
+};
+
+struct nandfs_bdesc {
+	uint64_t	bd_ino;
+	uint64_t	bd_oblocknr;
+	uint64_t	bd_blocknr;
+	uint64_t	bd_offset;
+	uint32_t	bd_level;
+	uint32_t	bd_alive;
+};
+
+#ifndef _KERNEL
+#ifndef	MNAMELEN
+#define	MNAMELEN	88
+#endif
+#endif
+
+struct nandfs_fsinfo {
+	struct nandfs_fsdata		fs_fsdata;
+	struct nandfs_super_block	fs_super;
+	char				fs_dev[MNAMELEN];
+};
+
+#define	NANDFS_MAX_MOUNTS	65535
+
+#define	NANDFS_IOCTL_GET_SUSTAT		_IOR('N', 100, struct nandfs_seg_stat)
+#define	NANDFS_IOCTL_CHANGE_CPMODE	_IOWR('N', 101, struct nandfs_cpmode)
+#define	NANDFS_IOCTL_GET_CPINFO		_IOWR('N', 102, struct nandfs_argv)
+#define	NANDFS_IOCTL_DELETE_CP		_IOWR('N', 103, uint64_t[2])
+#define	NANDFS_IOCTL_GET_CPSTAT		_IOR('N', 104, struct nandfs_cpstat)
+#define	NANDFS_IOCTL_GET_SUINFO		_IOWR('N', 105, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_VINFO		_IOWR('N', 106, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_BDESCS		_IOWR('N', 107, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_FSINFO		_IOR('N', 108, struct nandfs_fsinfo)
+#define	NANDFS_IOCTL_MAKE_SNAP		_IOWR('N', 109, uint64_t)
+#define	NANDFS_IOCTL_DELETE_SNAP	_IOWR('N', 110, uint64_t)
+#define	NANDFS_IOCTL_SYNC		_IOWR('N', 111, uint64_t)
+
+#endif /* _NANDFS_FS_H */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_ifile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_ifile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,213 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_ifile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate a new inode in the ifile and instantiate a nandfs_node for
+ * it.  'mode' seeds the new on-media inode via nandfs_inode_init().
+ * nd_last_ino + 1 is used as the allocation hint, and nd_last_ino is
+ * advanced to the entry actually allocated.  The ifile vnode is held
+ * exclusively locked across the allocation.  On success *node points
+ * at the new in-core node.  Returns 0 or an error from the helpers.
+ */
+int
+nandfs_node_create(struct nandfsmount *nmp, struct nandfs_node **node,
+    uint16_t mode)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_device *nandfsdev;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct nandfs_inode *inode;
+	struct vnode *vp;
+	uint32_t entry;
+	int error = 0;
+
+	nandfsdev = nmp->nm_nandfsdev;
+	mdt = &nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+	/* Allocate new inode in ifile */
+	req.entrynum = nandfsdev->nd_last_ino + 1;
+	error = nandfs_find_free_entry(mdt, ifile, &req);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	error = nandfs_get_entry_block(mdt, ifile, &req, &entry, 1);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	/* Inode initialization */
+	inode = ((struct nandfs_inode *) req.bp_entry->b_data) + entry;
+	nandfs_inode_init(inode, mode);
+
+	/* Presumably finalizes the reservation made by
+	 * nandfs_find_free_entry() and disposes of req.bp_entry --
+	 * NOTE(review): confirm ownership in nandfs_alloc_entry(). */
+	error = nandfs_alloc_entry(mdt, &req);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(vp, 0);
+
+	nandfsdev->nd_last_ino = req.entrynum;
+	error = nandfs_get_node(nmp, req.entrynum, node);
+	/* NOTE(review): if nandfs_get_node() failed, this DPRINTF still
+	 * dereferences *node (debug builds only) -- confirm safety. */
+	DPRINTF(IFILE, ("%s: node: %p ino: %#jx\n",
+	    __func__, node, (uintmax_t)((*node)->nn_ino)));
+
+	return (error);
+}
+
+/*
+ * Destroy the ifile bookkeeping of 'node': locate its allocation entry
+ * in the ifile, wipe the on-media inode, and free the entry so the
+ * inode number can be reused.  The ifile vnode is held exclusively
+ * locked for the duration.  Returns 0 or an error from the helpers.
+ */
+int
+nandfs_node_destroy(struct nandfs_node *node)
+{
+	struct nandfs_alloc_request req;
+	struct nandfsmount *nmp;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct vnode *vp;
+	int error = 0;
+
+	nmp = node->nn_nmp;
+	req.entrynum = node->nn_ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	DPRINTF(IFILE, ("%s: destroy node: %p ino: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino));
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+
+	error = nandfs_find_entry(mdt, ifile, &req);
+	if (error) {
+		/* %jx expects uintmax_t: cast nn_ino explicitly. */
+		nandfs_error("%s: finding entry error:%d node %p(%jx)",
+		    __func__, error, node, (uintmax_t)node->nn_ino);
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	nandfs_inode_destroy(&node->nn_inode);
+
+	error = nandfs_free_entry(mdt, &req);
+	if (error) {
+		/* Typo fix: "freing" -> "freeing". */
+		nandfs_error("%s: freeing entry error:%d node %p(%jx)",
+		    __func__, error, node, (uintmax_t)node->nn_ino);
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(vp, 0);
+	DPRINTF(IFILE, ("%s: freed node %p ino %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino));
+	return (error);
+}
+
+/*
+ * Flush the in-core inode of 'node' back into its entry slot in the
+ * ifile and mark the entry block dirty.  The ifile vnode must already
+ * be locked by the caller (asserted below).  Returns 0 or an error
+ * from the ifile metadata helpers.
+ */
+int
+nandfs_node_update(struct nandfs_node *node)
+{
+	struct nandfs_alloc_request req;
+	struct nandfsmount *nmp;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct nandfs_inode *inode;
+	uint32_t index;
+	int error = 0;
+
+	nmp = node->nn_nmp;
+	ifile = nmp->nm_ifile_node;
+	ASSERT_VOP_LOCKED(NTOV(ifile), __func__);
+
+	req.entrynum = node->nn_ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+
+	DPRINTF(IFILE, ("%s: node:%p ino:%#jx\n",
+	    __func__, &node->nn_inode, (uintmax_t)node->nn_ino));
+
+	error = nandfs_get_entry_block(mdt, ifile, &req, &index, 0);
+	if (error) {
+		/* NOTE(review): plain printf here, unlike the
+		 * nandfs_error() used by sibling functions. */
+		printf("nandfs_get_entry_block returned with ERROR=%d\n",
+		    error);
+		return (error);
+	}
+
+	inode = ((struct nandfs_inode *) req.bp_entry->b_data) + index;
+	memcpy(inode, &node->nn_inode, sizeof(*inode));
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes req.bp_entry -- confirm. */
+	error = nandfs_dirty_buf(req.bp_entry, 0);
+
+	return (error);
+}
+
+/*
+ * Fetch the ifile entry block that holds inode 'ino'.  On success
+ * *inode points at the nandfs_inode inside the buffer's data, and *bp
+ * is returned held: the caller is responsible for releasing *bp, and
+ * *inode is only valid until it does.  Returns 0 or an error from the
+ * ifile metadata helpers.
+ */
+int
+nandfs_get_node_entry(struct nandfsmount *nmp, struct nandfs_inode **inode,
+    uint64_t ino, struct buf **bp)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct vnode *vp;
+	uint32_t index;
+	int error = 0;
+
+	req.entrynum = ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+	error = nandfs_get_entry_block(mdt, ifile, &req, &index, 0);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	*inode = ((struct nandfs_inode *) req.bp_entry->b_data) + index;
+	*bp = req.bp_entry;
+	VOP_UNLOCK(vp, 0);
+	return (0);
+}
+
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_mount.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_mount.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *          This product includes software developed for the
+ *          NetBSD Project.  See http://www.NetBSD.org/ for
+ *          information about NetBSD.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_mount.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_mount.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_MOUNT_H_
+#define _FS_NANDFS_NANDFS_MOUNT_H_
+
+/*
+ * Arguments to mount a NANDFS filesystem.
+ */
+
+struct nandfs_args {
+	char		*fspec;		/* mount specifier                   */
+	int64_t		cpno;		/* checkpoint number                 */
+};
+
+#endif /* !_FS_NANDFS_NANDFS_MOUNT_H_ */
+
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_segment.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_segment.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1329 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_segment.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/libkern.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+static int
+nandfs_new_segment(struct nandfs_device *fsdev)
+{
+	int error = 0;
+	uint64_t new;
+
+	error = nandfs_alloc_segment(fsdev, &new);
+	if (!error) {
+		fsdev->nd_seg_num = fsdev->nd_next_seg_num;
+		fsdev->nd_next_seg_num = new;
+	}
+	DPRINTF(SYNC, ("%s: new segment %jx next %jx error %d\n",
+	    __func__, (uintmax_t)fsdev->nd_seg_num, (uintmax_t)new, error));
+	if (error)
+		nandfs_error("%s: cannot create segment error %d\n",
+		    __func__, error);
+
+	return (error);
+}
+
+static int
+create_segment(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *seg;
+	struct nandfs_device *fsdev;
+	struct nandfs_segment *prev;
+	struct buf *bp;
+	uint64_t start_block, curr;
+	uint32_t blks_per_seg, nblocks;
+	int error;
+
+	fsdev = seginfo->fsdev;
+	prev = seginfo->curseg;
+	blks_per_seg = fsdev->nd_fsdata.f_blocks_per_segment;
+	nblocks = fsdev->nd_last_segsum.ss_nblocks;
+
+	if (!prev) {
+		vfs_timestamp(&fsdev->nd_ts);
+		/* Touch current segment */
+		error = nandfs_touch_segment(fsdev, fsdev->nd_seg_num);
+		if (error) {
+			nandfs_error("%s: cannot preallocate segment %jx\n",
+			    __func__, fsdev->nd_seg_num);
+			return (error);
+		}
+		error = nandfs_touch_segment(fsdev, 0);
+		if (error) {
+			nandfs_error("%s: cannot dirty block with segment 0\n",
+			    __func__);
+			return (error);
+		}
+		start_block = fsdev->nd_last_pseg + (uint64_t)nblocks;
+		/*
+		 * XXX Hack
+		 */
+		if (blks_per_seg - (start_block % blks_per_seg) - 1 == 0)
+			start_block++;
+		curr = nandfs_get_segnum_of_block(fsdev, start_block);
+		/* Allocate new segment if last one is full */
+		if (fsdev->nd_seg_num != curr) {
+			error = nandfs_new_segment(fsdev);
+			if (error) {
+				nandfs_error("%s: cannot create new segment\n",
+				    __func__);
+				return (error);
+			}
+			/*
+			 * XXX Hack
+			 */
+			nandfs_get_segment_range(fsdev, fsdev->nd_seg_num, &start_block, NULL);
+		}
+	} else {
+		nandfs_get_segment_range(fsdev, fsdev->nd_next_seg_num,
+		    &start_block, NULL);
+
+		/* Touch current segment and allocate and touch new one */
+		error = nandfs_new_segment(fsdev);
+		if (error) {
+			nandfs_error("%s: cannot create next segment\n",
+			    __func__);
+			return (error);
+		}
+
+		/* Reiterate in case new buf is dirty */
+		seginfo->reiterate = 1;
+	}
+
+	/* Allocate and initialize nandfs_segment structure */
+	seg = malloc(sizeof(*seg), M_DEVBUF, M_WAITOK|M_ZERO);
+	TAILQ_INIT(&seg->segsum);
+	TAILQ_INIT(&seg->data);
+	seg->fsdev = fsdev;
+	seg->start_block = start_block;
+	seg->num_blocks = blks_per_seg - (start_block % blks_per_seg) - 1;
+	seg->seg_num = fsdev->nd_seg_num;
+	seg->seg_next = fsdev->nd_next_seg_num;
+	seg->segsum_blocks = 1;
+	seg->bytes_left = fsdev->nd_blocksize -
+	    sizeof(struct nandfs_segment_summary);
+	seg->segsum_bytes = sizeof(struct nandfs_segment_summary);
+
+	/* Allocate buffer for segment summary */
+	bp = getblk(fsdev->nd_devvp, nandfs_block_to_dblock(fsdev,
+	    seg->start_block), fsdev->nd_blocksize, 0, 0, 0);
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	/* Add buffer to segment */
+	TAILQ_INSERT_TAIL(&seg->segsum, bp, b_cluster.cluster_entry);
+	seg->current_off = bp->b_data + sizeof(struct nandfs_segment_summary);
+
+	DPRINTF(SYNC, ("%s: seg %p : initial settings: start %#jx size :%#x\n",
+	    __func__, seg, (uintmax_t)seg->start_block, seg->num_blocks));
+	DPRINTF(SYNC, ("%s: seg->seg_num %#jx cno %#jx next %#jx\n", __func__,
+	    (uintmax_t)seg->seg_num, (uintmax_t)(fsdev->nd_last_cno + 1),
+	    (uintmax_t)seg->seg_next));
+
+	if (!prev)
+		LIST_INSERT_HEAD(&seginfo->seg_list, seg, seg_link);
+	else
+		LIST_INSERT_AFTER(prev, seg, seg_link);
+
+	seginfo->curseg = seg;
+
+	return (0);
+}
+
+static int
+delete_segment(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *seg, *tseg;
+	struct buf *bp, *tbp;
+
+	LIST_FOREACH_SAFE(seg, &seginfo->seg_list, seg_link, tseg) {
+		TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry,
+		    tbp) {
+			TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+			bp->b_flags &= ~B_MANAGED;
+			brelse(bp);
+		};
+
+		LIST_REMOVE(seg, seg_link);
+		free(seg, M_DEVBUF);
+	}
+
+	return (0);
+}
+
+static int
+create_seginfo(struct nandfs_device *fsdev, struct nandfs_seginfo **seginfo)
+{
+	struct nandfs_seginfo *info;
+
+	info = malloc(sizeof(*info), M_DEVBUF, M_WAITOK);
+
+	LIST_INIT(&info->seg_list);
+	info->fsdev = fsdev;
+	info->curseg = NULL;
+	info->blocks = 0;
+	*seginfo = info;
+	fsdev->nd_seginfo = info;
+	return (0);
+}
+
+static int
+delete_seginfo(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_device *nffsdev;
+
+	nffsdev = seginfo->fsdev;
+	delete_segment(seginfo);
+	nffsdev->nd_seginfo = NULL;
+	free(seginfo, M_DEVBUF);
+
+	return (0);
+}
+
+static int
+nandfs_create_superroot_block(struct nandfs_seginfo *seginfo,
+    struct buf **newbp)
+{
+	struct buf *bp;
+	int error;
+
+	bp = nandfs_geteblk(seginfo->fsdev->nd_blocksize, GB_NOWAIT_BD);
+
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+		error = create_segment(seginfo);
+		if (error) {
+			brelse(bp);
+			nandfs_error("%s: no segment for superroot\n",
+			    __func__);
+			return (error);
+		}
+	}
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->data, bp, b_cluster.cluster_entry);
+
+	seginfo->curseg->nblocks++;
+	seginfo->curseg->num_blocks--;
+	seginfo->blocks++;
+
+	*newbp = bp;
+	return (0);
+}
+
+static int
+nandfs_add_superroot(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_super_root *sr;
+	struct buf *bp = NULL;
+	uint64_t crc_skip;
+	uint32_t crc_calc;
+	int error;
+
+	fsdev = seginfo->fsdev;
+
+	error = nandfs_create_superroot_block(seginfo, &bp);
+	if (error) {
+		nandfs_error("%s: cannot add superroot\n", __func__);
+		return (error);
+	}
+
+	sr = (struct nandfs_super_root *)bp->b_data;
+	/* Save superroot CRC */
+	sr->sr_bytes = NANDFS_SR_BYTES;
+	sr->sr_flags = 0;
+	sr->sr_nongc_ctime = 0;
+
+	memcpy(&sr->sr_dat, &fsdev->nd_dat_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+	memcpy(&sr->sr_cpfile, &fsdev->nd_cp_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+	memcpy(&sr->sr_sufile, &fsdev->nd_su_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+
+	crc_skip = sizeof(sr->sr_sum);
+	crc_calc = crc32((uint8_t *)sr + crc_skip, NANDFS_SR_BYTES - crc_skip);
+
+	sr->sr_sum = crc_calc;
+
+	bp->b_flags |= B_MANAGED;
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+
+	bp->b_flags &= ~B_INVAL;
+	nandfs_dirty_bufs_increment(fsdev);
+	DPRINTF(SYNC, ("%s: bp:%p\n", __func__, bp));
+
+	return (0);
+}
+
+static int
+nandfs_add_segsum_block(struct nandfs_seginfo *seginfo, struct buf **newbp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t blk;
+	struct buf *bp;
+	int error;
+
+	if (!(seginfo->curseg) || seginfo->curseg->num_blocks <= 1) {
+		error = create_segment(seginfo);
+		if (error) {
+			nandfs_error("%s: error:%d when creating segment\n",
+			    __func__, error);
+			return (error);
+		}
+		*newbp = TAILQ_FIRST(&seginfo->curseg->segsum);
+		return (0);
+	}
+
+	fsdev = seginfo->fsdev;
+	blk = nandfs_block_to_dblock(fsdev, seginfo->curseg->start_block +
+	    seginfo->curseg->segsum_blocks);
+
+	bp = getblk(fsdev->nd_devvp, blk, fsdev->nd_blocksize, 0, 0, 0);
+
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->segsum, bp,
+	    b_cluster.cluster_entry);
+	seginfo->curseg->num_blocks--;
+
+	seginfo->curseg->segsum_blocks++;
+	seginfo->curseg->bytes_left = seginfo->fsdev->nd_blocksize;
+	seginfo->curseg->current_off = bp->b_data;
+	seginfo->blocks++;
+
+	*newbp = bp;
+
+	DPRINTF(SYNC, ("%s: bp %p\n", __func__, bp));
+
+	return (0);
+}
+
+static int
+nandfs_add_blocks(struct nandfs_seginfo *seginfo, struct nandfs_node *node,
+    struct buf *bp)
+{
+	union nandfs_binfo *binfo;
+	struct buf *seg_bp;
+	int error;
+
+	if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+		error = create_segment(seginfo);
+		if (error) {
+			nandfs_error("%s: error:%d when creating segment\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+
+	if (seginfo->curseg->bytes_left < sizeof(union nandfs_binfo)) {
+		error = nandfs_add_segsum_block(seginfo, &seg_bp);
+		if (error) {
+			nandfs_error("%s: error:%d when adding segsum\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+	binfo = (union nandfs_binfo *)seginfo->curseg->current_off;
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		binfo->bi_v.bi_blkoff = bp->b_lblkno;
+		binfo->bi_v.bi_ino = node->nn_ino;
+	} else {
+		binfo->bi_dat.bi_blkoff = bp->b_lblkno;
+		binfo->bi_dat.bi_ino = node->nn_ino;
+		if (NANDFS_IS_INDIRECT(bp))
+			binfo->bi_dat.bi_level = 1;
+		else
+			binfo->bi_dat.bi_level = 0;
+	}
+	binfo++;
+
+	seginfo->curseg->bytes_left -= sizeof(union nandfs_binfo);
+	seginfo->curseg->segsum_bytes += sizeof(union nandfs_binfo);
+	seginfo->curseg->current_off = (char *)binfo;
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->data, bp, b_cluster.cluster_entry);
+
+	seginfo->curseg->nbinfos++;
+	seginfo->curseg->nblocks++;
+	seginfo->curseg->num_blocks--;
+	seginfo->blocks++;
+
+	DPRINTF(SYNC, ("%s: bp (%p) number %x (left %x)\n",
+	    __func__, bp, seginfo->curseg->nblocks,
+	    seginfo->curseg->num_blocks));
+	return (0);
+}
+
+static int
+nandfs_iterate_dirty_buf(struct vnode *vp, struct nandfs_seginfo *seginfo,
+    uint8_t hold)
+{
+	struct buf *bp, *tbd;
+	struct bufobj *bo;
+	struct nandfs_node *node;
+	int error;
+
+	node = VTON(vp);
+	bo = &vp->v_bufobj;
+
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/* Iterate dirty data bufs */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, tbd) {
+		DPRINTF(SYNC, ("%s: vp (%p): bp (%p) with lblkno %jx ino %jx "
+		    "add buf\n", __func__, vp, bp, bp->b_lblkno, node->nn_ino));
+
+		if (!(NANDFS_ISGATHERED(bp))) {
+			error = nandfs_bmap_update_dat(node,
+			    nandfs_vblk_get(bp), bp);
+			if (error)
+				return (error);
+			NANDFS_GATHER(bp);
+			nandfs_add_blocks(seginfo, node, bp);
+		}
+	}
+
+	return (0);
+}
+
+static int
+nandfs_iterate_system_vnode(struct nandfs_node *node,
+    struct nandfs_seginfo *seginfo)
+{
+	struct vnode *vp;
+	int nblocks;
+	uint8_t hold = 0;
+
+	if (node->nn_ino != NANDFS_IFILE_INO)
+		hold = 1;
+
+	vp = NTOV(node);
+
+	nblocks = vp->v_bufobj.bo_dirty.bv_cnt;
+	DPRINTF(SYNC, ("%s: vp (%p): nblocks %x ino %jx\n",
+	    __func__, vp, nblocks, node->nn_ino));
+
+	if (nblocks)
+		nandfs_iterate_dirty_buf(vp, seginfo, hold);
+
+	return (0);
+}
+
+static int
+nandfs_iterate_dirty_vnodes(struct mount *mp, struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_node *nandfs_node;
+	struct vnode *vp, *mvp;
+	struct thread *td;
+	int error, lockreq, update;
+
+	td = curthread;
+	lockreq = LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY;
+
+	MNT_ILOCK(mp);
+
+	MNT_VNODE_FOREACH(vp, mp, mvp) {
+		update = 0;
+
+		if (mp->mnt_syncer == vp)
+			continue;
+		if (VOP_ISLOCKED(vp))
+			continue;
+
+		VI_LOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (vp->v_iflag & VI_DOOMED) {
+			VI_UNLOCK(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		if ((error = vget(vp, lockreq, td)) != 0) {
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		if (vp->v_iflag & VI_DOOMED) {
+			vput(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		nandfs_node = VTON(vp);
+		if (nandfs_node->nn_flags & IN_MODIFIED) {
+			nandfs_node->nn_flags &= ~(IN_MODIFIED);
+			update = 1;
+		}
+
+		if (vp->v_bufobj.bo_dirty.bv_cnt) {
+			error = nandfs_iterate_dirty_buf(vp, seginfo, 0);
+			if (error) {
+				nandfs_error("%s: cannot iterate vnode:%p "
+				    "err:%d\n", __func__, vp, error);
+				vput(vp);
+				return (error);
+			}
+			update = 1;
+		} else
+			vput(vp);
+
+		if (update)
+			nandfs_node_update(nandfs_node);
+
+		MNT_ILOCK(mp);
+	}
+
+	MNT_IUNLOCK(mp);
+
+	return (0);
+}
+
+static int
+nandfs_update_phys_block(struct nandfs_device *fsdev, struct buf *bp,
+    uint64_t phys_blknr, union nandfs_binfo *binfo)
+{
+	struct nandfs_node *node, *dat;
+	struct vnode *vp;
+	uint64_t new_blknr;
+	int error;
+
+	vp = bp->b_vp;
+	node = VTON(vp);
+	new_blknr = nandfs_vblk_get(bp);
+	dat = fsdev->nd_dat_node;
+
+	DPRINTF(BMAP, ("%s: ino %#jx lblk %#jx: vblk %#jx -> %#jx\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)bp->b_lblkno,
+	    (uintmax_t)new_blknr, (uintmax_t)phys_blknr));
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		KASSERT((new_blknr != 0), ("vblk for bp %p is 0", bp));
+
+		nandfs_vblock_assign(fsdev, new_blknr, phys_blknr);
+		binfo->bi_v.bi_vblocknr = new_blknr;
+		binfo->bi_v.bi_blkoff = bp->b_lblkno;
+		binfo->bi_v.bi_ino = node->nn_ino;
+	} else {
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = nandfs_bmap_update_block(node, bp, phys_blknr);
+		if (error) {
+			nandfs_error("%s: error updating block:%jx for bp:%p\n",
+			    __func__, (uintmax_t)phys_blknr, bp);
+			VOP_UNLOCK(NTOV(dat), 0);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+		binfo->bi_dat.bi_blkoff = bp->b_lblkno;
+		binfo->bi_dat.bi_ino = node->nn_ino;
+		if (NANDFS_IS_INDIRECT(bp))
+			binfo->bi_dat.bi_level = 1;
+		else
+			binfo->bi_dat.bi_level = 0;
+	}
+
+	return (0);
+}
+
+#define	NBINFO(off) ((off) + sizeof(union nandfs_binfo))
+static int
+nandfs_segment_assign_pblk(struct nandfs_segment *nfsseg)
+{
+	struct nandfs_device *fsdev;
+	union nandfs_binfo *binfo;
+	struct buf *bp, *seg_bp;
+	uint64_t blocknr;
+	uint32_t curr_off, blocksize;
+	int error;
+
+	fsdev = nfsseg->fsdev;
+	blocksize = fsdev->nd_blocksize;
+
+	blocknr = nfsseg->start_block + nfsseg->segsum_blocks;
+	seg_bp = TAILQ_FIRST(&nfsseg->segsum);
+	DPRINTF(SYNC, ("%s: seg:%p segsum bp:%p data:%p\n",
+	    __func__, nfsseg, seg_bp, seg_bp->b_data));
+
+	binfo = (union nandfs_binfo *)(seg_bp->b_data +
+	    sizeof(struct nandfs_segment_summary));
+	curr_off = sizeof(struct nandfs_segment_summary);
+
+	TAILQ_FOREACH(bp, &nfsseg->data, b_cluster.cluster_entry) {
+		KASSERT((bp->b_vp), ("bp %p has no vp", bp));
+
+		DPRINTF(BMAP, ("\n\n%s: assign buf %p for ino %#jx next %p\n",
+		    __func__, bp, (uintmax_t)VTON(bp->b_vp)->nn_ino,
+		    TAILQ_NEXT(bp, b_cluster.cluster_entry)));
+
+		if (NBINFO(curr_off) > blocksize) {
+			seg_bp = TAILQ_NEXT(seg_bp, b_cluster.cluster_entry);
+			binfo = (union nandfs_binfo *)seg_bp->b_data;
+			curr_off = 0;
+			DPRINTF(SYNC, ("%s: next segsum %p data %p\n",
+			    __func__, seg_bp, seg_bp->b_data));
+		}
+
+		error = nandfs_update_phys_block(fsdev, bp, blocknr, binfo);
+		if (error) {
+			nandfs_error("%s: err:%d when updating phys block:%jx"
+			    " for bp:%p and binfo:%p\n", __func__, error,
+			    (uintmax_t)blocknr, bp, binfo);
+			return (error);
+		}
+		binfo++;
+		curr_off = NBINFO(curr_off);
+
+		blocknr++;
+	}
+
+	return (0);
+}
+
+static int
+nandfs_seginfo_assign_pblk(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *nfsseg;
+	int error = 0;
+
+	LIST_FOREACH(nfsseg, &seginfo->seg_list, seg_link) {
+		error = nandfs_segment_assign_pblk(nfsseg);
+		if (error)
+			break;
+	}
+
+	return (error);
+}
+
+static struct nandfs_segment_summary *
+nandfs_fill_segsum(struct nandfs_segment *seg, int has_sr)
+{
+	struct nandfs_segment_summary *ss;
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint32_t rest, segsum_size, blocksize, crc_calc;
+	uint16_t flags;
+	uint8_t *crc_area, crc_skip;
+
+	DPRINTF(SYNC, ("%s: seg %#jx nblocks %#x sumbytes %#x\n",
+	    __func__, (uintmax_t) seg->seg_num,
+	    seg->nblocks + seg->segsum_blocks,
+	    seg->segsum_bytes));
+
+	fsdev = seg->fsdev;
+
+	flags = NANDFS_SS_LOGBGN | NANDFS_SS_LOGEND;
+	if (has_sr)
+		flags |= NANDFS_SS_SR;
+
+	bp = TAILQ_FIRST(&seg->segsum);
+	ss = (struct nandfs_segment_summary *) bp->b_data;
+	ss->ss_magic = NANDFS_SEGSUM_MAGIC;
+	ss->ss_bytes = sizeof(struct nandfs_segment_summary);
+	ss->ss_flags = flags;
+	ss->ss_seq = ++(fsdev->nd_seg_sequence);
+	ss->ss_create = fsdev->nd_ts.tv_sec;
+	nandfs_get_segment_range(fsdev, seg->seg_next, &ss->ss_next, NULL);
+	ss->ss_nblocks = seg->nblocks + seg->segsum_blocks;
+	ss->ss_nbinfos = seg->nbinfos;
+	ss->ss_sumbytes = seg->segsum_bytes;
+
+	crc_skip = sizeof(ss->ss_datasum) + sizeof(ss->ss_sumsum);
+	blocksize = seg->fsdev->nd_blocksize;
+
+	segsum_size = seg->segsum_bytes - crc_skip;
+	rest = min(seg->segsum_bytes, blocksize) - crc_skip;
+	crc_area = (uint8_t *)ss + crc_skip;
+	crc_calc = ~0U;
+	while (segsum_size > 0) {
+		crc_calc = crc32_raw(crc_area, rest, crc_calc);
+		segsum_size -= rest;
+		if (!segsum_size)
+			break;
+		bp = TAILQ_NEXT(bp, b_cluster.cluster_entry);
+		crc_area = (uint8_t *)bp->b_data;
+		rest = segsum_size <= blocksize ? segsum_size : blocksize;
+	}
+	ss->ss_sumsum = crc_calc ^ ~0U;
+
+	return (ss);
+
+}
+
+static int
+nandfs_save_buf(struct buf *bp, uint64_t blocknr, struct nandfs_device *fsdev)
+{
+	struct bufobj *bo;
+	int error;
+
+	bo = &fsdev->nd_devvp->v_bufobj;
+
+	bp->b_blkno = nandfs_block_to_dblock(fsdev, blocknr);
+	bp->b_iooffset = dbtob(bp->b_blkno);
+
+	KASSERT(bp->b_bufobj != NULL, ("no bufobj for %p", bp));
+	if (bp->b_bufobj != bo) {
+		BO_LOCK(bp->b_bufobj);
+		BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
+		    BO_MTX(bp->b_bufobj));
+		KASSERT(BUF_ISLOCKED(bp), ("Problem with locking buffer"));
+	}
+
+	DPRINTF(SYNC, ("%s: buf: %p offset %#jx blk %#jx size %#x\n",
+	    __func__, bp, (uintmax_t)bp->b_offset, (uintmax_t)blocknr,
+	    fsdev->nd_blocksize));
+
+	NANDFS_UNGATHER(bp);
+	nandfs_buf_clear(bp, 0xffffffff);
+	bp->b_flags &= ~(B_ASYNC|B_INVAL|B_MANAGED);
+	error = bwrite(bp);
+	if (error) {
+		nandfs_error("%s: error:%d when writing buffer:%p\n",
+		    __func__, error, bp);
+		return (error);
+	}
+	return (error);
+}
+
+static void
+nandfs_clean_buf(struct nandfs_device *fsdev, struct buf *bp)
+{
+
+	DPRINTF(SYNC, ("%s: buf: %p\n", __func__, bp));
+
+	NANDFS_UNGATHER(bp);
+	nandfs_buf_clear(bp, 0xffffffff);
+	bp->b_flags &= ~(B_ASYNC|B_INVAL|B_MANAGED);
+	nandfs_undirty_buf_fsdev(fsdev, bp);
+}
+
+static void
+nandfs_clean_segblocks(struct nandfs_segment *seg, uint8_t unlock)
+{
+	struct nandfs_device *fsdev = seg->fsdev;
+	struct nandfs_segment *next_seg;
+	struct buf *bp, *tbp, *next_bp;
+	struct vnode *vp, *next_vp;
+
+	VOP_LOCK(fsdev->nd_devvp, LK_EXCLUSIVE);
+	TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+		nandfs_clean_buf(fsdev, bp);
+	};
+
+	TAILQ_FOREACH_SAFE(bp, &seg->data, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->data, bp, b_cluster.cluster_entry);
+
+		/*
+		 * If bp is not super-root and vnode is not currently
+		 * locked lock it.
+		 */
+		vp = bp->b_vp;
+		next_vp = NULL;
+		next_bp = TAILQ_NEXT(bp,  b_cluster.cluster_entry);
+		if (!next_bp) {
+			next_seg = LIST_NEXT(seg, seg_link);
+			if (next_seg)
+				next_bp = TAILQ_FIRST(&next_seg->data);
+		}
+
+		if (next_bp)
+			next_vp = next_bp->b_vp;
+
+		nandfs_clean_buf(fsdev, bp);
+
+		if (unlock && vp != NULL && next_vp != vp &&
+		    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+			vput(vp);
+
+		nandfs_dirty_bufs_decrement(fsdev);
+	}
+
+	VOP_UNLOCK(fsdev->nd_devvp, 0);
+}
+
+static int
+nandfs_save_segblocks(struct nandfs_segment *seg, uint8_t unlock)
+{
+	struct nandfs_device *fsdev = seg->fsdev;
+	struct nandfs_segment *next_seg;
+	struct buf *bp, *tbp, *next_bp;
+	struct vnode *vp, *next_vp;
+	uint64_t blocknr;
+	uint32_t i = 0;
+	int error = 0;
+
+	VOP_LOCK(fsdev->nd_devvp, LK_EXCLUSIVE);
+	TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+		blocknr = seg->start_block + i;
+		error = nandfs_save_buf(bp, blocknr, fsdev);
+		if (error) {
+			nandfs_error("%s: error saving buf: %p blocknr:%jx\n",
+			    __func__, bp, (uintmax_t)blocknr);
+			goto out;
+		}
+		i++;
+	};
+
+	i = 0;
+	TAILQ_FOREACH_SAFE(bp, &seg->data, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->data, bp, b_cluster.cluster_entry);
+
+		blocknr = seg->start_block + seg->segsum_blocks + i;
+		/*
+		 * If bp is not super-root and vnode is not currently
+		 * locked lock it.
+		 */
+		vp = bp->b_vp;
+		next_vp = NULL;
+		next_bp = TAILQ_NEXT(bp,  b_cluster.cluster_entry);
+		if (!next_bp) {
+			next_seg = LIST_NEXT(seg, seg_link);
+			if (next_seg)
+				next_bp = TAILQ_FIRST(&next_seg->data);
+		}
+
+		if (next_bp)
+			next_vp = next_bp->b_vp;
+
+		error = nandfs_save_buf(bp, blocknr, fsdev);
+		if (error) {
+			nandfs_error("%s: error saving buf: %p blknr: %jx\n",
+			    __func__, bp, (uintmax_t)blocknr);
+			if (unlock && vp != NULL && next_vp != vp &&
+			    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+				vput(vp);
+			goto out;
+		}
+
+		if (unlock && vp != NULL && next_vp != vp &&
+		    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+			vput(vp);
+
+		i++;
+		nandfs_dirty_bufs_decrement(fsdev);
+	}
+out:
+	if (error) {
+		nandfs_clean_segblocks(seg, unlock);
+		VOP_UNLOCK(fsdev->nd_devvp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(fsdev->nd_devvp, 0);
+	return (error);
+}
+
+
+static void
+clean_seginfo(struct nandfs_seginfo *seginfo, uint8_t unlock)
+{
+	struct nandfs_segment *seg;
+
+	DPRINTF(SYNC, ("%s: seginfo %p\n", __func__, seginfo));
+
+	LIST_FOREACH(seg, &seginfo->seg_list, seg_link) {
+		nandfs_clean_segblocks(seg, unlock);
+	}
+}
+
+static int
+save_seginfo(struct nandfs_seginfo *seginfo, uint8_t unlock)
+{
+	struct nandfs_segment *seg;
+	struct nandfs_device *fsdev;
+	struct nandfs_segment_summary *ss;
+	int error = 0;
+
+	fsdev = seginfo->fsdev;
+
+	DPRINTF(SYNC, ("%s: seginfo %p\n", __func__, seginfo));
+
+	LIST_FOREACH(seg, &seginfo->seg_list, seg_link) {
+		if (LIST_NEXT(seg, seg_link)) {
+			nandfs_fill_segsum(seg, 0);
+			error = nandfs_save_segblocks(seg, unlock);
+			if (error) {
+				nandfs_error("%s: error:%d saving seg:%p\n",
+				    __func__, error, seg);
+				goto out;
+			}
+		} else {
+			ss = nandfs_fill_segsum(seg, 1);
+			fsdev->nd_last_segsum = *ss;
+			error = nandfs_save_segblocks(seg, unlock);
+			if (error) {
+				nandfs_error("%s: error:%d saving seg:%p\n",
+				    __func__, error, seg);
+				goto out;
+			}
+			fsdev->nd_last_cno++;
+			fsdev->nd_last_pseg = seg->start_block;
+		}
+	}
+out:
+	if (error)
+		clean_seginfo(seginfo, unlock);
+	return (error);
+}
+
+static void
+nandfs_invalidate_bufs(struct nandfs_device *fsdev, uint64_t segno)
+{
+	uint64_t start, end;
+	struct buf *bp, *tbd;
+	struct bufobj *bo;
+
+	nandfs_get_segment_range(fsdev, segno, &start, &end);
+
+	bo = &NTOV(fsdev->nd_gc_node)->v_bufobj;
+
+	BO_LOCK(bo);
+restart_locked_gc:
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, tbd) {
+		if (!(bp->b_lblkno >= start && bp->b_lblkno <= end))
+			continue;
+
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
+			goto restart_locked_gc;
+
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		BO_UNLOCK(bo);
+		brelse(bp);
+		BO_LOCK(bo);
+	}
+	BO_UNLOCK(bo);
+}
+
+/* Process segments marks to free by cleaner */
+static void
+nandfs_process_segments(struct nandfs_device *fsdev)
+{
+	uint64_t saved_segment;
+	int i;
+
+	if (fsdev->nd_free_base) {
+		saved_segment = nandfs_get_segnum_of_block(fsdev,
+		    fsdev->nd_super.s_last_pseg);
+		for (i = 0; i < fsdev->nd_free_count; i++) {
+			if (fsdev->nd_free_base[i] == NANDFS_NOSEGMENT)
+				continue;
+			/* Update superblock if clearing the segment it points to */
+			if (fsdev->nd_free_base[i] == saved_segment) {
+				nandfs_write_superblock(fsdev);
+				saved_segment = nandfs_get_segnum_of_block(
+				    fsdev, fsdev->nd_super.s_last_pseg);
+			}
+			nandfs_invalidate_bufs(fsdev, fsdev->nd_free_base[i]);
+			nandfs_clear_segment(fsdev, fsdev->nd_free_base[i]);
+		}
+
+		free(fsdev->nd_free_base, M_NANDFSTEMP);
+		fsdev->nd_free_base = NULL;
+		fsdev->nd_free_count = 0;
+	}
+}
+
+/* Collect and write dirty buffers */
+int
+nandfs_sync_file(struct vnode *vp)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_node *nandfs_node;
+	struct nandfsmount *nmp;
+	struct nandfs_node *dat, *su, *ifile, *cp;
+	struct nandfs_seginfo *seginfo = NULL;
+	struct nandfs_segment *seg;
+	int update, error;
+	int cno_changed;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	DPRINTF(SYNC, ("%s: START\n", __func__));
+
+	error = 0;
+	nmp = VFSTONANDFS(vp->v_mount);
+	fsdev = nmp->nm_nandfsdev;
+
+	dat = fsdev->nd_dat_node;
+	su = fsdev->nd_su_node;
+	cp = fsdev->nd_cp_node;
+	ifile = nmp->nm_ifile_node;
+
+	NANDFS_WRITEASSERT(fsdev);
+	if (lockmgr(&fsdev->nd_seg_const, LK_UPGRADE, NULL) != 0) {
+		DPRINTF(SYNC, ("%s: lost shared lock\n", __func__));
+		if (lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL) != 0)
+			panic("couldn't lock exclusive");
+	}
+	DPRINTF(SYNC, ("%s: got lock\n", __func__));
+
+	VOP_LOCK(NTOV(su), LK_EXCLUSIVE);
+	create_seginfo(fsdev, &seginfo);
+
+	update = 0;
+
+	nandfs_node = VTON(vp);
+	if (nandfs_node->nn_flags & IN_MODIFIED) {
+		nandfs_node->nn_flags &= ~(IN_MODIFIED);
+		update = 1;
+	}
+
+	if (vp->v_bufobj.bo_dirty.bv_cnt) {
+		error = nandfs_iterate_dirty_buf(vp, seginfo, 0);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d iterating dirty bufs vp:%p",
+			    __func__, error, vp);
+			return (error);
+		}
+		update = 1;
+	}
+
+	if (update) {
+		VOP_LOCK(NTOV(ifile), LK_EXCLUSIVE);
+		error = nandfs_node_update(nandfs_node);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(ifile), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d updating vp:%p",
+			    __func__, error, vp);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(ifile), 0);
+	}
+
+	cno_changed = 0;
+	if (seginfo->blocks) {
+		VOP_LOCK(NTOV(cp), LK_EXCLUSIVE);
+		cno_changed = 1;
+		/* Create new checkpoint */
+		error = nandfs_get_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(cp), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d getting cp:%jx",
+			    __func__, error, fsdev->nd_last_cno + 1);
+			return (error);
+		}
+
+		/* Reiterate all blocks and assign physical block number */
+		nandfs_seginfo_assign_pblk(seginfo);
+
+		/* Fill checkpoint data */
+		error = nandfs_set_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1,
+		    &ifile->nn_inode, seginfo->blocks);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(cp), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d setting cp:%jx",
+			    __func__, error, fsdev->nd_last_cno + 1);
+			return (error);
+		}
+
+		VOP_UNLOCK(NTOV(cp), 0);
+		LIST_FOREACH(seg, &seginfo->seg_list, seg_link)
+			nandfs_update_segment(fsdev, seg->seg_num,
+			    seg->nblocks + seg->segsum_blocks);
+
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = save_seginfo(seginfo, 0);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(dat), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d updating seg",
+			    __func__, error);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+	}
+
+	VOP_UNLOCK(NTOV(su), 0);
+
+	delete_seginfo(seginfo);
+	lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+
+	if (cno_changed && !error) {
+		if (nandfs_cps_between_sblocks != 0 &&
+		    fsdev->nd_last_cno % nandfs_cps_between_sblocks == 0)
+			nandfs_write_superblock(fsdev);
+	}
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	DPRINTF(SYNC, ("%s: END error %d\n", __func__, error));
+	return (error);
+}
+
+int
+nandfs_segment_constructor(struct nandfsmount *nmp, int flags)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_seginfo *seginfo = NULL;
+	struct nandfs_segment *seg;
+	struct nandfs_node *dat, *su, *ifile, *cp, *gc;
+	int cno_changed, error;
+
+	DPRINTF(SYNC, ("%s: START\n", __func__));
+	fsdev = nmp->nm_nandfsdev;
+
+	lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL);
+	DPRINTF(SYNC, ("%s: got lock\n", __func__));
+again:
+	create_seginfo(fsdev, &seginfo);
+
+	dat = fsdev->nd_dat_node;
+	su = fsdev->nd_su_node;
+	cp = fsdev->nd_cp_node;
+	gc = fsdev->nd_gc_node;
+	ifile = nmp->nm_ifile_node;
+
+	VOP_LOCK(NTOV(su), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(ifile), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(gc), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(cp), LK_EXCLUSIVE);
+
+	nandfs_iterate_system_vnode(gc, seginfo);
+	nandfs_iterate_dirty_vnodes(nmp->nm_vfs_mountp, seginfo);
+	nandfs_iterate_system_vnode(ifile, seginfo);
+	nandfs_iterate_system_vnode(su, seginfo);
+
+	cno_changed = 0;
+	if (seginfo->blocks || flags) {
+		cno_changed = 1;
+		/* Create new checkpoint */
+		error = nandfs_get_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			goto error_locks;
+		}
+
+		/* Collect blocks from system files */
+		nandfs_iterate_system_vnode(cp, seginfo);
+		nandfs_iterate_system_vnode(su, seginfo);
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		nandfs_iterate_system_vnode(dat, seginfo);
+		VOP_UNLOCK(NTOV(dat), 0);
+reiterate:
+		seginfo->reiterate = 0;
+		nandfs_iterate_system_vnode(su, seginfo);
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		nandfs_iterate_system_vnode(dat, seginfo);
+		VOP_UNLOCK(NTOV(dat), 0);
+		if (seginfo->reiterate)
+			goto reiterate;
+		if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+			error = create_segment(seginfo);
+			if (error) {
+				clean_seginfo(seginfo, 0);
+				delete_seginfo(seginfo);
+				goto error_locks;
+			}
+			goto reiterate;
+		}
+
+		/* Reiterate all blocks and assign physical block number */
+		nandfs_seginfo_assign_pblk(seginfo);
+
+		/* Fill superroot */
+		error = nandfs_add_superroot(seginfo);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			goto error_locks;
+		}
+		KASSERT(!(seginfo->reiterate), ("reiteration after superroot"));
+
+		/* Fill checkpoint data */
+		nandfs_set_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1,
+		    &ifile->nn_inode, seginfo->blocks);
+
+		LIST_FOREACH(seg, &seginfo->seg_list, seg_link)
+			nandfs_update_segment(fsdev, seg->seg_num,
+			    seg->nblocks + seg->segsum_blocks);
+
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = save_seginfo(seginfo, 1);
+		if (error) {
+			clean_seginfo(seginfo, 1);
+			delete_seginfo(seginfo);
+			goto error_dat;
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+	}
+
+	VOP_UNLOCK(NTOV(cp), 0);
+	VOP_UNLOCK(NTOV(gc), 0);
+	VOP_UNLOCK(NTOV(ifile), 0);
+
+	nandfs_process_segments(fsdev);
+
+	VOP_UNLOCK(NTOV(su), 0);
+
+	delete_seginfo(seginfo);
+
+	/*
+	 * XXX: a hack, will go away soon
+	 */
+	if ((NTOV(dat)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(cp)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(gc)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(ifile)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(su)->v_bufobj.bo_dirty.bv_cnt != 0) &&
+	    (flags & NANDFS_UMOUNT)) {
+		DPRINTF(SYNC, ("%s: RERUN\n", __func__));
+		goto again;
+	}
+
+	MPASS(fsdev->nd_free_base == NULL);
+
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	if (cno_changed) {
+		if ((nandfs_cps_between_sblocks != 0 &&
+		    fsdev->nd_last_cno % nandfs_cps_between_sblocks == 0) ||
+		    flags & NANDFS_UMOUNT)
+			nandfs_write_superblock(fsdev);
+	}
+
+	DPRINTF(SYNC, ("%s: END\n", __func__));
+	return (0);
+error_dat:
+	VOP_UNLOCK(NTOV(dat), 0);
+error_locks:
+	VOP_UNLOCK(NTOV(cp), 0);
+	VOP_UNLOCK(NTOV(gc), 0);
+	VOP_UNLOCK(NTOV(ifile), 0);
+	VOP_UNLOCK(NTOV(su), 0);
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	return (error);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given NANDFS mount point.
+ */
+DB_SHOW_COMMAND(nandfs, db_show_nandfs)
+{
+	struct mount *mp;
+	struct nandfs_device *nffsdev;
+	struct nandfs_segment *seg;
+	struct nandfsmount *nmp;
+	struct buf *bp;
+	struct vnode *vp;
+
+	if (!have_addr) {
+		db_printf("\nUsage: show nandfs <mount_addr>\n");
+		return;
+	}
+
+	mp = (struct mount *)addr;
+	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
+	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
+
+
+	nmp = (struct nandfsmount *)(mp->mnt_data);
+	nffsdev = nmp->nm_nandfsdev;
+	db_printf("dev vnode:%p\n", nffsdev->nd_devvp);
+	db_printf("blocksize:%jx last cno:%jx last pseg:%jx seg num:%jx\n",
+	    (uintmax_t)nffsdev->nd_blocksize, (uintmax_t)nffsdev->nd_last_cno,
+	    (uintmax_t)nffsdev->nd_last_pseg, (uintmax_t)nffsdev->nd_seg_num);
+	db_printf("system nodes: dat:%p cp:%p su:%p ifile:%p gc:%p\n",
+	    nffsdev->nd_dat_node, nffsdev->nd_cp_node, nffsdev->nd_su_node,
+	    nmp->nm_ifile_node, nffsdev->nd_gc_node);
+
+	if (nffsdev->nd_seginfo != NULL) {
+		LIST_FOREACH(seg, &nffsdev->nd_seginfo->seg_list, seg_link) {
+			db_printf("seg: %p\n", seg);
+			TAILQ_FOREACH(bp, &seg->segsum,
+			    b_cluster.cluster_entry)
+				db_printf("segbp %p\n", bp);
+			TAILQ_FOREACH(bp, &seg->data,
+			    b_cluster.cluster_entry) {
+				vp = bp->b_vp;
+				db_printf("bp:%p bp->b_vp:%p ino:%jx\n", bp, vp,
+				    (uintmax_t)(vp ? VTON(vp)->nn_ino : 0));
+			}
+		}
+	}
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_subr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1120 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_subr.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/libkern.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include <machine/_inttypes.h>
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount");
+MALLOC_DEFINE(M_NANDFSTEMP, "nandfs_tmt", "NANDFS tmp");
+
+uma_zone_t nandfs_node_zone;
+
+void nandfs_bdflush(struct bufobj *bo, struct buf *bp);
+int nandfs_bufsync(struct bufobj *bo, int waitfor);
+
+struct buf_ops buf_ops_nandfs = {
+	.bop_name	=	"buf_ops_nandfs",
+	.bop_write	=	bufwrite,
+	.bop_strategy	=	bufstrategy,
+	.bop_sync	=	nandfs_bufsync,
+	.bop_bdflush	=	nandfs_bdflush,
+};
+
+int
+nandfs_bufsync(struct bufobj *bo, int waitfor)
+{
+	struct vnode *vp;
+	int error = 0;
+
+	vp = bo->__bo_vnode;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	error = nandfs_sync_file(vp);
+	if (error)
+		nandfs_warning("%s: cannot flush buffers err:%d\n",
+		    __func__, error);
+
+	return (error);
+}
+
+void
+nandfs_bdflush(bo, bp)
+	struct bufobj *bo;
+	struct buf *bp;
+{
+	struct vnode *vp;
+	int error;
+
+	if (bo->bo_dirty.bv_cnt <= ((dirtybufthresh * 8) / 10))
+		return;
+
+	vp = bp->b_vp;
+	if (NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+		return;
+
+	if (NANDFS_IS_INDIRECT(bp))
+		return;
+
+	error = nandfs_sync_file(vp);
+	if (error)
+		nandfs_warning("%s: cannot flush buffers err:%d\n",
+		    __func__, error);
+}
+
+int
+nandfs_init(struct vfsconf *vfsp)
+{
+
+	nandfs_node_zone = uma_zcreate("nandfs node zone",
+	    sizeof(struct nandfs_node), NULL, NULL, NULL, NULL, 0, 0);
+
+	return (0);
+}
+
+int
+nandfs_uninit(struct vfsconf *vfsp)
+{
+
+	uma_zdestroy(nandfs_node_zone);
+	return (0);
+}
+
+/* Basic calculators */
+uint64_t
+nandfs_get_segnum_of_block(struct nandfs_device *nandfsdev,
+    nandfs_daddr_t blocknr)
+{
+	uint64_t segnum, blks_per_seg;
+
+	MPASS(blocknr >= nandfsdev->nd_fsdata.f_first_data_block);
+
+	blks_per_seg = nandfsdev->nd_fsdata.f_blocks_per_segment;
+
+	segnum = blocknr / blks_per_seg;
+	segnum -= nandfsdev->nd_fsdata.f_first_data_block / blks_per_seg;
+
+	DPRINTF(SYNC, ("%s: returning blocknr %jx -> segnum %jx\n", __func__,
+	    blocknr, segnum));
+
+	return (segnum);
+}
+
+void
+nandfs_get_segment_range(struct nandfs_device *nandfsdev, uint64_t segnum,
+    uint64_t *seg_start, uint64_t *seg_end)
+{
+	uint64_t blks_per_seg;
+
+	blks_per_seg = nandfsdev->nd_fsdata.f_blocks_per_segment;
+	*seg_start = nandfsdev->nd_fsdata.f_first_data_block +
+	    blks_per_seg * segnum;
+	if (seg_end != NULL)
+		*seg_end = *seg_start + blks_per_seg -1;
+}
+
+void nandfs_calc_mdt_consts(struct nandfs_device *nandfsdev,
+    struct nandfs_mdt *mdt, int entry_size)
+{
+	uint32_t blocksize = nandfsdev->nd_blocksize;
+
+	mdt->entries_per_group = blocksize * 8;
+	mdt->entries_per_block = blocksize / entry_size;
+
+	mdt->blocks_per_group =
+	    (mdt->entries_per_group -1) / mdt->entries_per_block + 1 + 1;
+	mdt->groups_per_desc_block =
+	    blocksize / sizeof(struct nandfs_block_group_desc);
+	mdt->blocks_per_desc_block =
+	    mdt->groups_per_desc_block * mdt->blocks_per_group + 1;
+}
+
+int
+nandfs_dev_bread(struct nandfs_device *nandfsdev, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: read from block %jx vp %p\n", __func__,
+	    blocknr * blk2dev, nandfsdev->nd_devvp));
+	error = bread(nandfsdev->nd_devvp, blocknr * blk2dev,
+	    nandfsdev->nd_blocksize, NOCRED, bpp);
+	if (error)
+		nandfs_error("%s: cannot read from device - blk:%jx\n",
+		    __func__, blocknr);
+	return (error);
+}
+
+/* Read on a node */
+int
+nandfs_bread(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	nandfs_daddr_t vblk;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	error = bread(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    cred, bpp);
+
+	KASSERT(error == 0, ("%s: vp:%p lbn:%#jx err:%d\n", __func__,
+	    NTOV(node), blocknr, error));
+
+	if (!nandfs_vblk_get(*bpp) &&
+	    ((*bpp)->b_flags & B_CACHE) && node->nn_ino != NANDFS_DAT_INO) {
+		nandfs_bmap_lookup(node, blocknr, &vblk);
+		nandfs_vblk_set(*bpp, vblk);
+	}
+	return (error);
+}
+
+int
+nandfs_bread_meta(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	nandfs_daddr_t vblk;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	error = bread(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    cred, bpp);
+
+	KASSERT(error == 0, ("%s: vp:%p lbn:%#jx err:%d\n", __func__,
+	    NTOV(node), blocknr, error));
+
+	if (!nandfs_vblk_get(*bpp) &&
+	    ((*bpp)->b_flags & B_CACHE) && node->nn_ino != NANDFS_DAT_INO) {
+		nandfs_bmap_lookup(node, blocknr, &vblk);
+		nandfs_vblk_set(*bpp, vblk);
+	}
+
+	return (error);
+}
+
+int
+nandfs_bdestroy(struct nandfs_node *node, nandfs_daddr_t vblk)
+{
+	int error;
+
+	if (!NANDFS_SYS_NODE(node->nn_ino))
+		NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	error = nandfs_vblock_end(node->nn_nandfsdev, vblk);
+	if (error) {
+		nandfs_error("%s: ending vblk: %jx failed\n",
+		    __func__, (uintmax_t)vblk);
+		return (error);
+	}
+	node->nn_inode.i_blocks--;
+
+	return (0);
+}
+
+int
+nandfs_bcreate(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	int error;
+
+	ASSERT_VOP_LOCKED(NTOV(node), __func__);
+	if (!NANDFS_SYS_NODE(node->nn_ino))
+		NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	*bpp = getblk(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    0, 0, 0);
+
+	KASSERT((*bpp), ("%s: vp:%p lbn:%#jx\n", __func__,
+	    NTOV(node), blocknr));
+
+	if (*bpp) {
+		vfs_bio_clrbuf(*bpp);
+		(*bpp)->b_blkno = ~(0); /* To avoid VOP_BMAP in bdwrite */
+		error = nandfs_bmap_insert_block(node, blocknr, *bpp);
+		if (error) {
+			nandfs_warning("%s: failed bmap insert node:%p"
+			    " blk:%jx\n", __func__, node, blocknr);
+			brelse(*bpp);
+			return (error);
+		}
+		node->nn_inode.i_blocks++;
+
+		return (0);
+	}
+
+	return (-1);
+}
+
+int
+nandfs_bcreate_meta(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk;
+	int error;
+
+	ASSERT_VOP_LOCKED(NTOV(node), __func__);
+	NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	fsdev = node->nn_nandfsdev;
+
+	*bpp = getblk(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    0, 0, 0);
+
+	KASSERT((*bpp), ("%s: vp:%p lbn:%#jx\n", __func__,
+	    NTOV(node), blocknr));
+
+	memset((*bpp)->b_data, 0, fsdev->nd_blocksize);
+
+	vfs_bio_clrbuf(*bpp);
+	(*bpp)->b_blkno = ~(0); /* To avoid VOP_BMAP in bdwrite */
+
+	nandfs_buf_set(*bpp, NANDFS_VBLK_ASSIGNED);
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		error = nandfs_vblock_alloc(fsdev, &vblk);
+		if (error) {
+			nandfs_buf_clear(*bpp, NANDFS_VBLK_ASSIGNED);
+			brelse(*bpp);
+			return (error);
+		}
+	} else
+		vblk = fsdev->nd_fakevblk++;
+
+	nandfs_vblk_set(*bpp, vblk);
+
+	nandfs_bmap_insert_block(node, blocknr, *bpp);
+	return (0);
+}
+
+/* Translate index to a file block number and an entry */
+void
+nandfs_mdt_trans(struct nandfs_mdt *mdt, uint64_t index,
+    nandfs_lbn_t *blocknr, uint32_t *entry_in_block)
+{
+	uint64_t blknr;
+	uint64_t group, group_offset, blocknr_in_group;
+	uint64_t desc_block, desc_offset;
+
+	/* Calculate our offset in the file */
+	group = index / mdt->entries_per_group;
+	group_offset = index % mdt->entries_per_group;
+	desc_block = group / mdt->groups_per_desc_block;
+	desc_offset = group % mdt->groups_per_desc_block;
+	blocknr_in_group = group_offset / mdt->entries_per_block;
+
+	/* To descgroup offset */
+	blknr = 1 + desc_block * mdt->blocks_per_desc_block;
+
+	/* To group offset */
+	blknr += desc_offset * mdt->blocks_per_group;
+
+	/* To actual file block */
+	blknr += 1 + blocknr_in_group;
+
+	*blocknr = blknr;
+	*entry_in_block = group_offset % mdt->entries_per_block;
+}
+
+void
+nandfs_mdt_trans_blk(struct nandfs_mdt *mdt, uint64_t index,
+    uint64_t *desc, uint64_t *bitmap, nandfs_lbn_t *blocknr,
+    uint32_t *entry_in_block)
+{
+	uint64_t blknr;
+	uint64_t group, group_offset, blocknr_in_group;
+	uint64_t desc_block, desc_offset;
+
+	/* Calculate our offset in the file */
+	group = index / mdt->entries_per_group;
+	group_offset = index % mdt->entries_per_group;
+	desc_block = group / mdt->groups_per_desc_block;
+	desc_offset = group % mdt->groups_per_desc_block;
+	blocknr_in_group = group_offset / mdt->entries_per_block;
+
+	/* To descgroup offset */
+	*desc = desc_block * mdt->blocks_per_desc_block;
+	blknr = 1 + desc_block * mdt->blocks_per_desc_block;
+
+	/* To group offset */
+	blknr += desc_offset * mdt->blocks_per_group;
+	*bitmap = blknr;
+
+	/* To actual file block */
+	blknr += 1 + blocknr_in_group;
+
+	*blocknr = blknr;
+	*entry_in_block = group_offset % mdt->entries_per_block;
+
+	DPRINTF(ALLOC,
+	    ("%s: desc_buf: %jx bitmap_buf: %jx entry_buf: %jx entry: %x\n",
+	    __func__, (uintmax_t)*desc, (uintmax_t)*bitmap,
+	    (uintmax_t)*blocknr, *entry_in_block));
+}
+
+int
+nandfs_vtop(struct nandfs_node *node, nandfs_daddr_t vblocknr,
+    nandfs_daddr_t *pblocknr)
+{
+	struct nandfs_node *dat_node;
+	struct nandfs_dat_entry *entry;
+	struct buf *bp;
+	nandfs_lbn_t ldatblknr;
+	uint32_t entry_in_block;
+	int locked, error;
+
+	if (node->nn_ino == NANDFS_DAT_INO || node->nn_ino == NANDFS_GC_INO) {
+		*pblocknr = vblocknr;
+		return (0);
+	}
+
+	/* only translate valid vblocknrs */
+	if (vblocknr == 0)
+		return (0);
+
+	dat_node = node->nn_nandfsdev->nd_dat_node;
+	nandfs_mdt_trans(&node->nn_nandfsdev->nd_dat_mdt, vblocknr, &ldatblknr,
+	    &entry_in_block);
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat_node));
+	if (!locked)
+		VOP_LOCK(NTOV(dat_node), LK_SHARED);
+	error = nandfs_bread(dat_node, ldatblknr, NOCRED, 0, &bp);
+	if (error) {
+		DPRINTF(TRANSLATE, ("vtop: can't read in DAT block %#jx!\n",
+		    (uintmax_t)ldatblknr));
+		brelse(bp);
+		VOP_UNLOCK(NTOV(dat_node), 0);
+		return (error);
+	}
+
+	/* Get our translation */
+	entry = ((struct nandfs_dat_entry *) bp->b_data) + entry_in_block;
+	DPRINTF(TRANSLATE, ("\tentry %p data %p entry_in_block %x\n",
+	    entry, bp->b_data, entry_in_block));
+	DPRINTF(TRANSLATE, ("\tvblk %#jx -> %#jx for cp [%#jx-%#jx]\n",
+	    (uintmax_t)vblocknr, (uintmax_t)entry->de_blocknr,
+	    (uintmax_t)entry->de_start, (uintmax_t)entry->de_end));
+
+	*pblocknr = entry->de_blocknr;
+	brelse(bp);
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat_node), 0);
+
+	MPASS(*pblocknr >= node->nn_nandfsdev->nd_fsdata.f_first_data_block ||
+	    *pblocknr == 0);
+
+	return (0);
+}
+
+int
+nandfs_segsum_valid(struct nandfs_segment_summary *segsum)
+{
+
+	return (segsum->ss_magic == NANDFS_SEGSUM_MAGIC);
+}
+
+int
+nandfs_load_segsum(struct nandfs_device *fsdev, nandfs_daddr_t blocknr,
+    struct nandfs_segment_summary *segsum)
+{
+	struct buf *bp;
+	int error;
+
+	DPRINTF(VOLUMES, ("nandfs: try segsum at block %jx\n",
+	    (uintmax_t)blocknr));
+
+	error = nandfs_dev_bread(fsdev, blocknr, NOCRED, 0, &bp);
+	if (error)
+		return (error);
+
+	memcpy(segsum, bp->b_data, sizeof(struct nandfs_segment_summary));
+	brelse(bp);
+
+	if (!nandfs_segsum_valid(segsum)) {
+		DPRINTF(VOLUMES, ("%s: bad magic pseg:%jx\n", __func__,
+		    blocknr));
+		return (EINVAL);
+	}
+
+	return (error);
+}
+
+static int
+nandfs_load_super_root(struct nandfs_device *nandfsdev,
+    struct nandfs_segment_summary *segsum, uint64_t pseg)
+{
+	struct nandfs_super_root super_root;
+	struct buf *bp;
+	uint64_t blocknr;
+	uint32_t super_root_crc, comp_crc;
+	int off, error;
+
+	/* Check if there is a superroot */
+	if ((segsum->ss_flags & NANDFS_SS_SR) == 0) {
+		DPRINTF(VOLUMES, ("%s: no super root in pseg:%jx\n", __func__,
+		    pseg));
+		return (ENOENT);
+	}
+
+	/* Get our super root, located at the end of the pseg */
+	blocknr = pseg + segsum->ss_nblocks - 1;
+	DPRINTF(VOLUMES, ("%s: try at %#jx\n", __func__, (uintmax_t)blocknr));
+
+	error = nandfs_dev_bread(nandfsdev, blocknr, NOCRED, 0, &bp);
+	if (error)
+		return (error);
+
+	memcpy(&super_root, bp->b_data, sizeof(struct nandfs_super_root));
+	brelse(bp);
+
+	/* Check super root CRC */
+	super_root_crc = super_root.sr_sum;
+	off = sizeof(super_root.sr_sum);
+	comp_crc = crc32((uint8_t *)&super_root + off,
+	    NANDFS_SR_BYTES - off);
+
+	if (super_root_crc != comp_crc) {
+		DPRINTF(VOLUMES, ("%s: invalid crc:%#x [expect:%#x]\n",
+		    __func__, super_root_crc, comp_crc));
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_super_root = super_root;
+	DPRINTF(VOLUMES, ("%s: got valid superroot\n", __func__));
+
+	return (0);
+}
+
+/*
+ * Search for the last super root recorded.
+ */
+int
+nandfs_search_super_root(struct nandfs_device *nandfsdev)
+{
+	struct nandfs_super_block *super;
+	struct nandfs_segment_summary segsum;
+	uint64_t seg_start, seg_end, cno, seq, create, pseg;
+	uint64_t segnum;
+	int error, found;
+
+	error = found = 0;
+
+	/* Search for last super root */
+	pseg = nandfsdev->nd_super.s_last_pseg;
+	segnum = nandfs_get_segnum_of_block(nandfsdev, pseg);
+
+	cno = nandfsdev->nd_super.s_last_cno;
+	create = seq = 0;
+	DPRINTF(VOLUMES, ("%s: start in pseg %#jx\n", __func__,
+	    (uintmax_t)pseg));
+
+	for (;;) {
+		error = nandfs_load_segsum(nandfsdev, pseg, &segsum);
+		if (error)
+			break;
+
+		if (segsum.ss_seq < seq || segsum.ss_create < create)
+			break;
+
+		/* Try to load super root */
+		if (segsum.ss_flags & NANDFS_SS_SR) {
+			error = nandfs_load_super_root(nandfsdev, &segsum, pseg);
+			if (error)
+				break;	/* confused */
+			found = 1;
+
+			super = &nandfsdev->nd_super;
+			nandfsdev->nd_last_segsum = segsum;
+			super->s_last_pseg = pseg;
+			super->s_last_cno = cno++;
+			super->s_last_seq = segsum.ss_seq;
+			super->s_state = NANDFS_VALID_FS;
+			seq = segsum.ss_seq;
+			create = segsum.ss_create;
+		} else {
+			seq = segsum.ss_seq;
+			create = segsum.ss_create;
+		}
+
+		/* Calculate next partial segment location */
+		pseg += segsum.ss_nblocks;
+		DPRINTF(VOLUMES, ("%s: next partial seg is %jx\n", __func__,
+		    (uintmax_t)pseg));
+
+		/* Did we reach the end of the segment? if so, go to the next */
+		nandfs_get_segment_range(nandfsdev, segnum, &seg_start,
+		    &seg_end);
+		if (pseg >= seg_end) {
+			pseg = segsum.ss_next;
+			DPRINTF(VOLUMES,
+			    (" partial seg oor next is %jx[%jx - %jx]\n",
+			    (uintmax_t)pseg, (uintmax_t)seg_start,
+			    (uintmax_t)seg_end));
+		}
+		segnum = nandfs_get_segnum_of_block(nandfsdev, pseg);
+	}
+
+	if (error && !found)
+		return (error);
+
+	return (0);
+}
+
+int
+nandfs_get_node_raw(struct nandfs_device *nandfsdev, struct nandfsmount *nmp,
+    uint64_t ino, struct nandfs_inode *inode, struct nandfs_node **nodep)
+{
+	struct nandfs_node *node;
+	struct vnode *nvp;
+	struct mount *mp;
+	int error;
+
+	*nodep = NULL;
+
+	/* Associate with mountpoint if present */
+	if (nmp) {
+		mp = nmp->nm_vfs_mountp;
+		error = getnewvnode("nandfs", mp, &nandfs_vnodeops, &nvp);
+		if (error) {
+			return (error);
+		}
+	} else {
+		mp = NULL;
+		error = getnewvnode("snandfs", mp, &nandfs_system_vnodeops,
+		    &nvp);
+		if (error) {
+			return (error);
+		}
+	}
+
+	if (mp)
+		NANDFS_WRITELOCK(nandfsdev);
+
+	DPRINTF(IFILE, ("%s: ino: %#jx -> vp: %p\n",
+	    __func__, (uintmax_t)ino, nvp));
+	/* Lock node */
+	lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL);
+
+	if (mp) {
+		error = insmntque(nvp, mp);
+		if (error != 0) {
+			*nodep = NULL;
+			return (error);
+		}
+	}
+
+	node = uma_zalloc(nandfs_node_zone, M_WAITOK | M_ZERO);
+
+	/* Crosslink */
+	node->nn_vnode = nvp;
+	nvp->v_bufobj.bo_ops = &buf_ops_nandfs;
+	node->nn_nmp = nmp;
+	node->nn_nandfsdev = nandfsdev;
+	nvp->v_data = node;
+
+	/* Initialise NANDFS node */
+	node->nn_ino = ino;
+	if (inode != NULL)
+		node->nn_inode = *inode;
+
+	nandfs_vinit(nvp, ino);
+
+	/* Return node */
+	*nodep = node;
+	DPRINTF(IFILE, ("%s: ino:%#jx vp:%p node:%p\n",
+	    __func__, (uintmax_t)ino, nvp, *nodep));
+
+	return (0);
+}
+
+int
+nandfs_get_node(struct nandfsmount *nmp, uint64_t ino,
+    struct nandfs_node **nodep)
+{
+	struct nandfs_device *nandfsdev;
+	struct nandfs_inode inode, *entry;
+	struct vnode *nvp, *vpp;
+	struct thread *td;
+	struct buf *bp;
+	uint64_t ivblocknr;
+	uint32_t entry_in_block;
+	int error;
+
+	/* Look up node in hash table */
+	td = curthread;
+	*nodep = NULL;
+
+	if ((ino < NANDFS_ATIME_INO) && (ino != NANDFS_ROOT_INO)) {
+		printf("nandfs_get_node: system ino %"PRIu64" not in mount "
+		    "point!\n", ino);
+		return (ENOENT);
+	}
+
+	error = vfs_hash_get(nmp->nm_vfs_mountp, ino, LK_EXCLUSIVE, td, &nvp,
+	    NULL, NULL);
+	if (error)
+		return (error);
+
+	if (nvp != NULL) {
+		*nodep = (struct nandfs_node *)nvp->v_data;
+		return (0);
+	}
+
+	/* Look up inode structure in mountpoints ifile */
+	nandfsdev = nmp->nm_nandfsdev;
+	nandfs_mdt_trans(&nandfsdev->nd_ifile_mdt, ino, &ivblocknr,
+	    &entry_in_block);
+
+	VOP_LOCK(NTOV(nmp->nm_ifile_node), LK_SHARED);
+	error = nandfs_bread(nmp->nm_ifile_node, ivblocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(nmp->nm_ifile_node), 0);
+		return (ENOENT);
+	}
+
+	/* Get inode entry */
+	entry = (struct nandfs_inode *) bp->b_data + entry_in_block;
+	memcpy(&inode, entry, sizeof(struct nandfs_inode));
+	brelse(bp);
+	VOP_UNLOCK(NTOV(nmp->nm_ifile_node), 0);
+
+	/* Get node */
+	error = nandfs_get_node_raw(nmp->nm_nandfsdev, nmp, ino, &inode, nodep);
+	if (error) {
+		*nodep = NULL;
+		return (error);
+	}
+
+	nvp = (*nodep)->nn_vnode;
+	error = vfs_hash_insert(nvp, ino, 0, td, &vpp, NULL, NULL);
+	if (error) {
+		*nodep = NULL;
+		return (error);
+	}
+
+	return (error);
+}
+
+void
+nandfs_dispose_node(struct nandfs_node **nodep)
+{
+	struct nandfs_node *node;
+	struct vnode *vp;
+
+	/* Protect against rogue values */
+	node = *nodep;
+	if (!node) {
+		return;
+	}
+	DPRINTF(NODE, ("nandfs_dispose_node: %p\n", *nodep));
+
+	vp = NTOV(node);
+	vp->v_data = NULL;
+
+	/* Free our associated memory */
+	uma_zfree(nandfs_node_zone, node);
+
+	*nodep = NULL;
+}
+
+int
+nandfs_lookup_name_in_dir(struct vnode *dvp, const char *name, int namelen,
+    uint64_t *ino, int *found, uint64_t *off)
+{
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_dir_entry	*ndirent;
+	struct buf *bp;
+	uint64_t file_size, diroffset, blkoff;
+	uint64_t blocknr;
+	uint32_t blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	uint8_t *pos, name_len;
+	int error;
+
+	*found = 0;
+
+	DPRINTF(VNCALL, ("%s: %s file\n", __func__, name));
+	if (dvp->v_type != VDIR) {
+		return (ENOTDIR);
+	}
+
+	/* Get directory filesize */
+	file_size = dir_node->nn_inode.i_size;
+
+	/* Walk the directory */
+	diroffset = 0;
+	blocknr = 0;
+	blkoff = 0;
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (EIO);
+	}
+
+	while (diroffset < file_size) {
+		if (blkoff >= blocksize) {
+			blkoff = 0; blocknr++;
+			brelse(bp);
+			error = nandfs_bread(dir_node, blocknr, NOCRED, 0,
+			    &bp);
+			if (error) {
+				brelse(bp);
+				return (EIO);
+			}
+		}
+
+		/* Read in one dirent */
+		pos = (uint8_t *) bp->b_data + blkoff;
+		ndirent = (struct nandfs_dir_entry *) pos;
+		name_len = ndirent->name_len;
+
+		if ((name_len == namelen) &&
+		    (strncmp(name, ndirent->name, name_len) == 0) &&
+		    (ndirent->inode != 0)) {
+			*ino = ndirent->inode;
+			*off = diroffset;
+			DPRINTF(LOOKUP, ("found `%.*s` with ino %"PRIx64"\n",
+			    name_len, ndirent->name, *ino));
+			*found = 1;
+			break;
+		}
+
+		/* Advance */
+		diroffset += ndirent->rec_len;
+		blkoff += ndirent->rec_len;
+	}
+	brelse(bp);
+
+	return (error);
+}
+
+int
+nandfs_get_fsinfo(struct nandfsmount *nmp, struct nandfs_fsinfo *fsinfo)
+{
+	struct nandfs_device *fsdev;
+
+	fsdev = nmp->nm_nandfsdev;
+
+	memcpy(&fsinfo->fs_fsdata, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata));
+	memcpy(&fsinfo->fs_super, &fsdev->nd_super, sizeof(fsdev->nd_super));
+	snprintf(fsinfo->fs_dev, sizeof(fsinfo->fs_dev),
+	    "%s", nmp->nm_vfs_mountp->mnt_stat.f_mntfromname);
+
+	return (0);
+}
+
+void
+nandfs_inode_init(struct nandfs_inode *inode, uint16_t mode)
+{
+	struct timespec ts;
+
+	vfs_timestamp(&ts);
+
+	inode->i_blocks = 0;
+	inode->i_size = 0;
+	inode->i_ctime = ts.tv_sec;
+	inode->i_ctime_nsec = ts.tv_nsec;
+	inode->i_mtime = ts.tv_sec;
+	inode->i_mtime_nsec = ts.tv_nsec;
+	inode->i_mode = mode;
+	inode->i_links_count = 1;
+	if (S_ISDIR(mode))
+		inode->i_links_count = 2;
+	inode->i_flags = 0;
+
+	inode->i_special = 0;
+	memset(inode->i_db, 0, sizeof(inode->i_db));
+	memset(inode->i_ib, 0, sizeof(inode->i_ib));
+}
+
+void
+nandfs_inode_destroy(struct nandfs_inode *inode)
+{
+
+	MPASS(inode->i_blocks == 0);
+	bzero(inode, sizeof(*inode));
+}
+
+int
+nandfs_fs_full(struct nandfs_device *nffsdev)
+{
+	uint64_t space, bps;
+
+	bps = nffsdev->nd_fsdata.f_blocks_per_segment;
+	space = (nffsdev->nd_clean_segs - 1) * bps;
+
+	DPRINTF(BUF, ("%s: bufs:%jx space:%jx\n", __func__,
+	    (uintmax_t)nffsdev->nd_dirty_bufs, (uintmax_t)space));
+
+	if (nffsdev->nd_dirty_bufs + (10 * bps) >= space)
+		return (1);
+
+	return (0);
+}
+
+static int
+_nandfs_dirty_buf(struct buf *bp, int dirty_meta, int force)
+{
+	struct nandfs_device *nffsdev;
+	struct nandfs_node *node;
+	uint64_t ino, bps;
+
+	if (NANDFS_ISGATHERED(bp)) {
+		bqrelse(bp);
+		return (0);
+	}
+	if ((bp->b_flags & (B_MANAGED | B_DELWRI)) == (B_MANAGED | B_DELWRI)) {
+		bqrelse(bp);
+		return (0);
+	}
+
+	node = VTON(bp->b_vp);
+	nffsdev = node->nn_nandfsdev;
+	DPRINTF(BUF, ("%s: buf:%p\n", __func__, bp));
+	ino = node->nn_ino;
+
+	if (nandfs_fs_full(nffsdev) && !NANDFS_SYS_NODE(ino) && !force) {
+		brelse(bp);
+		return (ENOSPC);
+	}
+
+	bp->b_flags |= B_MANAGED;
+	bdwrite(bp);
+
+	nandfs_dirty_bufs_increment(nffsdev);
+
+	KASSERT((bp->b_vp), ("vp missing for bp"));
+	KASSERT((nandfs_vblk_get(bp) || ino == NANDFS_DAT_INO),
+	    ("bp vblk is 0"));
+
+	/*
+	 * To maintain consistency of FS we need to force making
+	 * meta buffers dirty, even if free space is low.
+	 */
+	if (dirty_meta && ino != NANDFS_GC_INO)
+		nandfs_bmap_dirty_blocks(VTON(bp->b_vp), bp, 1);
+
+	bps = nffsdev->nd_fsdata.f_blocks_per_segment;
+
+	if (nffsdev->nd_dirty_bufs >= (bps * nandfs_max_dirty_segs)) {
+		mtx_lock(&nffsdev->nd_sync_mtx);
+		if (nffsdev->nd_syncing == 0) {
+			DPRINTF(SYNC, ("%s: wakeup gc\n", __func__));
+			nffsdev->nd_syncing = 1;
+			wakeup(&nffsdev->nd_syncing);
+		}
+		mtx_unlock(&nffsdev->nd_sync_mtx);
+	}
+
+	return (0);
+}
+
+int
+nandfs_dirty_buf(struct buf *bp, int force)
+{
+
+	return (_nandfs_dirty_buf(bp, 1, force));
+}
+
+int
+nandfs_dirty_buf_meta(struct buf *bp, int force)
+{
+
+	return (_nandfs_dirty_buf(bp, 0, force));
+}
+
+void
+nandfs_undirty_buf_fsdev(struct nandfs_device *nffsdev, struct buf *bp)
+{
+
+	BUF_ASSERT_HELD(bp);
+
+	if (bp->b_flags & B_DELWRI) {
+		bp->b_flags &= ~(B_DELWRI|B_MANAGED);
+		nandfs_dirty_bufs_decrement(nffsdev);
+	}
+	/*
+	 * Since it is now being written, we can clear its deferred write flag.
+	 */
+	bp->b_flags &= ~B_DEFERRED;
+
+	brelse(bp);
+}
+
+void
+nandfs_undirty_buf(struct buf *bp)
+{
+	struct nandfs_node *node;
+
+	node = VTON(bp->b_vp);
+
+	nandfs_undirty_buf_fsdev(node->nn_nandfsdev, bp);
+}
+
+void
+nandfs_vblk_set(struct buf *bp, nandfs_daddr_t blocknr)
+{
+
+	nandfs_daddr_t *vblk = (nandfs_daddr_t *)(&bp->b_fsprivate1);
+	*vblk = blocknr;
+}
+
+nandfs_daddr_t
+nandfs_vblk_get(struct buf *bp)
+{
+
+	nandfs_daddr_t *vblk = (nandfs_daddr_t *)(&bp->b_fsprivate1);
+	return (*vblk);
+}
+
+void
+nandfs_buf_set(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	flags |= (uintptr_t)bits;
+	bp->b_fsprivate3 = (void *)flags;
+}
+
+void
+nandfs_buf_clear(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	flags &= ~(uintptr_t)bits;
+	bp->b_fsprivate3 = (void *)flags;
+}
+
+int
+nandfs_buf_check(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	if (flags & bits)
+		return (1);
+	return (0);
+}
+
+int
+nandfs_erase(struct nandfs_device *fsdev, off_t offset, size_t size)
+{
+	struct buf *bp;
+	int read_size, error, i;
+
+	DPRINTF(BLOCK, ("%s: performing erase at offset %jx size %zx\n",
+	    __func__, offset, size));
+
+	MPASS(size % fsdev->nd_erasesize == 0);
+
+	if (fsdev->nd_is_nand) {
+		error = g_delete_data(fsdev->nd_gconsumer, offset, size);
+		return (error);
+	}
+
+	if (size > MAXBSIZE)
+		read_size = MAXBSIZE;
+	else
+		read_size = size;
+
+	error = 0;
+	for (i = 0; i < size / MAXBSIZE; i++) {
+		error = bread(fsdev->nd_devvp, btodb(offset + i * read_size),
+		    read_size, NOCRED, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		memset(bp->b_data, 0xff, read_size);
+		error = bwrite(bp);
+		if (error) {
+			nandfs_error("%s: err:%d from bwrite\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+
+	return (error);
+}
+
+int
+nandfs_vop_islocked(struct vnode *vp)
+{
+	int islocked;
+
+	islocked = VOP_ISLOCKED(vp);
+	return (islocked == LK_EXCLUSIVE || islocked == LK_SHARED);
+}
+
+nandfs_daddr_t
+nandfs_block_to_dblock(struct nandfs_device *fsdev, nandfs_lbn_t block)
+{
+
+	return (btodb(block * fsdev->nd_blocksize));
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_subr.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_subr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_subr.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_SUBR_H_
+#define _FS_NANDFS_NANDFS_SUBR_H_
+
+struct nandfs_mdt;
+
+/*
+ * State carried through the metadata-file entry allocators
+ * (nandfs_find_free_entry() / nandfs_alloc_entry() / nandfs_free_entry()).
+ */
+struct nandfs_alloc_request
+{
+	uint64_t	entrynum;	/* entry number being (de)allocated */
+	struct buf	*bp_desc;	/* descriptor block buffer */
+	struct buf	*bp_bitmap;	/* bitmap block buffer */
+	struct buf	*bp_entry;	/* entry block buffer */
+};
+
+/* Segment creation */
+void nandfs_wakeup_wait_sync(struct nandfs_device *, int);
+int nandfs_segment_constructor(struct nandfsmount *, int);
+int nandfs_sync_file(struct vnode *);
+
+/* Basic calculators */
+uint64_t nandfs_get_segnum_of_block(struct nandfs_device *, nandfs_daddr_t);
+void nandfs_get_segment_range(struct nandfs_device *, uint64_t, uint64_t *,
+    uint64_t *);
+void nandfs_calc_mdt_consts(struct nandfs_device *, struct nandfs_mdt *, int);
+
+/* Log reading / volume helpers */
+int nandfs_search_super_root(struct nandfs_device *);
+
+/* Reading */
+int nandfs_dev_bread(struct nandfs_device *, nandfs_daddr_t, struct ucred *,
+    int, struct buf **);
+int nandfs_bread(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bread_meta(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bdestroy(struct nandfs_node *, nandfs_daddr_t);
+int nandfs_bcreate(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bcreate_meta(struct nandfs_node *, nandfs_lbn_t, struct ucred *,
+    int, struct buf **);
+int nandfs_bread_create(struct nandfs_node *, nandfs_lbn_t, struct ucred *,
+    int, struct buf **);
+
+/* vtop operations */
+int nandfs_vtop(struct nandfs_node *, nandfs_daddr_t, nandfs_daddr_t *);
+
+/* Node action implementators */
+int nandfs_vinit(struct vnode *, uint64_t);
+int nandfs_get_node(struct nandfsmount *, uint64_t, struct nandfs_node **);
+int nandfs_get_node_raw(struct nandfs_device *, struct nandfsmount *, uint64_t,
+    struct nandfs_inode *, struct nandfs_node **);
+void nandfs_dispose_node(struct nandfs_node **);
+
+void nandfs_itimes(struct vnode *);
+int nandfs_lookup_name_in_dir(struct vnode *, const char *, int, uint64_t *,
+    int *, uint64_t *);
+int nandfs_create_node(struct vnode *, struct vnode **, struct vattr *,
+    struct componentname *);
+void nandfs_delete_node(struct nandfs_node *);
+
+int nandfs_chsize(struct vnode *, u_quad_t, struct ucred *);
+int nandfs_dir_detach(struct nandfsmount *, struct nandfs_node *,
+    struct nandfs_node *, struct componentname *);
+int nandfs_dir_attach(struct nandfsmount *, struct nandfs_node *,
+    struct nandfs_node *, struct vattr *, struct componentname *);
+
+int nandfs_dirty_buf(struct buf *, int);
+int nandfs_dirty_buf_meta(struct buf *, int);
+int nandfs_fs_full(struct nandfs_device *);
+void nandfs_undirty_buf_fsdev(struct nandfs_device *, struct buf *);
+void nandfs_undirty_buf(struct buf *);
+
+void nandfs_clear_buf(struct buf *);
+void nandfs_buf_set(struct buf *, uint32_t);
+void nandfs_buf_clear(struct buf *, uint32_t);
+int nandfs_buf_check(struct buf *, uint32_t);
+
+int  nandfs_find_free_entry(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *);
+int  nandfs_find_entry(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *);
+int  nandfs_alloc_entry(struct nandfs_mdt *, struct nandfs_alloc_request *);
+void nandfs_abort_entry(struct nandfs_alloc_request *);
+int  nandfs_free_entry(struct nandfs_mdt *, struct nandfs_alloc_request *);
+int nandfs_get_entry_block(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *, uint32_t *, int);
+
+/* inode management */
+int  nandfs_node_create(struct nandfsmount *, struct nandfs_node **, uint16_t);
+int nandfs_node_destroy(struct nandfs_node *);
+int nandfs_node_update(struct nandfs_node *);
+int nandfs_get_node_entry(struct nandfsmount *, struct nandfs_inode **,
+    uint64_t, struct buf **);
+void nandfs_mdt_trans_blk(struct nandfs_mdt *, uint64_t, uint64_t *,
+    uint64_t *, nandfs_lbn_t *, uint32_t *);
+
+/* vblock management */
+void nandfs_mdt_trans(struct nandfs_mdt *, uint64_t, nandfs_lbn_t *, uint32_t *);
+int nandfs_vblock_alloc(struct nandfs_device *, nandfs_daddr_t *);
+int nandfs_vblock_end(struct nandfs_device *, nandfs_daddr_t);
+int nandfs_vblock_assign(struct nandfs_device *, nandfs_daddr_t,
+    nandfs_lbn_t);
+int nandfs_vblock_free(struct nandfs_device *, nandfs_daddr_t);
+
+/* Checkpoint management */
+int nandfs_get_checkpoint(struct nandfs_device *, struct nandfs_node *,
+    uint64_t);
+int nandfs_set_checkpoint(struct nandfs_device *, struct nandfs_node *,
+    uint64_t, struct nandfs_inode *, uint64_t);
+
+/* Segment management */
+int nandfs_alloc_segment(struct nandfs_device *, uint64_t *);
+int nandfs_update_segment(struct nandfs_device *, uint64_t, uint32_t);
+int nandfs_free_segment(struct nandfs_device *, uint64_t);
+int nandfs_clear_segment(struct nandfs_device *, uint64_t);
+int nandfs_touch_segment(struct nandfs_device *, uint64_t);
+int nandfs_markgc_segment(struct nandfs_device *, uint64_t);
+
+int nandfs_bmap_insert_block(struct nandfs_node *, nandfs_lbn_t, struct buf *);
+int nandfs_bmap_update_block(struct nandfs_node *, struct buf *, nandfs_lbn_t);
+int nandfs_bmap_update_dat(struct nandfs_node *, nandfs_daddr_t, struct buf *);
+int nandfs_bmap_dirty_blocks(struct nandfs_node *, struct buf *, int);
+int nandfs_bmap_truncate_mapping(struct nandfs_node *, nandfs_lbn_t,
+    nandfs_lbn_t);
+int nandfs_bmap_lookup(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t *);
+
+/* dirent */
+int nandfs_add_dirent(struct vnode *, uint64_t, char *, long, uint8_t);
+int nandfs_remove_dirent(struct vnode *, struct nandfs_node *,
+    struct componentname *);
+int nandfs_update_dirent(struct vnode *, struct nandfs_node *,
+    struct nandfs_node *);
+int nandfs_init_dir(struct vnode *, uint64_t, uint64_t);
+int nandfs_update_parent_dir(struct vnode *, uint64_t);
+
+void nandfs_vblk_set(struct buf *, nandfs_daddr_t);
+nandfs_daddr_t nandfs_vblk_get(struct buf *);
+
+void nandfs_inode_init(struct nandfs_inode *, uint16_t);
+void nandfs_inode_destroy(struct nandfs_inode *);
+
+/* ioctl */
+int nandfs_get_seg_stat(struct nandfs_device *, struct nandfs_seg_stat *);
+int nandfs_chng_cpmode(struct nandfs_node *, struct nandfs_cpmode *);
+int nandfs_get_cpinfo_ioctl(struct nandfs_node *, struct nandfs_argv *);
+int nandfs_delete_cp(struct nandfs_node *, uint64_t start, uint64_t);
+int nandfs_make_snap(struct nandfs_device *, uint64_t *);
+int nandfs_delete_snap(struct nandfs_device *, uint64_t);
+int nandfs_get_cpstat(struct nandfs_node *, struct nandfs_cpstat *);
+int nandfs_get_segment_info_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_dat_vinfo_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_dat_bdescs_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_fsinfo(struct nandfsmount *, struct nandfs_fsinfo *);
+
+int nandfs_get_cpinfo(struct nandfs_node *, uint64_t, uint16_t,
+    struct nandfs_cpinfo *, uint32_t, uint32_t *);
+
+nandfs_lbn_t nandfs_get_maxfilesize(struct nandfs_device *);
+
+int nandfs_write_superblock(struct nandfs_device *);
+
+extern int nandfs_sync_interval;
+extern int nandfs_max_dirty_segs;
+extern int nandfs_cps_between_sblocks;
+
+struct buf *nandfs_geteblk(int, int);
+
+void nandfs_dirty_bufs_increment(struct nandfs_device *);
+void nandfs_dirty_bufs_decrement(struct nandfs_device *);
+
+int nandfs_start_cleaner(struct nandfs_device *);
+int nandfs_stop_cleaner(struct nandfs_device *);
+
+int nandfs_segsum_valid(struct nandfs_segment_summary *);
+int nandfs_load_segsum(struct nandfs_device *, nandfs_daddr_t,
+    struct nandfs_segment_summary *);
+int nandfs_get_segment_info(struct nandfs_device *, struct nandfs_suinfo *,
+    uint32_t, uint64_t);
+int nandfs_get_segment_info_filter(struct nandfs_device *,
+    struct nandfs_suinfo *, uint32_t, uint64_t, uint64_t *, uint32_t, uint32_t);
+int nandfs_get_dat_vinfo(struct nandfs_device *, struct nandfs_vinfo *,
+    uint32_t);
+int nandfs_get_dat_bdescs(struct nandfs_device *, struct nandfs_bdesc *,
+    uint32_t);
+
+#define	NANDFS_VBLK_ASSIGNED	1
+
+#define	NANDFS_IS_INDIRECT(bp)	((bp)->b_lblkno < 0)
+
+int nandfs_erase(struct nandfs_device *, off_t, size_t);
+
+#define	NANDFS_VOP_ISLOCKED(vp)	nandfs_vop_islocked((vp))
+int nandfs_vop_islocked(struct vnode *vp);
+
+nandfs_daddr_t nandfs_block_to_dblock(struct nandfs_device *, nandfs_lbn_t);
+
+#define DEBUG_MODE
+#if defined(DEBUG_MODE)
+#define	nandfs_error		panic
+#define	nandfs_warning		printf
+#elif defined(TEST_MODE)
+#define	nandfs_error	printf
+#define	nandfs_warning	printf
+#else
+#define	nandfs_error(...)
+#define	nandfs_warning(...)
+#endif
+
+#endif	/* !_FS_NANDFS_NANDFS_SUBR_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_sufile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_sufile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,569 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_sufile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+#define	SU_USAGE_OFF(bp, offset) \
+	((struct nandfs_segment_usage *)((bp)->b_data + offset))
+
+/*
+ * Translate segment number 'seg' into the SU-file logical block number
+ * ('blk') and the byte offset of its usage entry within that block
+ * ('offset').  Usage entries start after the sufile header, rounded up
+ * to the entry size, so block 0 holds both the header and the first
+ * entries.  Always returns 0.
+ */
+static int
+nandfs_seg_usage_blk_offset(struct nandfs_device *fsdev, uint64_t seg,
+    uint64_t *blk, uint64_t *offset)
+{
+	uint64_t off;
+	uint16_t seg_size;
+
+	seg_size = fsdev->nd_fsdata.f_segment_usage_size;
+
+	off = roundup(sizeof(struct nandfs_sufile_header), seg_size);
+	off += (seg * seg_size);
+
+	*blk = off / fsdev->nd_blocksize;
+	*offset = off % fsdev->nd_blocksize;
+	return (0);
+}
+
+/*
+ * Allocate a new segment: scan the SU file starting right after the
+ * last allocated segment for an entry with no flags set, mark it
+ * allocated and update the header counters and the superblock's free
+ * block count.  The caller must hold the SU-file vnode lock.  Returns
+ * 0 on success, 1 when no free segment exists, or an errno.
+ */
+int
+nandfs_alloc_segment(struct nandfs_device *fsdev, uint64_t *seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_sufile_header *su_header;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp_header, *bp;
+	uint64_t blk, vblk, offset, i, rest, nsegments;
+	uint16_t seg_size;
+	int error, found;
+
+	seg_size = fsdev->nd_fsdata.f_segment_usage_size;
+	nsegments = fsdev->nd_fsdata.f_nsegments;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	/* Read header buffer */
+	error = nandfs_bread(su_node, 0, NOCRED, 0, &bp_header);
+	if (error) {
+		brelse(bp_header);
+		return (error);
+	}
+
+	su_header = (struct nandfs_sufile_header *)bp_header->b_data;
+
+	/* Start the search right after the last allocated segment */
+	i = su_header->sh_last_alloc + 1;
+
+	found = 0;
+	bp = NULL;
+	while (!found) {
+		nandfs_seg_usage_blk_offset(fsdev, i, &blk, &offset);
+		if (blk != 0) {
+			error = nandfs_bmap_lookup(su_node, blk, &vblk);
+			if (error) {
+				nandfs_error("%s: cannot find vblk for blk "
+				    "blk:%jx\n", __func__, blk);
+				/* Release the header buf; it leaked here */
+				brelse(bp_header);
+				return (error);
+			}
+			if (vblk)
+				error = nandfs_bread(su_node, blk, NOCRED, 0,
+				    &bp);
+			else
+				error = nandfs_bcreate(su_node, blk, NOCRED, 0,
+				    &bp);
+			if (error) {
+				nandfs_error("%s: cannot create/read "
+				    "vblk:%jx\n", __func__, vblk);
+				if (bp)
+					brelse(bp);
+				/* Release the header buf; it leaked here */
+				brelse(bp_header);
+				return (error);
+			}
+
+			su_usage = SU_USAGE_OFF(bp, offset);
+		} else {
+			/* Entry lives in the same block as the header */
+			su_usage = SU_USAGE_OFF(bp_header, offset);
+			bp = bp_header;
+		}
+
+		rest = (fsdev->nd_blocksize - offset) / seg_size;
+		/* Go through all su usage entries in this block */
+		while (rest) {
+			/* Past the last segment: wrap to the beginning */
+			if (i == nsegments)
+				break;
+
+			if (!su_usage->su_flags) {
+				su_usage->su_flags = 1;
+				found = 1;
+				break;
+			}
+			su_usage++;
+			i++;
+
+			/* If all segments were checked, give up */
+			if (i == su_header->sh_last_alloc) {
+				DPRINTF(SEG, ("%s: cannot allocate segment \n",
+				    __func__));
+				brelse(bp_header);
+				if (blk != 0)
+					brelse(bp);
+				return (1);
+			}
+			rest--;
+		}
+		if (!found) {
+			/* Otherwise read another block */
+			if (blk != 0)
+				brelse(bp);
+			if (i == nsegments) {
+				blk = 0;
+				i = 0;
+			} else
+				blk++;
+			offset = 0;
+		}
+	}
+
+	if (found) {
+		*seg = i;
+		su_header->sh_last_alloc = i;
+		su_header->sh_ncleansegs--;
+		su_header->sh_ndirtysegs++;
+
+		fsdev->nd_super.s_free_blocks_count = su_header->sh_ncleansegs *
+		    fsdev->nd_fsdata.f_blocks_per_segment;
+		fsdev->nd_clean_segs--;
+
+		/*
+		 * It is mostly called from syncer() so we want to force
+		 * making buf dirty.
+		 */
+		error = nandfs_dirty_buf(bp_header, 1);
+		if (error) {
+			/*
+			 * NOTE(review): bp_header is not released here —
+			 * assumes nandfs_dirty_buf() consumes the buffer
+			 * on failure; confirm against its implementation.
+			 */
+			if (bp && bp != bp_header)
+				brelse(bp);
+			return (error);
+		}
+		if (bp && bp != bp_header)
+			nandfs_dirty_buf(bp, 1);
+
+		DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)i));
+
+		return (0);
+	}
+
+	DPRINTF(SEG, ("%s: failed\n", __func__));
+
+	return (1);
+}
+
+/*
+ * Mark the SU-file block holding segment 'seg''s usage entry dirty
+ * without changing it, so the syncer gathers and rewrites it soon.
+ * The caller must hold the SU-file vnode lock.  Returns 0 or an errno
+ * from the block read.
+ */
+int
+nandfs_touch_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		nandfs_error("%s: cannot preallocate new segment\n", __func__);
+		return (error);
+	} else
+		nandfs_dirty_buf(bp, 1);
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+	return (error);
+}
+
+/*
+ * Update the block count of segment 'seg': add 'nblks' to its in-use
+ * block counter, stamp the modification time, flag it dirty in the SU
+ * file and mark the buffer for write-out.  The caller must hold the
+ * SU-file vnode lock.  Returns 0 or an errno from the block read.
+ */
+int
+nandfs_update_segment(struct nandfs_device *fsdev, uint64_t seg, uint32_t nblks)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		nandfs_error("%s: read block:%jx to update\n",
+		    __func__, blk);
+		brelse(bp);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_flags = NANDFS_SEGMENT_USAGE_DIRTY;
+	su_usage->su_nblocks += nblks;
+
+	DPRINTF(SEG, ("%s: seg:%#jx inc:%#x cur:%#x\n",  __func__,
+	    (uintmax_t)seg, nblks, su_usage->su_nblocks));
+
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Make segment 'seg' free again: clear its usage entry, bump the
+ * clean/dirty counters in the sufile header and refresh the
+ * superblock's free block count.  The caller must hold the SU-file
+ * vnode lock.  Returns 0 or an errno.
+ */
+int
+nandfs_free_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_sufile_header *su_header;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp_header, *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	/* Read su header */
+	error = nandfs_bread(su_node, 0, NOCRED, 0, &bp_header);
+	if (error) {
+		brelse(bp_header);
+		return (error);
+	}
+
+	su_header = (struct nandfs_sufile_header *)bp_header->b_data;
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	/* Read su usage block if other than su header block */
+	if (blk != 0) {
+		error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			brelse(bp_header);
+			return (error);
+		}
+	} else
+		bp = bp_header;	/* entry shares block 0 with the header */
+
+	/* Reset su usage data */
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_nblocks = 0;
+	su_usage->su_flags = 0;
+
+	/* Update clean/dirty counter in header */
+	su_header->sh_ncleansegs++;
+	su_header->sh_ndirtysegs--;
+
+	/*
+	 * Make the buffers dirty.  This runs on behalf of the cleaner,
+	 * so force dirtying even when little space is left on the
+	 * device.
+	 */
+	nandfs_dirty_buf(bp_header, 1);
+	if (bp != bp_header)
+		nandfs_dirty_buf(bp, 1);
+
+	/* Update free block count */
+	fsdev->nd_super.s_free_blocks_count = su_header->sh_ncleansegs *
+	    fsdev->nd_fsdata.f_blocks_per_segment;
+	fsdev->nd_clean_segs++;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	return (0);
+}
+
+/*
+ * Mark segment 'seg' as bad (erase failed): set the ERROR flag in its
+ * usage entry so it is never allocated again, and stamp the
+ * modification time.  The caller must hold the SU-file vnode lock.
+ * Returns 0 or an errno from the block read.
+ */
+static int
+nandfs_bad_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_flags = NANDFS_SEGMENT_USAGE_ERROR;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Flag segment 'seg' as being garbage-collected.  Unlike the other
+ * SU-file helpers this one takes the vnode lock itself.
+ *
+ * NOTE(review): the GC flag is set in the buffer but the buffer is
+ * released with brelse() rather than dirtied, so the flag appears to
+ * live only in-core until something else writes the block — confirm
+ * this is intentional.
+ */
+int
+nandfs_markgc_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+
+	VOP_LOCK(NTOV(su_node), LK_EXCLUSIVE);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(su_node), 0);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	MPASS((su_usage->su_flags & NANDFS_SEGMENT_USAGE_GC) == 0);
+	su_usage->su_flags |= NANDFS_SEGMENT_USAGE_GC;
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	return (0);
+}
+
+/*
+ * Erase segment 'seg' on the medium and return it to the free pool.
+ * If the erase fails the segment is marked bad instead; if marking it
+ * bad also fails, that error is returned and the segment is not freed.
+ */
+int
+nandfs_clear_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	uint64_t start, seglen;
+	uint32_t blks_per_seg, blksize;
+	int error;
+
+	blks_per_seg = fsdev->nd_fsdata.f_blocks_per_segment;
+	blksize = fsdev->nd_blocksize;
+	seglen = (uint64_t)blksize * blks_per_seg;
+	nandfs_get_segment_range(fsdev, seg, &start, NULL);
+	start *= blksize;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	/* Erase it; when that fails, mark the segment bad */
+	if (nandfs_erase(fsdev, start, seglen) != 0) {
+		error = nandfs_bad_segment(fsdev, seg);
+		if (error)
+			return (error);
+	}
+
+	/* Mark it free */
+	return (nandfs_free_segment(fsdev, seg));
+}
+
+/*
+ * Fill 'nss' with segment-usage statistics read from the sufile header
+ * and device state.  Takes the device write lock and a shared vnode
+ * lock on the SU file.  Returns 0 on success or -1 (not an errno) when
+ * the header block cannot be read.
+ */
+int
+nandfs_get_seg_stat(struct nandfs_device *nandfsdev,
+    struct nandfs_seg_stat *nss)
+{
+	struct nandfs_sufile_header *suhdr;
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	int err;
+
+	su_node = nandfsdev->nd_su_node;
+
+	NANDFS_WRITELOCK(nandfsdev);
+	VOP_LOCK(NTOV(su_node), LK_SHARED);
+	err = nandfs_bread(nandfsdev->nd_su_node, 0, NOCRED, 0, &bp);
+	if (err) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(su_node), 0);
+		NANDFS_WRITEUNLOCK(nandfsdev);
+		return (-1);
+	}
+
+	suhdr = (struct nandfs_sufile_header *)bp->b_data;
+	nss->nss_nsegs = nandfsdev->nd_fsdata.f_nsegments;
+	nss->nss_ncleansegs = suhdr->sh_ncleansegs;
+	nss->nss_ndirtysegs = suhdr->sh_ndirtysegs;
+	nss->nss_ctime = 0;	/* not tracked */
+	nss->nss_nongc_ctime = nandfsdev->nd_ts.tv_sec;
+	nss->nss_prot_seq = nandfsdev->nd_seg_sequence;
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+
+	NANDFS_WRITEUNLOCK(nandfsdev);
+
+	return (0);
+}
+
+/*
+ * Ioctl backend: gather segment usage info for up to nv_nmembs
+ * segments starting at nv_index and copy the array out to the
+ * user-supplied buffer at nv_base.  The member count is bounded by
+ * NANDFS_SEGMENTS_MAX, which also bounds the kernel allocation.
+ * Returns 0 or an errno.
+ */
+int
+nandfs_get_segment_info_ioctl(struct nandfs_device *fsdev,
+    struct nandfs_argv *nargv)
+{
+	struct nandfs_suinfo *nsi;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_SEGMENTS_MAX)
+		return (EINVAL);
+
+	nsi = malloc(sizeof(struct nandfs_suinfo) * nargv->nv_nmembs,
+	    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+
+	error = nandfs_get_segment_info(fsdev, nsi, nargv->nv_nmembs,
+	    nargv->nv_index);
+
+	if (error == 0)
+		error = copyout(nsi, (void *)(uintptr_t)nargv->nv_base,
+		    sizeof(struct nandfs_suinfo) * nargv->nv_nmembs);
+
+	free(nsi, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Convenience wrapper: fetch usage info for up to 'nmembs' segments
+ * starting at 'segment' with no flag filtering.
+ */
+int
+nandfs_get_segment_info(struct nandfs_device *fsdev, struct nandfs_suinfo *nsi,
+    uint32_t nmembs, uint64_t segment)
+{
+
+	return (nandfs_get_segment_info_filter(fsdev, nsi, nmembs, segment,
+	    NULL, 0, 0));
+}
+
+/*
+ * Scan segment usage entries starting at 'segment' and fill up to
+ * 'nmembs' suinfo records.  Entries with any 'nfilter' bit set are
+ * skipped; when 'filter' is non-zero, only entries with at least one
+ * 'filter' bit set are reported.  The current and next segments get
+ * NANDFS_SEGMENT_USAGE_ACTIVE added to their reported flags.  If
+ * 'nsegs' is non-NULL it receives the number of records written.
+ * Serializes against segment construction and takes a shared lock on
+ * the SU-file vnode.  Returns 0 or an errno from a block read.
+ */
+int
+nandfs_get_segment_info_filter(struct nandfs_device *fsdev,
+    struct nandfs_suinfo *nsi, uint32_t nmembs, uint64_t segment,
+    uint64_t *nsegs, uint32_t filter, uint32_t nfilter)
+{
+	struct nandfs_segment_usage *su;
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	uint64_t curr, blocknr, blockoff, i;
+	uint32_t flags;
+	int err = 0;
+
+	/* Sentinel: no SU-file block cached yet */
+	curr = ~(0);
+
+	lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL);
+	su_node = fsdev->nd_su_node;
+
+	VOP_LOCK(NTOV(su_node), LK_SHARED);
+
+	bp = NULL;
+	if (nsegs !=  NULL)
+		*nsegs = 0;
+	/* 'i' counts records written; 'segment' advances every pass */
+	for (i = 0; i < nmembs; segment++) {
+		if (segment == fsdev->nd_fsdata.f_nsegments)
+			break;
+
+		nandfs_seg_usage_blk_offset(fsdev, segment, &blocknr,
+		    &blockoff);
+
+		/* Re-read only when the entry lives in a different block */
+		if (i == 0 || curr != blocknr) {
+			if (bp != NULL)
+				brelse(bp);
+			err = nandfs_bread(su_node, blocknr, NOCRED,
+			    0, &bp);
+			if (err) {
+				goto out;
+			}
+			curr = blocknr;
+		}
+
+		su = SU_USAGE_OFF(bp, blockoff);
+		flags = su->su_flags;
+		if (segment == fsdev->nd_seg_num ||
+		    segment == fsdev->nd_next_seg_num)
+			flags |= NANDFS_SEGMENT_USAGE_ACTIVE;
+
+		if (nfilter != 0 && (flags & nfilter) != 0)
+			continue;
+		if (filter != 0 && (flags & filter) == 0)
+			continue;
+
+		nsi->nsi_num = segment;
+		nsi->nsi_lastmod = su->su_lastmod;
+		nsi->nsi_blocks = su->su_nblocks;
+		nsi->nsi_flags = flags;
+		nsi++;
+		i++;
+		if (nsegs != NULL)
+			(*nsegs)++;
+	}
+
+out:
+	if (bp != NULL)
+		brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	return (err);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_vfsops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1590 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_vfsops.c,v 1.1 2009/07/18 16:31:42 reinoud Exp
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_vfsops.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <machine/_inttypes.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+static MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount structure");
+
+#define	NANDFS_SET_SYSTEMFILE(vp) {	\
+	(vp)->v_vflag |= VV_SYSTEM;	\
+	vref(vp);			\
+	vput(vp); }
+
+#define	NANDFS_UNSET_SYSTEMFILE(vp) {	\
+	VOP_LOCK(vp, LK_EXCLUSIVE);	\
+	MPASS(vp->v_bufobj.bo_dirty.bv_cnt == 0); \
+	(vp)->v_vflag &= ~VV_SYSTEM;	\
+	vgone(vp);			\
+	vput(vp); }
+
+/* Globals */
+struct _nandfs_devices nandfs_devices;
+
+/* Parameters */
+int nandfs_verbose = 0;
+
+/* Fetch the vfs.nandfs.verbose loader tunable at VFS subsystem init. */
+static void
+nandfs_tunable_init(void *arg)
+{
+
+	TUNABLE_INT_FETCH("vfs.nandfs.verbose", &nandfs_verbose);
+}
+SYSINIT(nandfs_tunables, SI_SUB_VFS, SI_ORDER_ANY, nandfs_tunable_init, NULL);
+
+static SYSCTL_NODE(_vfs, OID_AUTO, nandfs, CTLFLAG_RD, 0, "NAND filesystem");
+static SYSCTL_NODE(_vfs_nandfs, OID_AUTO, mount, CTLFLAG_RD, 0,
+    "NANDFS mountpoints");
+SYSCTL_INT(_vfs_nandfs, OID_AUTO, verbose, CTLFLAG_RW, &nandfs_verbose, 0, "");
+
+#define NANDFS_CONSTR_INTERVAL	5
+int nandfs_sync_interval = NANDFS_CONSTR_INTERVAL; /* sync every 5 seconds */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, sync_interval, CTLFLAG_RW,
+    &nandfs_sync_interval, 0, "");
+
+#define NANDFS_MAX_DIRTY_SEGS	5
+int nandfs_max_dirty_segs = NANDFS_MAX_DIRTY_SEGS; /* sync when 5 dirty seg */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, max_dirty_segs, CTLFLAG_RW,
+    &nandfs_max_dirty_segs, 0, "");
+
+#define NANDFS_CPS_BETWEEN_SBLOCKS 5
+int nandfs_cps_between_sblocks = NANDFS_CPS_BETWEEN_SBLOCKS; /* write superblock every 5 checkpoints */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cps_between_sblocks, CTLFLAG_RW,
+    &nandfs_cps_between_sblocks, 0, "");
+
+#define NANDFS_CLEANER_ENABLE 1
+int nandfs_cleaner_enable = NANDFS_CLEANER_ENABLE;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_enable, CTLFLAG_RW,
+    &nandfs_cleaner_enable, 0, "");
+
+#define NANDFS_CLEANER_INTERVAL 5
+int nandfs_cleaner_interval = NANDFS_CLEANER_INTERVAL;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_interval, CTLFLAG_RW,
+    &nandfs_cleaner_interval, 0, "");
+
+#define NANDFS_CLEANER_SEGMENTS 5
+int nandfs_cleaner_segments = NANDFS_CLEANER_SEGMENTS;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_segments, CTLFLAG_RW,
+    &nandfs_cleaner_segments, 0, "");
+
+static int nandfs_mountfs(struct vnode *devvp, struct mount *mp);
+static vfs_mount_t	nandfs_mount;
+static vfs_root_t	nandfs_root;
+static vfs_statfs_t	nandfs_statfs;
+static vfs_unmount_t	nandfs_unmount;
+static vfs_vget_t	nandfs_vget;
+static vfs_sync_t	nandfs_sync;
+static const char *nandfs_opts[] = {
+	"snap", "from", "noatime", NULL
+};
+
+/* System nodes */
+/*
+ * Instantiate the four system inodes (DAT, cpfile, sufile, GC) from
+ * the super-root and flag their vnodes VV_SYSTEM.  On any failure all
+ * four node pointers are passed to nandfs_dispose_node().
+ *
+ * NOTE(review): the errorout path disposes nodes that were never
+ * created — this assumes nandfs_dispose_node() tolerates NULL/unset
+ * entries; confirm against its implementation.
+ */
+static int
+nandfs_create_system_nodes(struct nandfs_device *nandfsdev)
+{
+	int error;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_DAT_INO,
+	    &nandfsdev->nd_super_root.sr_dat, &nandfsdev->nd_dat_node);
+	if (error)
+		goto errorout;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_CPFILE_INO,
+	    &nandfsdev->nd_super_root.sr_cpfile, &nandfsdev->nd_cp_node);
+	if (error)
+		goto errorout;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_SUFILE_INO,
+	    &nandfsdev->nd_super_root.sr_sufile, &nandfsdev->nd_su_node);
+	if (error)
+		goto errorout;
+
+	/* The GC node has no on-media inode backing it */
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_GC_INO,
+	    NULL, &nandfsdev->nd_gc_node);
+	if (error)
+		goto errorout;
+
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
+
+	DPRINTF(VOLUMES, ("System vnodes: dat: %p cp: %p su: %p\n",
+	    NTOV(nandfsdev->nd_dat_node), NTOV(nandfsdev->nd_cp_node),
+	    NTOV(nandfsdev->nd_su_node)));
+	return (0);
+
+errorout:
+	nandfs_dispose_node(&nandfsdev->nd_gc_node);
+	nandfs_dispose_node(&nandfsdev->nd_dat_node);
+	nandfs_dispose_node(&nandfsdev->nd_cp_node);
+	nandfs_dispose_node(&nandfsdev->nd_su_node);
+
+	return (error);
+}
+
+/*
+ * Drop the VV_SYSTEM flag and vgone the system vnodes, but only when
+ * the device is no longer referenced by any mount.
+ */
+static void
+nandfs_release_system_nodes(struct nandfs_device *nandfsdev)
+{
+
+	if (!nandfsdev)
+		return;
+	if (nandfsdev->nd_refcnt > 0)
+		return;
+
+	if (nandfsdev->nd_gc_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
+	if (nandfsdev->nd_dat_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
+	if (nandfsdev->nd_cp_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
+	if (nandfsdev->nd_su_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
+}
+
+/*
+ * Validate a fsdata block: check the magic, then recompute the CRC32
+ * over f_bytes bytes with the checksum field temporarily zeroed (the
+ * stored sum is restored afterwards).  Returns non-zero when valid.
+ */
+static int
+nandfs_check_fsdata_crc(struct nandfs_fsdata *fsdata)
+{
+	uint32_t fsdata_crc, comp_crc;
+
+	if (fsdata->f_magic != NANDFS_FSDATA_MAGIC)
+		return (0);
+
+	/* Preserve CRC */
+	fsdata_crc = fsdata->f_sum;
+
+	/* Calculate */
+	fsdata->f_sum = (0);
+	comp_crc = crc32((uint8_t *)fsdata, fsdata->f_bytes);
+
+	/* Restore */
+	fsdata->f_sum = fsdata_crc;
+
+	/* Check CRC */
+	return (fsdata_crc == comp_crc);
+}
+
+/*
+ * Validate a superblock: check the magic, then recompute the CRC32
+ * over f_sbbytes bytes with the checksum field temporarily zeroed
+ * (the stored sum is restored afterwards).  Returns non-zero when
+ * valid.
+ */
+static int
+nandfs_check_superblock_crc(struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	uint32_t super_crc, comp_crc;
+
+	/* Check super block magic */
+	if (super->s_magic != NANDFS_SUPER_MAGIC)
+		return (0);
+
+	/* Preserve CRC */
+	super_crc = super->s_sum;
+
+	/* Calculate */
+	super->s_sum = (0);
+	comp_crc = crc32((uint8_t *)super, fsdata->f_sbbytes);
+
+	/* Restore */
+	super->s_sum = super_crc;
+
+	/* Check CRC */
+	return (super_crc == comp_crc);
+}
+
+/*
+ * Recompute and store the super block's checksum.  The CRC32 covers the
+ * first f_sbbytes of the structure with the checksum field itself set
+ * to zero, matching nandfs_check_superblock_crc().
+ */
+static void
+nandfs_calc_superblock_crc(struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	super->s_sum = 0;
+	super->s_sum = crc32((uint8_t *)super, fsdata->f_sbbytes);
+}
+
+/*
+ * Return 1 when the given region contains only 0xff bytes (the state of
+ * erased flash), 0 as soon as any other byte is found.
+ */
+static int
+nandfs_is_empty(u_char *area, int size)
+{
+	int idx;
+
+	for (idx = 0; idx < size; idx++) {
+		if (area[idx] != 0xff)
+			return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Number of super-block slots that fit in one erase block, after the
+ * leading NANDFS_SBLOCK_OFFSET_BYTES reserved area.
+ */
+static __inline int
+nandfs_sblocks_in_esize(struct nandfs_device *fsdev)
+{
+	int avail;
+
+	avail = fsdev->nd_erasesize - NANDFS_SBLOCK_OFFSET_BYTES;
+	return (avail / (int)sizeof(struct nandfs_super_block));
+}
+
+/* Total super-block slots across all reserved flash areas. */
+static __inline int
+nandfs_max_sblocks(struct nandfs_device *fsdev)
+{
+
+	return (nandfs_sblocks_in_esize(fsdev) * NANDFS_NFSAREAS);
+}
+
+/* Number of super-block slots that fit in one device sector. */
+static __inline int
+nandfs_sblocks_in_block(struct nandfs_device *fsdev)
+{
+	int per_sector;
+
+	per_sector = fsdev->nd_devblocksize /
+	    sizeof(struct nandfs_super_block);
+	return (per_sector);
+}
+
+/*
+ * Super-block slots in the first sector of an area.  The first sector
+ * loses slots to the NANDFS_SBLOCK_OFFSET_BYTES reserved region; the
+ * result is clamped at zero when the reserved region covers the whole
+ * sector.
+ */
+static __inline int
+nandfs_sblocks_in_first_block(struct nandfs_device *fsdev)
+{
+	int reserved, slots;
+
+	reserved = NANDFS_SBLOCK_OFFSET_BYTES /
+	    sizeof(struct nandfs_super_block);
+	slots = nandfs_sblocks_in_block(fsdev) - reserved;
+
+	return (slots > 0 ? slots : 0);
+}
+
+/*
+ * Store the in-core super block into the next free slot of fs area
+ * 'fstp'.  Slots within an erase block are consumed sequentially; when
+ * the last slot was used, the whole area is erased, nd_fsdata is
+ * rewritten at its head and writing restarts at the first slot after
+ * the fsdata region.  Returns 0 on success or an errno from the
+ * buffer-cache I/O.
+ */
+static int
+nandfs_write_superblock_at(struct nandfs_device *fsdev,
+    struct nandfs_fsarea *fstp)
+{
+	struct nandfs_super_block *super, *supert;
+	struct buf *bp;
+	int sb_per_sector, sbs_in_fsd, read_block;
+	int index, pos, error;
+	off_t offset;
+
+	DPRINTF(SYNC, ("%s: last_used %d nandfs_sblocks_in_esize %d\n",
+	    __func__, fstp->last_used, nandfs_sblocks_in_esize(fsdev)));
+	/* Advance to the next slot, wrapping when the area is exhausted. */
+	if (fstp->last_used == nandfs_sblocks_in_esize(fsdev) - 1)
+		index = 0;
+	else
+		index = fstp->last_used + 1;
+
+	super = &fsdev->nd_super;
+	supert = NULL;
+
+	sb_per_sector = nandfs_sblocks_in_block(fsdev);
+	/* Slots occupied by the fsdata header at the start of the area. */
+	sbs_in_fsd = sizeof(struct nandfs_fsdata) /
+	    sizeof(struct nandfs_super_block);
+	/* From here on 'index' is absolute within the area (fsdata included). */
+	index += sbs_in_fsd;
+	offset = fstp->offset;
+
+	DPRINTF(SYNC, ("%s: offset %#jx s_last_pseg %#jx s_last_cno %#jx "
+	    "s_last_seq %#jx wtime %jd index %d\n", __func__, offset,
+	    super->s_last_pseg, super->s_last_cno, super->s_last_seq,
+	    super->s_wtime, index));
+
+	/* Device block that holds the sector containing slot 'index'. */
+	read_block = btodb(offset + ((index / sb_per_sector) * sb_per_sector)
+	    * sizeof(struct nandfs_super_block));
+
+	DPRINTF(SYNC, ("%s: read_block %#x\n", __func__, read_block));
+
+	if (index == sbs_in_fsd) {
+		/* Wrapped to the start of the area: erase and rewrite fsdata. */
+		error = nandfs_erase(fsdev, offset, fsdev->nd_erasesize);
+		if (error)
+			return (error);
+
+		error = bread(fsdev->nd_devvp, btodb(offset),
+		    fsdev->nd_devblocksize, NOCRED, &bp);
+		if (error) {
+			printf("NANDFS: couldn't read initial data: %d\n",
+			    error);
+			brelse(bp);
+			return (error);
+		}
+		memcpy(bp->b_data, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata));
+		/*
+		 * 0xff-out the rest. This bp could be cached, so potentially
+		 * b_data contains stale super blocks.
+		 *
+		 * We don't mind cached bp since most of the time we just add
+		 * super blocks to already 0xff-out b_data and don't need to
+		 * perform actual read.
+		 */
+		if (fsdev->nd_devblocksize > sizeof(fsdev->nd_fsdata))
+			memset(bp->b_data + sizeof(fsdev->nd_fsdata), 0xff,
+			    fsdev->nd_devblocksize - sizeof(fsdev->nd_fsdata));
+		error = bwrite(bp);
+		if (error) {
+			printf("NANDFS: cannot rewrite initial data at %jx\n",
+			    offset);
+			return (error);
+		}
+	}
+
+	error = bread(fsdev->nd_devvp, read_block, fsdev->nd_devblocksize,
+	    NOCRED, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/* Copy the super block into its slot within the sector. */
+	supert = (struct nandfs_super_block *)(bp->b_data);
+	pos = index % sb_per_sector;
+
+	DPRINTF(SYNC, ("%s: storing at %d\n", __func__, pos));
+	memcpy(&supert[pos], super, sizeof(struct nandfs_super_block));
+
+	/*
+	 * See comment above in code that performs erase.
+	 */
+	if (pos == 0)
+		memset(&supert[1], 0xff,
+		    (sb_per_sector - 1) * sizeof(struct nandfs_super_block));
+
+	error = bwrite(bp);
+	if (error) {
+		printf("NANDFS: cannot update superblock at %jx\n", offset);
+		return (error);
+	}
+
+	DPRINTF(SYNC, ("%s: fstp->last_used %d -> %d\n", __func__,
+	    fstp->last_used, index - sbs_in_fsd));
+	/* Record the slot just written, relative to the fsdata region. */
+	fstp->last_used = index - sbs_in_fsd;
+
+	return (0);
+}
+
+/*
+ * Flush the in-core super block to flash.  The running state (last
+ * partial segment, checkpoint number, sequence and write time) is
+ * refreshed, the CRC recomputed, and the block is written into the
+ * first usable fs area, round-robin starting after the last area used.
+ * Areas that fail are flagged and skipped on subsequent calls.
+ */
+int
+nandfs_write_superblock(struct nandfs_device *fsdev)
+{
+	struct nandfs_super_block *super;
+	struct timespec ts;
+	int error;
+	int i, j;
+
+	vfs_timestamp(&ts);
+
+	super = &fsdev->nd_super;
+
+	super->s_last_pseg = fsdev->nd_last_pseg;
+	super->s_last_cno = fsdev->nd_last_cno;
+	super->s_last_seq = fsdev->nd_seg_sequence;
+	super->s_wtime = ts.tv_sec;
+
+	nandfs_calc_superblock_crc(&fsdev->nd_fsdata, super);
+
+	error = 0;
+	/*
+	 * The modulo must wrap the whole sum: the previous form
+	 * "j = (j + 1 % NANDFS_NFSAREAS)" parsed as "j = j + 1", so j
+	 * never wrapped and could index past nd_fsarea[] when starting
+	 * from a non-zero nd_last_fsarea.
+	 */
+	for (i = 0, j = fsdev->nd_last_fsarea; i < NANDFS_NFSAREAS;
+	    i++, j = (j + 1) % NANDFS_NFSAREAS) {
+		if (fsdev->nd_fsarea[j].flags & NANDFS_FSSTOR_FAILED) {
+			DPRINTF(SYNC, ("%s: skipping %d\n", __func__, j));
+			continue;
+		}
+		error = nandfs_write_superblock_at(fsdev, &fsdev->nd_fsarea[j]);
+		if (error) {
+			/* %ju with a uintmax_t product: the old "%d" with an
+			 * unsigned j * nd_erasesize was a format mismatch. */
+			printf("NANDFS: writing superblock at offset %ju "
+			    "failed: %d\n",
+			    (uintmax_t)j * fsdev->nd_erasesize, error);
+			fsdev->nd_fsarea[j].flags |= NANDFS_FSSTOR_FAILED;
+		} else
+			break;
+	}
+
+	if (i == NANDFS_NFSAREAS) {
+		printf("NANDFS: superblock was not written\n");
+		/*
+		 * TODO: switch to read-only?
+		 */
+		return (error);
+	} else
+		fsdev->nd_last_fsarea = (j + 1) % NANDFS_NFSAREAS;
+
+	return (0);
+}
+
+/*
+ * Pick the first fsdata copy (out of 'nfsds' candidates) whose magic
+ * and CRC validate.  On success *fsdata points into 'fsdatat' and 0 is
+ * returned; EINVAL when no copy is usable.
+ */
+static int
+nandfs_select_fsdata(struct nandfs_device *fsdev,
+    struct nandfs_fsdata *fsdatat, struct nandfs_fsdata **fsdata, int nfsds)
+{
+	int idx;
+
+	*fsdata = NULL;
+	for (idx = 0; idx < nfsds; idx++) {
+		DPRINTF(VOLUMES, ("%s: i %d f_magic %x f_crc %x\n", __func__,
+		    idx, fsdatat[idx].f_magic, fsdatat[idx].f_sum));
+		if (nandfs_check_fsdata_crc(&fsdatat[idx])) {
+			*fsdata = &fsdatat[idx];
+			break;
+		}
+	}
+
+	return (*fsdata == NULL ? EINVAL : 0);
+}
+
+/*
+ * Among all CRC-valid super-block copies in 'supert', select the one
+ * with the highest last checkpoint number.  On success *super points
+ * into 'supert' and 0 is returned; EINVAL when none validates.
+ */
+static int
+nandfs_select_sb(struct nandfs_device *fsdev,
+    struct nandfs_super_block *supert, struct nandfs_super_block **super,
+    int nsbs)
+{
+	struct nandfs_super_block *best, *cur;
+	int idx;
+
+	best = NULL;
+	for (idx = 0; idx < nsbs; idx++) {
+		cur = &supert[idx];
+		if (!nandfs_check_superblock_crc(&fsdev->nd_fsdata, cur))
+			continue;
+		DPRINTF(SYNC, ("%s: i %d s_last_cno %jx s_magic %x "
+		    "s_wtime %jd\n", __func__, idx, cur->s_last_cno,
+		    cur->s_magic, cur->s_wtime));
+		if (best == NULL || cur->s_last_cno > best->s_last_cno)
+			best = cur;
+	}
+
+	*super = best;
+	return (best == NULL ? EINVAL : 0);
+}
+
+/*
+ * Read one fs area from flash: the fsdata header into 'fsdata' and all
+ * super-block slots into 'super'.  The area may be larger than MAXBSIZE,
+ * so it is read in MAXBSIZE-sized chunks.  On success fstp->last_used is
+ * set to the index of the last non-erased slot; on I/O failure the area
+ * is flagged NANDFS_FSSTOR_FAILED and the errno returned.
+ */
+static int
+nandfs_read_structures_at(struct nandfs_device *fsdev,
+    struct nandfs_fsarea *fstp, struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	struct nandfs_super_block *tsuper, *tsuperd;
+	struct buf *bp;
+	int error, read_size;
+	int i;
+	int offset;
+
+	offset = fstp->offset;
+
+	/* Clamp a single read to the maximum buffer-cache transfer size. */
+	if (fsdev->nd_erasesize > MAXBSIZE)
+		read_size = MAXBSIZE;
+	else
+		read_size = fsdev->nd_erasesize;
+
+	error = bread(fsdev->nd_devvp, btodb(offset), read_size, NOCRED, &bp);
+	if (error) {
+		printf("couldn't read: %d\n", error);
+		brelse(bp);
+		fstp->flags |= NANDFS_FSSTOR_FAILED;
+		return (error);
+	}
+
+	tsuper = super;
+
+	/* First chunk: fsdata header followed by the first super blocks. */
+	memcpy(fsdata, bp->b_data, sizeof(struct nandfs_fsdata));
+	memcpy(tsuper, (bp->b_data + sizeof(struct nandfs_fsdata)),
+	    read_size - sizeof(struct nandfs_fsdata));
+	brelse(bp);
+
+	tsuper += (read_size - sizeof(struct nandfs_fsdata)) /
+	    sizeof(struct nandfs_super_block);
+
+	/* Remaining chunks of the erase block are super blocks only. */
+	for (i = 1; i < fsdev->nd_erasesize / read_size; i++) {
+		error = bread(fsdev->nd_devvp, btodb(offset + i * read_size),
+		    read_size, NOCRED, &bp);
+		if (error) {
+			printf("couldn't read: %d\n", error);
+			brelse(bp);
+			fstp->flags |= NANDFS_FSSTOR_FAILED;
+			return (error);
+		}
+		memcpy(tsuper, bp->b_data, read_size);
+		tsuper += read_size / sizeof(struct nandfs_super_block);
+		brelse(bp);
+	}
+
+	/*
+	 * Walk backwards from the last slot, decrementing last_used for
+	 * every still-erased (all 0xff) slot, stopping at the first slot
+	 * that holds data.
+	 */
+	tsuper -= 1;
+	fstp->last_used = nandfs_sblocks_in_esize(fsdev) - 1;
+	for (tsuperd = super - 1; (tsuper != tsuperd); tsuper -= 1) {
+		if (nandfs_is_empty((u_char *)tsuper, sizeof(*tsuper)))
+			fstp->last_used--;
+		else
+			break;
+	}
+
+	DPRINTF(VOLUMES, ("%s: last_used %d\n", __func__, fstp->last_used));
+
+	return (0);
+}
+
+/*
+ * Scan all reserved fs areas, gather every fsdata and super-block copy
+ * found on flash, then select the valid fsdata and the newest valid
+ * super block into fsdev->nd_fsdata / fsdev->nd_super.  Returns 0 on
+ * success or EINVAL when nothing usable was found.
+ */
+static int
+nandfs_read_structures(struct nandfs_device *fsdev)
+{
+	struct nandfs_fsdata *fsdata, *fsdatat;
+	struct nandfs_super_block *sblocks, *ssblock;
+	int nsbs, nfsds, i;
+	int error = 0;
+	int nrsbs;
+
+	nfsds = NANDFS_NFSAREAS;
+	nsbs = nandfs_max_sblocks(fsdev);
+
+	/* Temporary arrays holding every copy read from flash. */
+	fsdatat = malloc(sizeof(struct nandfs_fsdata) * nfsds, M_NANDFSTEMP,
+	    M_WAITOK | M_ZERO);
+	sblocks = malloc(sizeof(struct nandfs_super_block) * nsbs, M_NANDFSTEMP,
+	    M_WAITOK | M_ZERO);
+
+	nrsbs = 0;
+	for (i = 0; i < NANDFS_NFSAREAS; i++) {
+		fsdev->nd_fsarea[i].offset = i * fsdev->nd_erasesize;
+		error = nandfs_read_structures_at(fsdev, &fsdev->nd_fsarea[i],
+		    &fsdatat[i], sblocks + nrsbs);
+		if (error)
+			continue;
+		nrsbs += (fsdev->nd_fsarea[i].last_used + 1);
+		/*
+		 * NOTE(review): this keeps the area with the SMALLER
+		 * last_used as nd_last_fsarea — presumably to continue
+		 * writing in the least-filled area; confirm intended.
+		 */
+		if (fsdev->nd_fsarea[fsdev->nd_last_fsarea].last_used >
+		    fsdev->nd_fsarea[i].last_used)
+			fsdev->nd_last_fsarea = i;
+	}
+
+	if (nrsbs == 0) {
+		printf("nandfs: no valid superblocks found\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	error = nandfs_select_fsdata(fsdev, fsdatat, &fsdata, nfsds);
+	if (error)
+		goto out;
+	memcpy(&fsdev->nd_fsdata, fsdata, sizeof(struct nandfs_fsdata));
+
+	error = nandfs_select_sb(fsdev, sblocks, &ssblock, nsbs);
+	if (error)
+		goto out;
+
+	memcpy(&fsdev->nd_super, ssblock, sizeof(struct nandfs_super_block));
+out:
+	free(fsdatat, M_NANDFSTEMP);
+	free(sblocks, M_NANDFSTEMP);
+
+	if (error == 0)
+		DPRINTF(VOLUMES, ("%s: selected sb with w_time %jd "
+		    "last_pseg %#jx\n", __func__, fsdev->nd_super.s_wtime,
+		    fsdev->nd_super.s_last_pseg));
+
+	return (error);
+}
+
+/*
+ * Tear down the per-device base state built by nandfs_mount_base():
+ * flush remaining buffers on the device vnode and release the system
+ * vnodes.  Flush failures are logged and ignored since the unmount
+ * cannot be aborted at this point.
+ */
+static void
+nandfs_unmount_base(struct nandfs_device *nandfsdev)
+{
+	int error;
+
+	if (!nandfsdev)
+		return;
+
+	/* Remove all our information */
+	error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0);
+	if (error) {
+		/*
+		 * Flushing buffers failed when fs was umounting, can't do
+		 * much now, just printf error and continue with umount.
+		 */
+		nandfs_error("%s(): error:%d when umounting FS\n",
+		    __func__, error);
+	}
+
+	/* Release the device's system nodes */
+	nandfs_release_system_nodes(nandfsdev);
+}
+
+/* Cache the current clean-segment count on the device softc. */
+static void
+nandfs_get_ncleanseg(struct nandfs_device *nandfsdev)
+{
+	struct nandfs_seg_stat stats;
+
+	nandfs_get_seg_stat(nandfsdev, &stats);
+	nandfsdev->nd_clean_segs = stats.nss_ncleansegs;
+	DPRINTF(VOLUMES, ("nandfs_mount: clean segs: %jx\n",
+	    (uintmax_t)nandfsdev->nd_clean_segs));
+}
+
+
+/*
+ * Bring up the shared per-device state on first mount: read and verify
+ * the on-flash structures, validate revision and geometry, locate the
+ * super root, extract the running segment/checkpoint state, and create
+ * the system vnodes.  On failure the partially-built base is torn down
+ * via nandfs_unmount_base().
+ */
+static int
+nandfs_mount_base(struct nandfs_device *nandfsdev, struct mount *mp,
+    struct nandfs_args *args)
+{
+	uint32_t log_blocksize;
+	int error;
+
+	/* Flush out any old buffers remaining from a previous use. */
+	if ((error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0)))
+		return (error);
+
+	error = nandfs_read_structures(nandfsdev);
+	if (error) {
+		printf("nandfs: could not get valid filesystem structures\n");
+		return (error);
+	}
+
+	if (nandfsdev->nd_fsdata.f_rev_level != NANDFS_CURRENT_REV) {
+		printf("nandfs: unsupported file system revision: %d "
+		    "(supported is %d).\n", nandfsdev->nd_fsdata.f_rev_level,
+		    NANDFS_CURRENT_REV);
+		return (EINVAL);
+	}
+
+	/* The erase size recorded at mkfs time must match the device. */
+	if (nandfsdev->nd_fsdata.f_erasesize != nandfsdev->nd_erasesize) {
+		printf("nandfs: erasesize mismatch (device %#x, fs %#x)\n",
+		    nandfsdev->nd_erasesize, nandfsdev->nd_fsdata.f_erasesize);
+		return (EINVAL);
+	}
+
+	/* Get our blocksize: stored as log2(blocksize) - 10. */
+	log_blocksize = nandfsdev->nd_fsdata.f_log_block_size;
+	nandfsdev->nd_blocksize = (uint64_t) 1 << (log_blocksize + 10);
+	DPRINTF(VOLUMES, ("%s: blocksize:%x\n", __func__,
+	    nandfsdev->nd_blocksize));
+
+	DPRINTF(VOLUMES, ("%s: accepted super block with cp %#jx\n", __func__,
+	    (uintmax_t)nandfsdev->nd_super.s_last_cno));
+
+	/* Calculate dat structure parameters */
+	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_dat_mdt,
+	    nandfsdev->nd_fsdata.f_dat_entry_size);
+	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_ifile_mdt,
+	    nandfsdev->nd_fsdata.f_inode_size);
+
+	/* Search for the super root and roll forward when needed */
+	if (nandfs_search_super_root(nandfsdev)) {
+		printf("Cannot find valid SuperRoot\n");
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_mount_state = nandfsdev->nd_super.s_state;
+	if (nandfsdev->nd_mount_state != NANDFS_VALID_FS) {
+		printf("FS is seriously damaged, needs repairing\n");
+		printf("aborting mount\n");
+		return (EINVAL);
+	}
+
+	/*
+	 * FS should be ok now. The superblock and the last segsum could be
+	 * updated from the repair so extract running values again.
+	 */
+	nandfsdev->nd_last_pseg = nandfsdev->nd_super.s_last_pseg;
+	nandfsdev->nd_seg_sequence = nandfsdev->nd_super.s_last_seq;
+	nandfsdev->nd_seg_num = nandfs_get_segnum_of_block(nandfsdev,
+	    nandfsdev->nd_last_pseg);
+	nandfsdev->nd_next_seg_num = nandfs_get_segnum_of_block(nandfsdev,
+	    nandfsdev->nd_last_segsum.ss_next);
+	nandfsdev->nd_ts.tv_sec = nandfsdev->nd_last_segsum.ss_create;
+	nandfsdev->nd_last_cno = nandfsdev->nd_super.s_last_cno;
+	nandfsdev->nd_fakevblk = 1;
+	nandfsdev->nd_last_ino  = NANDFS_USER_INO;
+	DPRINTF(VOLUMES, ("%s: last_pseg %#jx last_cno %#jx last_seq %#jx\n"
+	    "fsdev: last_seg: seq %#jx num %#jx, next_seg_num %#jx\n",
+	    __func__, (uintmax_t)nandfsdev->nd_last_pseg,
+	    (uintmax_t)nandfsdev->nd_last_cno,
+	    (uintmax_t)nandfsdev->nd_seg_sequence,
+	    (uintmax_t)nandfsdev->nd_seg_sequence,
+	    (uintmax_t)nandfsdev->nd_seg_num,
+	    (uintmax_t)nandfsdev->nd_next_seg_num));
+
+	DPRINTF(VOLUMES, ("nandfs_mount: accepted super root\n"));
+
+	/* Create system vnodes for DAT, CP and SEGSUM */
+	error = nandfs_create_system_nodes(nandfsdev);
+	if (error)
+		nandfs_unmount_base(nandfsdev);
+
+	nandfs_get_ncleanseg(nandfsdev);
+
+	return (error);
+}
+
+/*
+ * Drop one reference on the shared device state; when the last mount is
+ * gone, tear down the base, unlink the device from the global list,
+ * close the GEOM consumer, release the device vnode, and free all
+ * synchronization primitives and the softc itself.
+ */
+static void
+nandfs_unmount_device(struct nandfs_device *nandfsdev)
+{
+
+	/* Is there anything? */
+	if (nandfsdev == NULL)
+		return;
+
+	/* Remove the device only if we're the last reference */
+	nandfsdev->nd_refcnt--;
+	if (nandfsdev->nd_refcnt >= 1)
+		return;
+
+	/* Syncer and cleaner must have exited before the last unmount. */
+	MPASS(nandfsdev->nd_syncer == NULL);
+	MPASS(nandfsdev->nd_cleaner == NULL);
+	MPASS(nandfsdev->nd_free_base == NULL);
+
+	/* Unmount our base */
+	nandfs_unmount_base(nandfsdev);
+
+	/* Remove from our device list */
+	SLIST_REMOVE(&nandfs_devices, nandfsdev, nandfs_device, nd_next_device);
+
+	DROP_GIANT();
+	g_topology_lock();
+	g_vfs_close(nandfsdev->nd_gconsumer);
+	g_topology_unlock();
+	PICKUP_GIANT();
+
+	DPRINTF(VOLUMES, ("closing device\n"));
+
+	/* Clear our mount reference and release device node */
+	vrele(nandfsdev->nd_devvp);
+
+	dev_rel(nandfsdev->nd_devvp->v_rdev);
+
+	/* Free our device info */
+	cv_destroy(&nandfsdev->nd_sync_cv);
+	mtx_destroy(&nandfsdev->nd_sync_mtx);
+	cv_destroy(&nandfsdev->nd_clean_cv);
+	mtx_destroy(&nandfsdev->nd_clean_mtx);
+	mtx_destroy(&nandfsdev->nd_mutex);
+	lockdestroy(&nandfsdev->nd_seg_const);
+	free(nandfsdev, M_NANDFSMNT);
+}
+
+/*
+ * Decide whether an additional mount of an already-mounted device is
+ * permitted.  Rules: a checkpoint may be mounted only once (EBUSY);
+ * read-only mounts are otherwise always allowed; at most one
+ * read/write mount may exist (EROFS), and the head checkpoint cannot
+ * be double-used (EBUSY).  Returns 0 when the mount may proceed.
+ */
+static int
+nandfs_check_mounts(struct nandfs_device *nandfsdev, struct mount *mp,
+    struct nandfs_args *args)
+{
+	struct nandfsmount *nmp;
+	uint64_t last_cno;
+
+	/* no double-mounting of the same checkpoint */
+	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
+		if (nmp->nm_mount_args.cpno == args->cpno)
+			return (EBUSY);
+	}
+
+	/* Allow readonly mounts without questioning here */
+	if (mp->mnt_flag & MNT_RDONLY)
+		return (0);
+
+	/* Read/write mount */
+	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
+		/* Only one RW mount on this device! */
+		if ((nmp->nm_vfs_mountp->mnt_flag & MNT_RDONLY)==0)
+			return (EROFS);
+		/* RDONLY on last mountpoint is device busy */
+		last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+		if (nmp->nm_mount_args.cpno == last_cno)
+			return (EBUSY);
+	}
+
+	/* OK for now */
+	return (0);
+}
+
+/*
+ * Attach (or re-reference) the shared nandfs_device for 'devvp'.  If the
+ * device is already mounted, only the mount-compatibility checks and a
+ * refcount/GEOM-access bump are done.  Otherwise the GEOM consumer is
+ * opened, the softc allocated and initialized, the erase size probed
+ * (falling back to a default for non-NAND media), and the base state is
+ * mounted.  On success *nandfsdev_p holds the device.
+ */
+static int
+nandfs_mount_device(struct vnode *devvp, struct mount *mp,
+    struct nandfs_args *args, struct nandfs_device **nandfsdev_p)
+{
+	struct nandfs_device *nandfsdev;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	struct cdev *dev;
+	uint32_t erasesize;
+	int error, size;
+	int ronly;
+
+	DPRINTF(VOLUMES, ("Mounting NANDFS device\n"));
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+	/* Look up device in our nandfs_mountpoints */
+	*nandfsdev_p = NULL;
+	SLIST_FOREACH(nandfsdev, &nandfs_devices, nd_next_device)
+		if (nandfsdev->nd_devvp == devvp)
+			break;
+
+	if (nandfsdev) {
+		DPRINTF(VOLUMES, ("device already mounted\n"));
+		error = nandfs_check_mounts(nandfsdev, mp, args);
+		if (error)
+			return error;
+		nandfsdev->nd_refcnt++;
+		*nandfsdev_p = nandfsdev;
+
+		/*
+		 * NOTE(review): if g_access() fails below, nd_refcnt has
+		 * already been bumped yet the error is returned — the
+		 * reference appears to leak; confirm callers compensate.
+		 */
+		if (!ronly) {
+			DROP_GIANT();
+			g_topology_lock();
+			error = g_access(nandfsdev->nd_gconsumer, 0, 1, 0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+		}
+		return (error);
+	}
+
+	/* First mount of this device: open it through GEOM. */
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	dev = devvp->v_rdev;
+	dev_ref(dev);
+	DROP_GIANT();
+	g_topology_lock();
+	error = g_vfs_open(devvp, &cp, "nandfs", ronly ? 0 : 1);
+	pp = g_dev_getprovider(dev);
+	g_topology_unlock();
+	PICKUP_GIANT();
+	VOP_UNLOCK(devvp, 0);
+	if (error) {
+		dev_rel(dev);
+		return (error);
+	}
+
+	nandfsdev = malloc(sizeof(struct nandfs_device), M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	/* Initialise */
+	nandfsdev->nd_refcnt = 1;
+	nandfsdev->nd_devvp = devvp;
+	nandfsdev->nd_syncing = 0;
+	nandfsdev->nd_cleaning = 0;
+	nandfsdev->nd_gconsumer = cp;
+	cv_init(&nandfsdev->nd_sync_cv, "nandfssync");
+	mtx_init(&nandfsdev->nd_sync_mtx, "nffssyncmtx", NULL, MTX_DEF);
+	cv_init(&nandfsdev->nd_clean_cv, "nandfsclean");
+	mtx_init(&nandfsdev->nd_clean_mtx, "nffscleanmtx", NULL, MTX_DEF);
+	mtx_init(&nandfsdev->nd_mutex, "nandfsdev lock", NULL, MTX_DEF);
+	lockinit(&nandfsdev->nd_seg_const, PVFS, "nffssegcon", VLKTIMEOUT,
+	    LK_CANRECURSE);
+	STAILQ_INIT(&nandfsdev->nd_mounts);
+
+	nandfsdev->nd_devsize = pp->mediasize;
+	nandfsdev->nd_devblocksize = pp->sectorsize;
+
+	/* Probe the NAND erase-block size through the GEOM attribute. */
+	size = sizeof(erasesize);
+	error = g_io_getattr("NAND::blocksize", nandfsdev->nd_gconsumer, &size,
+	    &erasesize);
+	if (error) {
+		DPRINTF(VOLUMES, ("couldn't get erasesize: %d\n", error));
+
+		if (error == ENOIOCTL || error == EOPNOTSUPP) {
+			/*
+			 * We conclude that this is not NAND storage
+			 */
+			nandfsdev->nd_erasesize = NANDFS_DEF_ERASESIZE;
+			nandfsdev->nd_is_nand = 0;
+		} else {
+			/* Real I/O error: undo the GEOM open and bail out. */
+			DROP_GIANT();
+			g_topology_lock();
+			g_vfs_close(nandfsdev->nd_gconsumer);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			dev_rel(dev);
+			free(nandfsdev, M_NANDFSMNT);
+			return (error);
+		}
+	} else {
+		nandfsdev->nd_erasesize = erasesize;
+		nandfsdev->nd_is_nand = 1;
+	}
+
+	DPRINTF(VOLUMES, ("%s: erasesize %x\n", __func__,
+	    nandfsdev->nd_erasesize));
+
+	/* Register nandfs_device in list */
+	SLIST_INSERT_HEAD(&nandfs_devices, nandfsdev, nd_next_device);
+
+	error = nandfs_mount_base(nandfsdev, mp, args);
+	if (error) {
+		/* Remove all our information */
+		nandfs_unmount_device(nandfsdev);
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_maxfilesize = nandfs_get_maxfilesize(nandfsdev);
+
+	*nandfsdev_p = nandfsdev;
+	DPRINTF(VOLUMES, ("NANDFS device mounted ok\n"));
+
+	return (0);
+}
+
+/*
+ * Attach the mount to its checkpoint: read the cpfile header, locate
+ * the requested checkpoint (cpno 0 means the device's last one),
+ * validate it (must be the expected number; must be a snapshot unless
+ * it is the head checkpoint), and instantiate the ifile node from the
+ * inode stored in the checkpoint.  Returns 0 on success.
+ */
+static int
+nandfs_mount_checkpoint(struct nandfsmount *nmp)
+{
+	struct nandfs_cpfile_header *cphdr;
+	struct nandfs_checkpoint *cp;
+	struct nandfs_inode ifile_inode;
+	struct nandfs_node *cp_node;
+	struct buf *bp;
+	uint64_t ncp, nsn, cpno, fcpno, blocknr, last_cno;
+	uint32_t off, dlen;
+	int cp_per_block, error;
+
+	cpno = nmp->nm_mount_args.cpno;
+	if (cpno == 0)
+		cpno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+
+	DPRINTF(VOLUMES, ("%s: trying to mount checkpoint number %"PRIu64"\n",
+	    __func__, cpno));
+
+	cp_node = nmp->nm_nandfsdev->nd_cp_node;
+
+	VOP_LOCK(NTOV(cp_node), LK_SHARED);
+	/* Get cpfile header from 1st block of cp file */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (error);
+	}
+
+	cphdr = (struct nandfs_cpfile_header *) bp->b_data;
+	ncp = cphdr->ch_ncheckpoints;
+	nsn = cphdr->ch_nsnapshots;
+
+	brelse(bp);
+
+	DPRINTF(VOLUMES, ("mount_nandfs: checkpoint header read in\n"));
+	DPRINTF(VOLUMES, ("\tNumber of checkpoints %"PRIu64"\n", ncp));
+	DPRINTF(VOLUMES, ("\tNumber of snapshots %"PRIu64"\n", nsn));
+
+	/* Read in our specified checkpoint */
+	dlen = nmp->nm_nandfsdev->nd_fsdata.f_checkpoint_size;
+	cp_per_block = nmp->nm_nandfsdev->nd_blocksize / dlen;
+
+	/* Map checkpoint number to (block, offset) within the cp file. */
+	fcpno = cpno + NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET - 1;
+	blocknr = fcpno / cp_per_block;
+	off = (fcpno % cp_per_block) * dlen;
+	error = nandfs_bread(cp_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		printf("mount_nandfs: couldn't read cp block %"PRIu64"\n",
+		    fcpno);
+		return (EINVAL);
+	}
+
+	/* Needs to be a valid checkpoint */
+	cp = (struct nandfs_checkpoint *) ((uint8_t *) bp->b_data + off);
+	if (cp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		printf("mount_nandfs: checkpoint marked invalid\n");
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	/* Is this really the checkpoint we want? */
+	if (cp->cp_cno != cpno) {
+		printf("mount_nandfs: checkpoint file corrupt? "
+		    "expected cpno %"PRIu64", found cpno %"PRIu64"\n",
+		    cpno, cp->cp_cno);
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	/* Check if it's a snapshot ! */
+	last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+	if (cpno != last_cno) {
+		/* Only allow snapshots if not mounting on the last cp */
+		if ((cp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) == 0) {
+			printf( "mount_nandfs: checkpoint %"PRIu64" is not a "
+			    "snapshot\n", cpno);
+			brelse(bp);
+			VOP_UNLOCK(NTOV(cp_node), 0);
+			return (EINVAL);
+		}
+	}
+
+	/* Copy the ifile inode out before releasing the buffer. */
+	ifile_inode = cp->cp_ifile_inode;
+	brelse(bp);
+
+	/* Get ifile inode */
+	error = nandfs_get_node_raw(nmp->nm_nandfsdev, NULL, NANDFS_IFILE_INO,
+	    &ifile_inode, &nmp->nm_ifile_node);
+	if (error) {
+		printf("mount_nandfs: can't read ifile node\n");
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	NANDFS_SET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
+	VOP_UNLOCK(NTOV(cp_node), 0);
+	/* Get root node? */
+
+	return (0);
+}
+
+/* Release the per-mount softc, if one was ever attached to 'mp'. */
+static void
+free_nandfs_mountinfo(struct mount *mp)
+{
+	struct nandfsmount *nmp;
+
+	nmp = VFSTONANDFS(mp);
+	if (nmp != NULL)
+		free(nmp, M_NANDFSMNT);
+}
+
+/*
+ * Trigger a synchronous segment-construction pass and wait for it to
+ * complete.  If a pass is already in flight we first wait for it to
+ * finish, then start our own.  SYNCER_UMOUNT additionally tells the
+ * syncer thread to exit after this pass.
+ */
+void
+nandfs_wakeup_wait_sync(struct nandfs_device *nffsdev, int reason)
+{
+	/*
+	 * Names for the SYNCER_* reason codes.  Static const so the
+	 * table is not rebuilt on every call and the string-literal
+	 * pointers cannot be written through.
+	 */
+	static const char *reasons[] = {
+	    "umount",
+	    "vfssync",
+	    "bdflush",
+	    "fforce",
+	    "fsync",
+	    "ro_upd"
+	};
+
+	DPRINTF(SYNC, ("%s: %s\n", __func__, reasons[reason]));
+	mtx_lock(&nffsdev->nd_sync_mtx);
+	/* Wait for any pass already in progress. */
+	if (nffsdev->nd_syncing)
+		cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
+	if (reason == SYNCER_UMOUNT)
+		nffsdev->nd_syncer_exit = 1;
+	/* Start a new pass and sleep until the syncer broadcasts completion. */
+	nffsdev->nd_syncing = 1;
+	wakeup(&nffsdev->nd_syncing);
+	cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
+
+	mtx_unlock(&nffsdev->nd_sync_mtx);
+}
+
+/*
+ * Mark the current sync/cleaner pass finished and wake any thread in
+ * nandfs_wakeup_wait_sync().  Unless 'exit' is set, sleep until either
+ * the next explicit wakeup or the periodic sync interval elapses.
+ */
+static void
+nandfs_gc_finished(struct nandfs_device *nffsdev, int exit)
+{
+	int error;
+
+	mtx_lock(&nffsdev->nd_sync_mtx);
+	nffsdev->nd_syncing = 0;
+	DPRINTF(SYNC, ("%s: cleaner finish\n", __func__));
+	cv_broadcast(&nffsdev->nd_sync_cv);
+	mtx_unlock(&nffsdev->nd_sync_mtx);
+	if (!exit) {
+		/* error is only reported via DPRINTF; timeouts are normal. */
+		error = tsleep(&nffsdev->nd_syncing, PRIBIO, "-",
+		    hz * nandfs_sync_interval);
+		DPRINTF(SYNC, ("%s: cleaner waked up: %d\n",
+		    __func__, error));
+	}
+}
+
+/*
+ * Kernel-thread body of the per-mount syncer: loop building segments
+ * (periodically or on demand) until nd_syncer_exit is set, then run a
+ * final forced pass for umount and exit.  Construction errors are
+ * logged but do not stop the loop.
+ */
+static void
+nandfs_syncer(struct nandfsmount *nmp)
+{
+	struct nandfs_device *nffsdev;
+	struct mount *mp;
+	int flags, error;
+
+	mp = nmp->nm_vfs_mountp;
+	nffsdev = nmp->nm_nandfsdev;
+	/* Initial delay before the first periodic pass. */
+	tsleep(&nffsdev->nd_syncing, PRIBIO, "-", hz * nandfs_sync_interval);
+
+	while (!nffsdev->nd_syncer_exit) {
+		DPRINTF(SYNC, ("%s: syncer run\n", __func__));
+		nffsdev->nd_syncing = 1;
+
+		/* Pick up force/umount requests posted on the mount. */
+		flags = (nmp->nm_flags & (NANDFS_FORCE_SYNCER | NANDFS_UMOUNT));
+
+		error = nandfs_segment_constructor(nmp, flags);
+		if (error)
+			nandfs_error("%s: error:%d when creating segments\n",
+			    __func__, error);
+
+		nmp->nm_flags &= ~flags;
+
+		nandfs_gc_finished(nffsdev, 0);
+	}
+
+	/* Final forced pass on umount; the cleaner must already be gone. */
+	MPASS(nffsdev->nd_cleaner == NULL);
+	error = nandfs_segment_constructor(nmp,
+	    NANDFS_FORCE_SYNCER | NANDFS_UMOUNT);
+	if (error)
+		nandfs_error("%s: error:%d when creating segments\n",
+		    __func__, error);
+	nandfs_gc_finished(nffsdev, 1);
+	nffsdev->nd_syncer = NULL;
+	MPASS(nffsdev->nd_free_base == NULL);
+
+	DPRINTF(SYNC, ("%s: exiting\n", __func__));
+	kthread_exit();
+}
+
+/*
+ * Spawn the syncer kernel thread for this mount.  Returns 0 on success
+ * or the kthread_add() errno (also logged).
+ */
+static int
+start_syncer(struct nandfsmount *nmp)
+{
+	int error;
+
+	MPASS(nmp->nm_nandfsdev->nd_syncer == NULL);
+
+	DPRINTF(SYNC, ("%s: start syncer\n", __func__));
+
+	nmp->nm_nandfsdev->nd_syncer_exit = 0;
+
+	error = kthread_add((void(*)(void *))nandfs_syncer, nmp, NULL,
+	    &nmp->nm_nandfsdev->nd_syncer, 0, 0, "nandfs_syncer");
+
+	if (error)
+		printf("nandfs: could not start syncer: %d\n", error);
+
+	return (error);
+}
+
+/*
+ * Stop the syncer thread: post the umount reason and wait for the final
+ * pass to complete (the thread clears nd_syncer and exits on its own).
+ */
+static int
+stop_syncer(struct nandfsmount *nmp)
+{
+
+	MPASS(nmp->nm_nandfsdev->nd_syncer != NULL);
+
+	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_UMOUNT);
+
+	DPRINTF(SYNC, ("%s: stop syncer\n", __func__));
+	return (0);
+}
+
+/*
+ * VFS_MOUNT entry point for nandfs.  Handles MNT_UPDATE transitions
+ * between read-only and read/write (starting/stopping the syncer and
+ * cleaner and adjusting GEOM write access), or, for a fresh mount,
+ * looks up and permission-checks the device node and hands off to
+ * nandfs_mountfs().
+ */
+static int
+nandfs_mount(struct mount *mp)
+{
+	struct nandfsmount *nmp;
+	struct vnode *devvp;
+	struct nameidata nd;
+	struct vfsoptlist *opts;
+	struct thread *td;
+	char *from;
+	int error = 0, flags;
+
+	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
+
+	td = curthread;
+	opts = mp->mnt_optnew;
+
+	if (vfs_filteropt(opts, nandfs_opts))
+		return (EINVAL);
+
+	/*
+	 * Update is a no-op
+	 */
+	if (mp->mnt_flag & MNT_UPDATE) {
+		nmp = VFSTONANDFS(mp);
+		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
+			return (error);
+		}
+		/* Downgrade r/w -> read-only. */
+		if (!(nmp->nm_ronly) && vfs_flagopt(opts, "ro", NULL, 0)) {
+			vn_start_write(NULL, &mp, V_WAIT);
+			error = VFS_SYNC(mp, MNT_WAIT);
+			if (error)
+				return (error);
+			vn_finished_write(mp);
+
+			flags = WRITECLOSE;
+			if (mp->mnt_flag & MNT_FORCE)
+				flags |= FORCECLOSE;
+
+			/* Force a final segment write before downgrading. */
+			nandfs_wakeup_wait_sync(nmp->nm_nandfsdev,
+			    SYNCER_ROUPD);
+			error = vflush(mp, 0, flags, td);
+			if (error)
+				return (error);
+
+			nandfs_stop_cleaner(nmp->nm_nandfsdev);
+			stop_syncer(nmp);
+			/* Drop our GEOM write access. */
+			DROP_GIANT();
+			g_topology_lock();
+			g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1, 0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			MNT_ILOCK(mp);
+			mp->mnt_flag |= MNT_RDONLY;
+			MNT_IUNLOCK(mp);
+			nmp->nm_ronly = 1;
+
+		/* Upgrade read-only -> r/w. */
+		} else if ((nmp->nm_ronly) &&
+		    !vfs_flagopt(opts, "ro", NULL, 0)) {
+			/*
+			 * Don't allow read-write snapshots.
+			 */
+			if (nmp->nm_mount_args.cpno != 0)
+				return (EROFS);
+			/*
+			 * If upgrade to read-write by non-root, then verify
+			 * that user has necessary permissions on the device.
+			 */
+			devvp = nmp->nm_nandfsdev->nd_devvp;
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			error = VOP_ACCESS(devvp, VREAD | VWRITE,
+			    td->td_ucred, td);
+			if (error) {
+				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+				if (error) {
+					VOP_UNLOCK(devvp, 0);
+					return (error);
+				}
+			}
+
+			VOP_UNLOCK(devvp, 0);
+			/* Acquire GEOM write access before flipping flags. */
+			DROP_GIANT();
+			g_topology_lock();
+			error = g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, 1,
+			    0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			if (error)
+				return (error);
+
+			MNT_ILOCK(mp);
+			mp->mnt_flag &= ~MNT_RDONLY;
+			MNT_IUNLOCK(mp);
+			error = start_syncer(nmp);
+			if (error == 0)
+				error = nandfs_start_cleaner(nmp->nm_nandfsdev);
+			if (error) {
+				/* Roll back the write access on failure. */
+				DROP_GIANT();
+				g_topology_lock();
+				g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1,
+				    0);
+				g_topology_unlock();
+				PICKUP_GIANT();
+				return (error);
+			}
+
+			nmp->nm_ronly = 0;
+		}
+		return (0);
+	}
+
+	from = vfs_getopts(opts, "from", &error);
+	if (error)
+		return (error);
+
+	/*
+	 * Find device node
+	 */
+	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, from, curthread);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	devvp = nd.ni_vp;
+
+	if (!vn_isdisk(devvp, &error)) {
+		vput(devvp);
+		return (error);
+	}
+
+	/* Check the access rights on the mount device */
+	error = VOP_ACCESS(devvp, VREAD, curthread->td_ucred, curthread);
+	if (error)
+		error = priv_check(curthread, PRIV_VFS_MOUNT_PERM);
+	if (error) {
+		vput(devvp);
+		return (error);
+	}
+
+	vfs_getnewfsid(mp);
+
+	/*
+	 * NOTE(review): on nandfs_mountfs() failure devvp is not vput()
+	 * here — presumably released inside mountfs' error handling;
+	 * confirm, otherwise the vnode reference leaks.
+	 */
+	error = nandfs_mountfs(devvp, mp);
+	if (error)
+		return (error);
+	vfs_mountedfrom(mp, from);
+
+	return (0);
+}
+
+/*
+ * Perform the actual mount: parse the "snap" (checkpoint) option,
+ * attach the shared device, allocate and link the per-mount softc,
+ * mount the requested checkpoint and, for r/w mounts, start the syncer
+ * and cleaner threads.  A checkpoint (snapshot) mount must be
+ * read-only (EROFS otherwise).
+ */
+static int
+nandfs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+	struct nandfsmount *nmp = NULL;
+	struct nandfs_args *args = NULL;
+	struct nandfs_device *nandfsdev;
+	char *from;
+	int error, ronly;
+	char *cpno;
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+	if (devvp->v_rdev->si_iosize_max != 0)
+		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
+	/* devvp arrives locked from namei(LOCKLEAF) in nandfs_mount(). */
+	VOP_UNLOCK(devvp, 0);
+
+	if (mp->mnt_iosize_max > MAXPHYS)
+		mp->mnt_iosize_max = MAXPHYS;
+
+	from = vfs_getopts(mp->mnt_optnew, "from", &error);
+	if (error)
+		goto error;
+
+	error = vfs_getopt(mp->mnt_optnew, "snap", (void **)&cpno, NULL);
+	if (error == ENOENT)
+		cpno = NULL;
+	else if (error)
+		goto error;
+
+	args = (struct nandfs_args *)malloc(sizeof(struct nandfs_args),
+	    M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	/* cpno 0 means "mount the head checkpoint". */
+	if (cpno != NULL)
+		args->cpno = strtoul(cpno, (char **)NULL, 10);
+	else
+		args->cpno = 0;
+	args->fspec = from;
+
+	if (args->cpno != 0 && !ronly) {
+		error = EROFS;
+		goto error;
+	}
+
+	printf("WARNING: NANDFS is considered to be a highly experimental "
+	    "feature in FreeBSD.\n");
+
+	/*
+	 * NOTE(review): the 'error:' exit path below does not release
+	 * devvp — verify the caller/device code owns that reference on
+	 * failure.
+	 */
+	error = nandfs_mount_device(devvp, mp, args, &nandfsdev);
+	if (error)
+		goto error;
+
+	nmp = (struct nandfsmount *) malloc(sizeof(struct nandfsmount),
+	    M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	mp->mnt_data = nmp;
+	nmp->nm_vfs_mountp = mp;
+	nmp->nm_ronly = ronly;
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_kern_flag |= MNTK_MPSAFE;
+	MNT_IUNLOCK(mp);
+	nmp->nm_nandfsdev = nandfsdev;
+	/* Add our mountpoint */
+	STAILQ_INSERT_TAIL(&nandfsdev->nd_mounts, nmp, nm_next_mount);
+
+	/* Clamp a too-new checkpoint request to the last known one. */
+	if (args->cpno > nandfsdev->nd_last_cno) {
+		printf("WARNING: supplied checkpoint number (%jd) is greater "
+		    "than last known checkpoint on filesystem (%jd). Mounting"
+		    " checkpoint %jd\n", (uintmax_t)args->cpno,
+		    (uintmax_t)nandfsdev->nd_last_cno,
+		    (uintmax_t)nandfsdev->nd_last_cno);
+		args->cpno = nandfsdev->nd_last_cno;
+	}
+
+	/* Setting up other parameters */
+	nmp->nm_mount_args = *args;
+	free(args, M_NANDFSMNT);
+	error = nandfs_mount_checkpoint(nmp);
+	if (error) {
+		/* nandfs_unmount() frees nmp and detaches the device. */
+		nandfs_unmount(mp, MNT_FORCE);
+		goto unmounted;
+	}
+
+	if (!ronly) {
+		error = start_syncer(nmp);
+		if (error == 0)
+			error = nandfs_start_cleaner(nmp->nm_nandfsdev);
+		if (error)
+			nandfs_unmount(mp, MNT_FORCE);
+	}
+
+	return (0);
+
+error:
+	if (args != NULL)
+		free(args, M_NANDFSMNT);
+
+	if (nmp != NULL) {
+		free(nmp, M_NANDFSMNT);
+		mp->mnt_data = NULL;
+	}
+unmounted:
+	return (error);
+}
+
+/*
+ * VFS_UNMOUNT entry point: flush vnodes (keeping system files), stop
+ * the cleaner and syncer for r/w mounts, release the ifile, detach this
+ * mount from the shared device (closing the device on last reference)
+ * and free the per-mount state.
+ */
+static int
+nandfs_unmount(struct mount *mp, int mntflags)
+{
+	struct nandfs_device *nandfsdev;
+	struct nandfsmount *nmp;
+	int error;
+	int flags = 0;
+
+	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
+
+	if (mntflags & MNT_FORCE)
+		flags |= FORCECLOSE;
+
+	nmp = mp->mnt_data;
+	nandfsdev = nmp->nm_nandfsdev;
+
+	/* SKIPSYSTEM: system vnodes are released separately below. */
+	error = vflush(mp, 0, flags | SKIPSYSTEM, curthread);
+	if (error)
+		return (error);
+
+	if (!(nmp->nm_ronly)) {
+		nandfs_stop_cleaner(nandfsdev);
+		stop_syncer(nmp);
+	}
+
+	if (nmp->nm_ifile_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
+
+	/* Remove our mount point */
+	STAILQ_REMOVE(&nandfsdev->nd_mounts, nmp, nandfsmount, nm_next_mount);
+
+	/* Unmount the device itself when we're the last one */
+	nandfs_unmount_device(nandfsdev);
+
+	free_nandfs_mountinfo(mp);
+
+	/*
+	 * Finally, throw away the nandfsmount structure reference
+	 */
+	mp->mnt_data = 0;
+	MNT_ILOCK(mp);
+	mp->mnt_flag &= ~MNT_LOCAL;
+	MNT_IUNLOCK(mp);
+
+	return (0);
+}
+
+/*
+ * VFS_STATFS entry point: report block/file counts.  Block totals come
+ * from fsdata and the super block; the file count is derived from the
+ * ifile's block-group descriptors (allocated = per-group capacity minus
+ * free entries).
+ */
+static int
+nandfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+	struct nandfsmount *nmp;
+	struct nandfs_device *nandfsdev;
+	struct nandfs_fsdata *fsdata;
+	struct nandfs_super_block *sb;
+	struct nandfs_block_group_desc *groups;
+	struct nandfs_node *ifile;
+	struct nandfs_mdt *mdt;
+	struct buf *bp;
+	int i, error;
+	uint32_t entries_per_group;
+	uint64_t files = 0;
+
+	nmp = mp->mnt_data;
+	nandfsdev = nmp->nm_nandfsdev;
+	fsdata = &nandfsdev->nd_fsdata;
+	sb = &nandfsdev->nd_super;
+	ifile = nmp->nm_ifile_node;
+	mdt = &nandfsdev->nd_ifile_mdt;
+	entries_per_group = mdt->entries_per_group;
+
+	VOP_LOCK(NTOV(ifile), LK_SHARED);
+	error = nandfs_bread(ifile, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(ifile), 0);
+		return (error);
+	}
+
+	groups = (struct nandfs_block_group_desc *)bp->b_data;
+
+	/*
+	 * NOTE(review): only the descriptors in the ifile's first block
+	 * are summed — assumes all group descriptors fit in one block;
+	 * confirm for large volumes.
+	 */
+	for (i = 0; i < mdt->groups_per_desc_block; i++)
+		files += (entries_per_group - groups[i].bg_nfrees);
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(ifile), 0);
+
+	sbp->f_bsize = nandfsdev->nd_blocksize;
+	sbp->f_iosize = sbp->f_bsize;
+	sbp->f_blocks = fsdata->f_blocks_per_segment * fsdata->f_nsegments;
+	sbp->f_bfree = sb->s_free_blocks_count;
+	sbp->f_bavail = sbp->f_bfree;
+	sbp->f_files = files;
+	sbp->f_ffree = 0;
+	return (0);
+}
+
+/*
+ * VFS_ROOT entry point: look up the root inode of this mount and hand
+ * back its vnode.
+ */
+static int
+nandfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+	struct nandfs_node *node;
+	struct nandfsmount *nmp;
+	int error;
+
+	nmp = VFSTONANDFS(mp);
+	error = nandfs_get_node(nmp, NANDFS_ROOT_INO, &node);
+	if (error)
+		return (error);
+
+	KASSERT(NTOV(node)->v_vflag & VV_ROOT,
+	    ("root_vp->v_vflag & VV_ROOT"));
+
+	*vpp = NTOV(node);
+
+	return (error);
+}
+
+/*
+ * VFS_VGET entry point: translate an inode number into a (possibly
+ * freshly instantiated) vnode for this mount.
+ */
+static int
+nandfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
+{
+	struct nandfs_node *node;
+	int error;
+
+	error = nandfs_get_node(VFSTONANDFS(mp), ino, &node);
+	if (node != NULL)
+		*vpp = NTOV(node);
+
+	return (error);
+}
+
+/*
+ * VFS_SYNC entry point: kick the syncer thread and wait for its pass.
+ * Lazy and suspend syncs are intentionally ignored.
+ */
+static int
+nandfs_sync(struct mount *mp, int waitfor)
+{
+	struct nandfsmount *nmp = VFSTONANDFS(mp);
+
+	DPRINTF(SYNC, ("%s: mp %p waitfor %d\n", __func__, mp, waitfor));
+
+	/*
+	 * XXX: A hack to be removed soon
+	 */
+	if (waitfor == MNT_LAZY || waitfor == MNT_SUSPEND)
+		return (0);
+
+	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_VFS_SYNC);
+	return (0);
+}
+
+/*
+ * VFS operations vector for nandfs; entry points not listed here fall
+ * back to the VFS defaults.
+ */
+static struct vfsops nandfs_vfsops = {
+	.vfs_init =		nandfs_init,
+	.vfs_mount =		nandfs_mount,
+	.vfs_root =		nandfs_root,
+	.vfs_statfs =		nandfs_statfs,
+	.vfs_uninit =		nandfs_uninit,
+	.vfs_unmount =		nandfs_unmount,
+	.vfs_vget =		nandfs_vget,
+	.vfs_sync =		nandfs_sync,
+};
+
+VFS_SET(nandfs_vfsops, nandfs, VFCF_LOOPBACK);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_vnops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,2455 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_vnops.c,v 1.2 2009/08/26 03:40:48 elad
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_vnops.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/fcntl.h>
+#include <sys/dirent.h>
+#include <sys/stat.h>
+#include <sys/priv.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+extern uma_zone_t nandfs_node_zone;
+static void nandfs_read_filebuf(struct nandfs_node *, struct buf *);
+static void nandfs_itimes_locked(struct vnode *);
+static int nandfs_truncate(struct vnode *, uint64_t);
+
+static vop_pathconf_t	nandfs_pathconf;
+
+#define UPDATE_CLOSE 0
+#define UPDATE_WAIT 0
+
+/*
+ * VOP_INACTIVE: called when the vnode's use count drops to zero.
+ * When the inode was initialized (i_mode != 0) but no directory entry
+ * references it any more (link count zero), the file is dead: its
+ * data is truncated, the node destroyed on disk and the vnode
+ * recycled.
+ */
+static int
+nandfs_inactive(struct vop_inactive_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int error = 0;
+
+	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, node));
+
+	if (node == NULL) {
+		DPRINTF(NODE, ("%s: inactive NULL node\n", __func__));
+		return (0);
+	}
+
+	if (node->nn_inode.i_mode != 0 && !(node->nn_inode.i_links_count)) {
+		/* NOTE(review): nandfs_truncate() errors are ignored here. */
+		nandfs_truncate(vp, 0);
+		error = nandfs_node_destroy(node);
+		if (error)
+			nandfs_error("%s: destroy node: %p\n", __func__, node);
+		node->nn_flags = 0;
+		vrecycle(vp);
+	}
+
+	return (error);
+}
+
+/*
+ * VOP_RECLAIM: strip all nandfs state from a vnode so the VFS can
+ * reuse it.  The inode number is saved up front because the node is
+ * freed by nandfs_dispose_node() before its last use below.
+ */
+static int
+nandfs_reclaim(struct vop_reclaim_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *nandfs_node = VTON(vp);
+	struct nandfs_device *fsdev = nandfs_node->nn_nandfsdev;
+	uint64_t ino = nandfs_node->nn_ino;
+
+	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, nandfs_node));
+
+	/* Invalidate all entries to a particular vnode. */
+	cache_purge(vp);
+
+	/* Destroy the vm object and flush associated pages. */
+	vnode_destroy_vobject(vp);
+
+	/* Remove from vfs hash if not system vnode */
+	if (!NANDFS_SYS_NODE(nandfs_node->nn_ino))
+		vfs_hash_remove(vp);
+
+	/* Dispose all node knowledge */
+	nandfs_dispose_node(&nandfs_node);
+
+	/*
+	 * NOTE(review): assumes non-system nodes hold the device write
+	 * lock taken elsewhere -- confirm against nandfs_get_node().
+	 */
+	if (!NANDFS_SYS_NODE(ino))
+		NANDFS_WRITEUNLOCK(fsdev);
+
+	return (0);
+}
+
+/*
+ * VOP_READ: copy file data block by block into the caller's uio.
+ * Reads are clamped to end of file; at or past EOF nothing is copied
+ * and 0 is returned.
+ */
+static int
+nandfs_read(struct vop_read_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
+	struct uio *uio = ap->a_uio;
+	struct buf *bp;
+	uint64_t fsize;
+	uint32_t bsize;
+	off_t bytesinfile;
+	ssize_t chunk, boff;
+	daddr_t blkno;
+	ssize_t left;
+	int error;
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	fsize = node->nn_inode.i_size;
+	if (uio->uio_offset >= fsize)
+		return (0);
+
+	bsize = nandfsdev->nd_blocksize;
+	bytesinfile = fsize - uio->uio_offset;
+
+	/* Never transfer past end of file. */
+	left = omin(uio->uio_resid, bytesinfile);
+
+	error = 0;
+	while (left > 0) {
+		blkno = uio->uio_offset / bsize;
+		boff = uio->uio_offset & (bsize - 1);
+
+		/* Amount of this block that still belongs to the request. */
+		chunk = omin(left, bsize - boff);
+
+		DPRINTF(READ, ("nandfs_read bn: 0x%jx toread: 0x%zx (0x%x)\n",
+		    (uintmax_t)blkno, chunk, bsize));
+
+		error = nandfs_bread(node, blkno, NOCRED, 0, &bp);
+		if (error == 0)
+			error = uiomove(bp->b_data + boff, chunk, uio);
+		brelse(bp);
+		if (error)
+			break;
+
+		left -= chunk;
+	}
+
+	return (error);
+}
+
+/*
+ * VOP_WRITE: write uio data into the file backing vp.
+ *
+ * Data is written block by block: an already-mapped block (vblk != 0)
+ * is read in, an unmapped one is freshly created; the block is then
+ * filled via uiomove() and marked dirty for the segment constructor.
+ * On success the inode size, VM object size and timestamps are
+ * updated.
+ */
+static int
+nandfs_write(struct vop_write_args *ap)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_node *node;
+	struct vnode *vp;
+	struct uio *uio;
+	struct buf *bp;
+	uint64_t file_size, vblk;
+	uint32_t blocksize;
+	ssize_t towrite, off;
+	daddr_t lbn;
+	ssize_t resid;
+	int error, ioflag, modified;
+
+	vp = ap->a_vp;
+	uio = ap->a_uio;
+	ioflag = ap->a_ioflag;
+	node = VTON(vp);
+	fsdev = node->nn_nandfsdev;
+
+	/* Refuse writes outright when the device has no space left. */
+	if (nandfs_fs_full(fsdev))
+		return (ENOSPC);
+
+	DPRINTF(WRITE, ("nandfs_write called %#zx at %#jx\n",
+	    uio->uio_resid, (uintmax_t)uio->uio_offset));
+
+	if (uio->uio_offset < 0)
+		return (EINVAL);
+	if (uio->uio_resid == 0)
+		return (0);
+
+	blocksize = fsdev->nd_blocksize;
+	file_size = node->nn_inode.i_size;
+
+	switch (vp->v_type) {
+	case VREG:
+		if (ioflag & IO_APPEND)
+			uio->uio_offset = file_size;
+		break;
+	case VDIR:
+		return (EISDIR);
+	case VLNK:
+		break;
+	default:
+		panic("%s: bad file type vp: %p", __func__, vp);
+	}
+
+	/*
+	 * NOTE(review): IO_APPEND was already applied for VREG in the
+	 * switch above; this repeats the assignment so it also covers
+	 * VLNK -- confirm whether the duplication is intentional.
+	 */
+	if (ioflag & IO_APPEND)
+		uio->uio_offset = file_size;
+
+	resid = uio->uio_resid;
+	modified = error = 0;
+
+	while (uio->uio_resid) {
+		lbn = uio->uio_offset / blocksize;
+		off = uio->uio_offset & (blocksize - 1);
+
+		towrite = omin(uio->uio_resid, blocksize - off);
+
+		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x)\n",
+		    __func__, (uintmax_t)lbn, towrite, blocksize));
+
+		error = nandfs_bmap_lookup(node, lbn, &vblk);
+		if (error)
+			break;
+
+		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x) "
+		    "vblk=%jx\n", __func__, (uintmax_t)lbn, towrite, blocksize,
+		    vblk));
+
+		/* vblk == 0 marks an unmapped block: allocate it fresh. */
+		if (vblk != 0)
+			error = nandfs_bread(node, lbn, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(node, lbn, NOCRED, 0, &bp);
+
+		DPRINTF(WRITE, ("%s: vp %p bread bp %p lbn %#jx\n", __func__,
+		    vp, bp, (uintmax_t)lbn));
+		if (error) {
+			if (bp)
+				brelse(bp);
+			break;
+		}
+
+		error = uiomove((char *)bp->b_data + off, (int)towrite, uio);
+		if (error)
+			break;
+
+		/* Hand the block to the segment constructor. */
+		error = nandfs_dirty_buf(bp, 0);
+		if (error)
+			break;
+
+		modified++;
+	}
+
+	/* XXX proper handling when only part of file was properly written */
+	if (modified) {
+		/* Unprivileged writes strip the setuid/setgid bits. */
+		if (resid > uio->uio_resid && ap->a_cred &&
+		    ap->a_cred->cr_uid != 0)
+			node->nn_inode.i_mode &= ~(ISUID | ISGID);
+
+		/*
+		 * NOTE(review): on a partially failed write uio_resid is
+		 * still non-zero here, so i_size is extended past the data
+		 * actually written -- confirm this is intended.
+		 */
+		if (file_size < uio->uio_offset + uio->uio_resid) {
+			node->nn_inode.i_size = uio->uio_offset +
+			    uio->uio_resid;
+			node->nn_flags |= IN_CHANGE | IN_UPDATE;
+			vnode_pager_setsize(vp, uio->uio_offset +
+			    uio->uio_resid);
+			nandfs_itimes(vp);
+		}
+	}
+
+	DPRINTF(WRITE, ("%s: return:%d\n", __func__, error));
+
+	return (error);
+}
+
+/*
+ * VOP_CACHEDLOOKUP: look a pathname component up in directory dvp.
+ *
+ * Handles the three standard cases ('.', '..' and plain names) plus
+ * the CREATE/RENAME/DELETE variations required by VOP_LOOKUP(9).  On
+ * success *vpp holds the resolved vnode; EJUSTRETURN with SAVENAME is
+ * returned for a creatable missing last component.
+ *
+ * Fix: in the DELETE last-component case the node returned by
+ * nandfs_get_node() was dereferenced for the sticky-bit check even
+ * when the lookup failed, i.e. while 'node' was still uninitialized;
+ * that path now returns the error immediately, consistent with the
+ * neighbouring VOP_ACCESS() error paths.
+ */
+static int
+nandfs_lookup(struct vop_cachedlookup_args *ap)
+{
+	struct vnode *dvp, **vpp;
+	struct componentname *cnp;
+	struct ucred *cred;
+	struct thread *td;
+	struct nandfs_node *dir_node, *node;
+	struct nandfsmount *nmp;
+	uint64_t ino, off;
+	const char *name;
+	int namelen, nameiop, islastcn, mounted_ro;
+	int error, found;
+
+	DPRINTF(VNCALL, ("%s\n", __func__));
+
+	dvp = ap->a_dvp;
+	vpp = ap->a_vpp;
+	*vpp = NULL;
+
+	cnp = ap->a_cnp;
+	cred = cnp->cn_cred;
+	td = cnp->cn_thread;
+
+	dir_node = VTON(dvp);
+	nmp = dir_node->nn_nmp;
+
+	/* Simplify/clarification flags */
+	nameiop = cnp->cn_nameiop;
+	islastcn = cnp->cn_flags & ISLASTCN;
+	mounted_ro = dvp->v_mount->mnt_flag & MNT_RDONLY;
+
+	/*
+	 * If requesting a modify on the last path element on a read-only
+	 * filingsystem, reject lookup;
+	 */
+	if (islastcn && mounted_ro && (nameiop == DELETE || nameiop == RENAME))
+		return (EROFS);
+
+	/* A directory with no links left is already dead. */
+	if (dir_node->nn_inode.i_links_count == 0)
+		return (ENOENT);
+
+	/*
+	 * Obviously, the file is not (anymore) in the namecache, we have to
+	 * search for it. There are three basic cases: '.', '..' and others.
+	 *
+	 * Following the guidelines of VOP_LOOKUP manpage and tmpfs.
+	 */
+	error = 0;
+	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
+		DPRINTF(LOOKUP, ("\tlookup '.'\n"));
+		/* Special case 1 '.' */
+		VREF(dvp);
+		*vpp = dvp;
+		/* Done */
+	} else if (cnp->cn_flags & ISDOTDOT) {
+		/* Special case 2 '..' */
+		DPRINTF(LOOKUP, ("\tlookup '..'\n"));
+
+		/* Get our node */
+		name = "..";
+		namelen = 2;
+		error = nandfs_lookup_name_in_dir(dvp, name, namelen, &ino,
+		    &found, &off);
+		if (error)
+			goto out;
+		if (!found)
+			error = ENOENT;
+
+		/* First unlock parent */
+		VOP_UNLOCK(dvp, 0);
+
+		if (error == 0) {
+			DPRINTF(LOOKUP, ("\tfound '..'\n"));
+			/* Try to create/reuse the node */
+			error = nandfs_get_node(nmp, ino, &node);
+
+			if (!error) {
+				DPRINTF(LOOKUP,
+				    ("\tnode retrieved/created OK\n"));
+				*vpp = NTOV(node);
+			}
+		}
+
+		/* Try to relock parent */
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	} else {
+		DPRINTF(LOOKUP, ("\tlookup file\n"));
+		/* All other files */
+		/* Look up filename in the directory returning its inode */
+		name = cnp->cn_nameptr;
+		namelen = cnp->cn_namelen;
+		error = nandfs_lookup_name_in_dir(dvp, name, namelen,
+		    &ino, &found, &off);
+		if (error)
+			goto out;
+		if (!found) {
+			DPRINTF(LOOKUP, ("\tNOT found\n"));
+			/*
+			 * UGH, didn't find name. If we're creating or
+			 * renaming on the last name this is OK and we ought
+			 * to return EJUSTRETURN if its allowed to be created.
+			 */
+			error = ENOENT;
+			if ((nameiop == CREATE || nameiop == RENAME) &&
+			    islastcn) {
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    td);
+				if (!error) {
+					/* keep the component name */
+					cnp->cn_flags |= SAVENAME;
+					error = EJUSTRETURN;
+				}
+			}
+			/* Done */
+		} else {
+			if (ino == NANDFS_WHT_INO)
+				cnp->cn_flags |= ISWHITEOUT;
+
+			if ((cnp->cn_flags & ISWHITEOUT) &&
+			    (nameiop == LOOKUP))
+				return (ENOENT);
+
+			if ((nameiop == DELETE) && islastcn) {
+				if ((cnp->cn_flags & ISWHITEOUT) &&
+				    (cnp->cn_flags & DOWHITEOUT)) {
+					cnp->cn_flags |= SAVENAME;
+					dir_node->nn_diroff = off;
+					return (EJUSTRETURN);
+				}
+
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    cnp->cn_thread);
+				if (error)
+					return (error);
+
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (error)
+					return (error);
+				*vpp = NTOV(node);
+				node->nn_diroff = off;
+
+				/*
+				 * Sticky directory: only root, the
+				 * directory owner or the file owner may
+				 * delete the entry.
+				 */
+				if ((dir_node->nn_inode.i_mode & ISVTX) &&
+				    cred->cr_uid != 0 &&
+				    cred->cr_uid != dir_node->nn_inode.i_uid &&
+				    node->nn_inode.i_uid != cred->cr_uid) {
+					vput(*vpp);
+					*vpp = NULL;
+					return (EPERM);
+				}
+			} else if ((nameiop == RENAME) && islastcn) {
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    cnp->cn_thread);
+				if (error)
+					return (error);
+
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (!error) {
+					*vpp = NTOV(node);
+					node->nn_diroff = off;
+				}
+			} else {
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (!error) {
+					*vpp = NTOV(node);
+					node->nn_diroff = off;
+				}
+			}
+		}
+	}
+
+out:
+	/*
+	 * Store result in the cache if requested. If we are creating a file,
+	 * the file might not be found and thus putting it into the namecache
+	 * might be seen as negative caching.
+	 */
+	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+		cache_enter(dvp, *vpp, cnp);
+
+	return (error);
+
+}
+
+/*
+ * VOP_GETATTR: fill *vap from the in-core nandfs inode.
+ */
+static int
+nandfs_getattr(struct vop_getattr_args *ap)
+{
+	struct vattr *vap = ap->a_vap;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *ino = &node->nn_inode;
+
+	DPRINTF(VNCALL, ("%s: vp: %p\n", __func__, vp));
+
+	/* Fold any pending timestamp updates into the inode first. */
+	nandfs_itimes(vp);
+
+	VATTR_NULL(vap);
+	/*
+	 * No separate access time appears to be kept in the inode;
+	 * the modification time is reported as atime as well.
+	 */
+	vap->va_atime.tv_sec = ino->i_mtime;
+	vap->va_atime.tv_nsec = ino->i_mtime_nsec;
+	vap->va_mtime.tv_sec = ino->i_mtime;
+	vap->va_mtime.tv_nsec = ino->i_mtime_nsec;
+	vap->va_ctime.tv_sec = ino->i_ctime;
+	vap->va_ctime.tv_nsec = ino->i_ctime_nsec;
+	vap->va_type = IFTOVT(ino->i_mode);
+	vap->va_mode = ino->i_mode & ~S_IFMT;
+	vap->va_nlink = ino->i_links_count;
+	vap->va_uid = ino->i_uid;
+	vap->va_gid = ino->i_gid;
+	vap->va_rdev = ino->i_special;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = node->nn_ino;
+	vap->va_size = ino->i_size;
+	vap->va_blocksize = node->nn_nandfsdev->nd_blocksize;
+	vap->va_gen = 0;
+	vap->va_flags = ino->i_flags;
+	vap->va_bytes = ino->i_blocks * vap->va_blocksize;
+	vap->va_filerev = 0;
+	vap->va_vaflags = 0;
+
+	return (0);
+}
+
+/*
+ * Throw away all buffers of vp whose logical block number is >= nblks.
+ * Used by nandfs_truncate() to drop cached data beyond the new EOF.
+ */
+static int
+nandfs_vtruncbuf(struct vnode *vp, uint64_t nblks)
+{
+	struct nandfs_device *nffsdev;
+	struct bufobj *bo;
+	struct buf *bp, *nbp;
+
+	bo = &vp->v_bufobj;
+	nffsdev = VTON(vp)->nn_nandfsdev;
+
+	ASSERT_VOP_LOCKED(vp, "nandfs_truncate");
+restart:
+	BO_LOCK(bo);
+restart_locked:
+	/* First pass: invalidate clean buffers past the new end. */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
+		if (bp->b_lblkno < nblks)
+			continue;
+		/*
+		 * Trylock only; on contention rescan the clean list
+		 * (the bufobj lock was never dropped here).
+		 */
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
+			goto restart_locked;
+
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		BO_UNLOCK(bo);
+		brelse(bp);
+		BO_LOCK(bo);
+	}
+
+	/* Second pass: invalidate dirty buffers past the new end. */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+		if (bp->b_lblkno < nblks)
+			continue;
+		/*
+		 * Sleeping lock; LK_INTERLOCK hands off the bufobj
+		 * mutex, so on failure reacquire it via restart.
+		 */
+		if (BUF_LOCK(bp,
+		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+		    BO_MTX(bo)) == ENOLCK)
+			goto restart;
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		brelse(bp);
+		/* Keep the device's dirty-buffer accounting in step. */
+		nandfs_dirty_bufs_decrement(nffsdev);
+		BO_LOCK(bo);
+	}
+
+	BO_UNLOCK(bo);
+
+	return (0);
+}
+
+/*
+ * Resize the file backing vp to newsize bytes.
+ *
+ * Growing only updates the inode and VM object sizes (the file is
+ * extended sparsely).  Shrinking zeroes the surviving tail of the new
+ * last block, truncates the block mapping and drops cached buffers
+ * past the new end.
+ */
+static int
+nandfs_truncate(struct vnode *vp, uint64_t newsize)
+{
+	struct nandfs_device *nffsdev;
+	struct nandfs_node *node;
+	struct nandfs_inode *inode;
+	struct buf *bp = NULL;
+	uint64_t oblks, nblks, vblk, size, rest;
+	int error;
+
+	node = VTON(vp);
+	nffsdev = node->nn_nandfsdev;
+	inode = &node->nn_inode;
+
+	/* Calculate end of file */
+	size = inode->i_size;
+
+	/* Same size: just refresh the change/update timestamps. */
+	if (newsize == size) {
+		node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	/* Growing: no block work needed. */
+	if (newsize > size) {
+		inode->i_size = newsize;
+		vnode_pager_setsize(vp, newsize);
+		node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	nblks = howmany(newsize, nffsdev->nd_blocksize);
+	oblks = howmany(size, nffsdev->nd_blocksize);
+	rest = newsize % nffsdev->nd_blocksize;
+
+	/* Zero the part of the new last block that lies past EOF. */
+	if (rest) {
+		error = nandfs_bmap_lookup(node, nblks - 1, &vblk);
+		if (error)
+			return (error);
+
+		/* vblk == 0 means the block is not mapped yet. */
+		if (vblk != 0)
+			error = nandfs_bread(node, nblks - 1, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(node, nblks - 1, NOCRED, 0, &bp);
+
+		if (error) {
+			if (bp)
+				brelse(bp);
+			return (error);
+		}
+
+		bzero((char *)bp->b_data + rest,
+		    (u_int)(nffsdev->nd_blocksize - rest));
+		error = nandfs_dirty_buf(bp, 0);
+		if (error)
+			return (error);
+	}
+
+	DPRINTF(VNCALL, ("%s: vp %p oblks %jx nblks %jx\n", __func__, vp, oblks,
+	    nblks));
+
+	/* Drop the mapping for all blocks past the new last block. */
+	error = nandfs_bmap_truncate_mapping(node, oblks - 1, nblks - 1);
+	if (error) {
+		if (bp)
+			nandfs_undirty_buf(bp);
+		return (error);
+	}
+
+	/* Throw away cached buffers past the new end. */
+	error = nandfs_vtruncbuf(vp, nblks);
+	if (error) {
+		if (bp)
+			nandfs_undirty_buf(bp);
+		return (error);
+	}
+
+	inode->i_size = newsize;
+	vnode_pager_setsize(vp, newsize);
+	node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	nandfs_itimes(vp);
+
+	return (error);
+}
+
+/*
+ * Apply pending timestamp updates to the in-core inode.  The vnode
+ * interlock must be held by the caller.
+ */
+static void
+nandfs_itimes_locked(struct vnode *vp)
+{
+	struct nandfs_node *node;
+	struct nandfs_inode *inode;
+	struct timespec now;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+
+	node = VTON(vp);
+	inode = &node->nn_inode;
+
+	/* Fast path: no timestamp work is queued on this node. */
+	if ((node->nn_flags & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
+		return;
+
+	if ((node->nn_flags & (IN_CHANGE | IN_UPDATE)) ||
+	    ((vp->v_mount->mnt_kern_flag &
+	    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0))
+		node->nn_flags |= IN_MODIFIED;
+
+	vfs_timestamp(&now);
+	if (node->nn_flags & IN_UPDATE) {
+		inode->i_mtime = now.tv_sec;
+		inode->i_mtime_nsec = now.tv_nsec;
+	}
+	if (node->nn_flags & IN_CHANGE) {
+		inode->i_ctime = now.tv_sec;
+		inode->i_ctime_nsec = now.tv_nsec;
+	}
+
+	/* Everything pending has been folded in. */
+	node->nn_flags &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
+}
+
+/*
+ * Wrapper around nandfs_itimes_locked() that takes and drops the
+ * vnode interlock for the caller.
+ */
+void
+nandfs_itimes(struct vnode *vp)
+{
+
+	VI_LOCK(vp);
+	nandfs_itimes_locked(vp);
+	VI_UNLOCK(vp);
+}
+
+/*
+ * Change the permission bits of a file, following chmod(2) policy:
+ * VADMIN is required; the sticky bit on non-directories, setgid for
+ * foreign groups and setuid on files we do not own all need extra
+ * privilege.
+ */
+static int
+nandfs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
+{
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	uint16_t newmode;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: vp %p, mode %x, cred %p, td %p\n", __func__, vp,
+	    mode, cred, td));
+
+	/* Changing permissions requires VADMIN on the file. */
+	error = VOP_ACCESS(vp, VADMIN, cred, td);
+	if (error)
+		return (error);
+
+	/* Sticky bit on a non-directory requires privilege. */
+	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
+		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
+			return (EFTYPE);
+	}
+
+	/* Setgid with a group we are not a member of requires privilege. */
+	if (!groupmember(inode->i_gid, cred) && (mode & ISGID)) {
+		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
+		if (error)
+			return (error);
+	}
+
+	/* Setuid on a file we do not own requires privilege. */
+	if ((mode & ISUID) && inode->i_uid != cred->cr_uid) {
+		error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
+		if (error)
+			return (error);
+	}
+
+	/* Keep the file-type bits, replace the permission bits. */
+	newmode = (inode->i_mode & ~ALLPERMS) | (mode & ALLPERMS);
+	inode->i_mode = newmode;
+	node->nn_flags |= IN_CHANGE;
+
+	DPRINTF(VNCALL, ("%s: to mode %x\n", __func__, newmode));
+
+	return (error);
+}
+
+/*
+ * Change file owner and/or group, following chown(2) policy: the
+ * caller needs VWRITE_OWNER, and moving the file to a foreign owner
+ * or group requires PRIV_VFS_CHOWN.  An unprivileged ownership change
+ * strips the setuid/setgid bits.
+ */
+static int
+nandfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
+    struct thread *td)
+{
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	uid_t olduid;
+	gid_t oldgid;
+	int error;
+
+	/* VNOVAL means "leave this id unchanged". */
+	if (uid == (uid_t)VNOVAL)
+		uid = inode->i_uid;
+	if (gid == (gid_t)VNOVAL)
+		gid = inode->i_gid;
+
+	/* Changing ownership requires owner rights on the file. */
+	error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td);
+	if (error)
+		return (error);
+
+	/*
+	 * Giving the file away, or switching to a group we are not a
+	 * member of, requires privilege.
+	 */
+	if (((uid != inode->i_uid && uid != cred->cr_uid) ||
+	    (gid != inode->i_gid && !groupmember(gid, cred))) &&
+	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
+		return (error);
+
+	olduid = inode->i_uid;
+	oldgid = inode->i_gid;
+	inode->i_uid = uid;
+	inode->i_gid = gid;
+	node->nn_flags |= IN_CHANGE;
+
+	/* Without privilege, a real change clears setuid/setgid. */
+	if ((inode->i_mode & (ISUID | ISGID)) &&
+	    (olduid != uid || oldgid != gid)) {
+		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0))
+			inode->i_mode &= ~(ISUID | ISGID);
+	}
+
+	DPRINTF(VNCALL, ("%s: vp %p, cred %p, td %p - ret OK\n", __func__, vp,
+	    cred, td));
+	return (0);
+}
+
+/*
+ * VOP_SETATTR: apply the settable fields of *vap to the file.
+ *
+ * Rejects attributes that can never be set, then handles flags,
+ * size (truncate/extend), ownership, mode and timestamps in turn.
+ * Each group is checked against read-only mounts and the immutable/
+ * append file flags before being applied.
+ */
+static int
+nandfs_setattr(struct vop_setattr_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	struct vattr *vap = ap->a_vap;
+	struct ucred *cred = ap->a_cred;
+	struct thread *td = curthread;
+	uint32_t flags;
+	int error = 0;
+
+	/* These attributes are read-only by definition. */
+	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+		DPRINTF(VNCALL, ("%s: unsettable attribute\n", __func__));
+		return (EINVAL);
+	}
+
+	if (vap->va_flags != VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p flags:%lx\n", __func__, vp,
+		    td, vap->va_flags));
+
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		/*
+		 * Callers may only modify the file flags on objects they
+		 * have VADMIN rights for.
+		 */
+		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
+			return (error);
+		/*
+		 * Unprivileged processes are not permitted to unset system
+		 * flags, or modify flags if any system flags are set.
+		 * Privileged non-jail processes may not modify system flags
+		 * if securelevel > 0 and any existing system flags are set.
+		 * Privileged jail processes behave like privileged non-jail
+		 * processes if the security.jail.chflags_allowed sysctl is
+		 * is non-zero; otherwise, they behave like unprivileged
+		 * processes.
+		 */
+
+		flags = inode->i_flags;
+		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
+			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
+				error = securelevel_gt(cred, 0);
+				if (error)
+					return (error);
+			}
+			/* Snapshot flag cannot be set or cleared */
+			if (((vap->va_flags & SF_SNAPSHOT) != 0 &&
+			    (flags & SF_SNAPSHOT) == 0) ||
+			    ((vap->va_flags & SF_SNAPSHOT) == 0 &&
+			    (flags & SF_SNAPSHOT) != 0))
+				return (EPERM);
+
+			inode->i_flags = vap->va_flags;
+		} else {
+			/* Unprivileged: only user flags may be touched. */
+			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
+			    (vap->va_flags & UF_SETTABLE) != vap->va_flags)
+				return (EPERM);
+
+			flags &= SF_SETTABLE;
+			flags |= (vap->va_flags & UF_SETTABLE);
+			inode->i_flags = flags;
+		}
+		node->nn_flags |= IN_CHANGE;
+		/* Newly immutable/append files take no further changes. */
+		if (vap->va_flags & (IMMUTABLE | APPEND))
+			return (0);
+	}
+	if (inode->i_flags & (IMMUTABLE | APPEND))
+		return (EPERM);
+
+	if (vap->va_size != (u_quad_t)VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p size:%jx\n", __func__, vp, td,
+		    (uintmax_t)vap->va_size));
+
+		switch (vp->v_type) {
+		case VDIR:
+			return (EISDIR);
+		case VLNK:
+		case VREG:
+			if (vp->v_mount->mnt_flag & MNT_RDONLY)
+				return (EROFS);
+			if ((inode->i_flags & SF_SNAPSHOT) != 0)
+				return (EPERM);
+			break;
+		default:
+			return (0);
+		}
+
+		if (vap->va_size > node->nn_nandfsdev->nd_maxfilesize)
+			return (EFBIG);
+
+		KASSERT((vp->v_type == VREG), ("Set size %d", vp->v_type));
+		/* NOTE(review): nandfs_truncate() errors are ignored here. */
+		nandfs_truncate(vp, vap->va_size);
+		node->nn_flags |= IN_CHANGE;
+
+		return (0);
+	}
+
+	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p uid/gid %x/%x\n", __func__,
+		    vp, td, vap->va_uid, vap->va_gid));
+		error = nandfs_chown(vp, vap->va_uid, vap->va_gid, cred, td);
+		if (error)
+			return (error);
+	}
+
+	if (vap->va_mode != (mode_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p mode %x\n", __func__, vp, td,
+		    vap->va_mode));
+
+		error = nandfs_chmod(vp, (int)vap->va_mode, cred, td);
+		if (error)
+			return (error);
+	}
+	if (vap->va_atime.tv_sec != VNOVAL ||
+	    vap->va_mtime.tv_sec != VNOVAL ||
+	    vap->va_birthtime.tv_sec != VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p time a/m/b %jx/%jx/%jx\n",
+		    __func__, vp, td, (uintmax_t)vap->va_atime.tv_sec,
+		    (uintmax_t)vap->va_mtime.tv_sec,
+		    (uintmax_t)vap->va_birthtime.tv_sec));
+
+		if (vap->va_atime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_ACCESS;
+		if (vap->va_mtime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		if (vap->va_birthtime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_MODIFIED;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	return (0);
+}
+
+/*
+ * VOP_OPEN: validate the open mode and set up the backing VM object.
+ */
+static int
+nandfs_open(struct vop_open_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+
+	DPRINTF(VNCALL, ("nandfs_open called ap->a_mode %x\n", ap->a_mode));
+
+	/* Device special files are not served from here. */
+	if (vp->v_type == VCHR || vp->v_type == VBLK)
+		return (EOPNOTSUPP);
+
+	/* Append-only files may only be opened for appending writes. */
+	if ((node->nn_inode.i_flags & APPEND) &&
+	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+		return (EPERM);
+
+	vnode_create_vobject(vp, node->nn_inode.i_size, ap->a_td);
+
+	return (0);
+}
+
+/*
+ * VOP_CLOSE: fold pending timestamps in while other users remain.
+ */
+static int
+nandfs_close(struct vop_close_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, VTON(vp)));
+
+	VI_LOCK(vp);
+	if (vp->v_usecount > 1)
+		nandfs_itimes_locked(vp);
+	VI_UNLOCK(vp);
+
+	return (0);
+}
+
+/*
+ * Check whether the vnode itself admits the requested access at all,
+ * independent of the caller's credentials.
+ */
+static int
+nandfs_check_possible(struct vnode *vp, struct vattr *vap, mode_t mode)
+{
+
+	switch (vap->va_type) {
+	case VBLK:
+	case VCHR:
+	case VSOCK:
+	case VFIFO:
+		/*
+		 * Special nodes: writable even on a read-only mount,
+		 * provided the permissions allow it.
+		 */
+		break;
+	case VDIR:
+	case VLNK:
+	case VREG:
+		/* Normal nodes: refuse writes on a read-only mount. */
+		if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY))
+			return (EROFS);
+		break;
+	default:
+		/* No idea what this node is. */
+		return (EINVAL);
+	}
+
+	/* Immutable files may never be written. */
+	if ((mode & VWRITE) && (VTON(vp)->nn_inode.i_flags & IMMUTABLE))
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Defer the credential check to the generic UNIX permission logic.
+ */
+static int
+nandfs_check_permitted(struct vnode *vp, struct vattr *vap, mode_t mode,
+    struct ucred *cred)
+{
+
+	return (vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
+	    mode, cred, NULL));
+}
+
+/*
+ * VOP_ADVLOCK: delegate POSIX advisory locking to the generic code.
+ */
+static int
+nandfs_advlock(struct vop_advlock_args *ap)
+{
+	struct nandfs_node *node;
+	quad_t size;
+
+	node = VTON(ap->a_vp);
+	/* lf_advlock() needs the current size for SEEK_END ranges. */
+	size = node->nn_inode.i_size;
+	return (lf_advlock(ap, &(node->nn_lockf), size));
+}
+
+/*
+ * VOP_ACCESS: combine the node-level and credential-level checks.
+ */
+static int
+nandfs_access(struct vop_access_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct ucred *cred = ap->a_cred;
+	accmode_t accmode = ap->a_accmode;
+	struct vattr vattr;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: vp:%p mode: %x\n", __func__, vp, accmode));
+
+	/* Snapshot the attributes the checks below rely on. */
+	error = VOP_GETATTR(vp, &vattr, NULL);
+	if (error != 0)
+		return (error);
+
+	/* Is this kind of access possible on this node at all? */
+	error = nandfs_check_possible(vp, &vattr, accmode);
+	if (error != 0)
+		return (error);
+
+	/* Do the caller's credentials permit it? */
+	return (nandfs_check_permitted(vp, &vattr, accmode, cred));
+}
+
+/*
+ * VOP_PRINT: dump node state for vprint()/ddb.
+ */
+static int
+nandfs_print(struct vop_print_args *ap)
+{
+	struct nandfs_node *node = VTON(ap->a_vp);
+
+	printf("\tvp=%p, nandfs_node=%p\n", ap->a_vp, node);
+	printf("nandfs inode %#jx\n", (uintmax_t)node->nn_ino);
+	printf("flags = 0x%b\n", (u_int)node->nn_flags, PRINT_NODE_FLAGS);
+
+	return (0);
+}
+
+/*
+ * Service a read buffer for VOP_STRATEGY: translate the file-relative
+ * block number to a device block address and issue the I/O, or clear
+ * the buffer when the block is an unmapped hole.  Errors are reported
+ * through the buffer (b_error/BIO_ERROR) and bufdone().
+ */
+static void
+nandfs_read_filebuf(struct nandfs_node *node, struct buf *bp)
+{
+	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
+	struct buf *nbp;
+	nandfs_daddr_t vblk, pblk;
+	nandfs_lbn_t from;
+	uint32_t blocksize;
+	int error = 0;
+	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
+
+	/*
+	 * Translate all the block sectors into a series of buffers to read
+	 * asynchronously from the nandfs device. Note that this lookup may
+	 * induce readin's too.
+	 */
+
+	blocksize = nandfsdev->nd_blocksize;
+	/* Only single-block buffers are expected here. */
+	if (bp->b_bcount / blocksize != 1)
+		panic("invalid b_count in bp %p\n", bp);
+
+	from = bp->b_blkno;
+
+	DPRINTF(READ, ("\tread in from inode %#jx blkno %#jx"
+	    " count %#lx\n", (uintmax_t)node->nn_ino, from,
+	    bp->b_bcount));
+
+	/* Get virtual block numbers for the vnode's buffer span */
+	error = nandfs_bmap_lookup(node, from, &vblk);
+	if (error) {
+		bp->b_error = EINVAL;
+		bp->b_ioflags |= BIO_ERROR;
+		bufdone(bp);
+		return;
+	}
+
+	/* Translate virtual block numbers to physical block numbers */
+	error = nandfs_vtop(node, vblk, &pblk);
+	if (error) {
+		bp->b_error = EINVAL;
+		bp->b_ioflags |= BIO_ERROR;
+		bufdone(bp);
+		return;
+	}
+
+	/* Issue translated blocks */
+	bp->b_resid = bp->b_bcount;
+
+	/* Note virtual block 0 marks not mapped */
+	if (vblk == 0) {
+		/* A hole reads back as zeroes. */
+		vfs_bio_clrbuf(bp);
+		bufdone(bp);
+		return;
+	}
+
+	/* Hand the buffer to the device with a rewritten block address. */
+	nbp = bp;
+	nbp->b_blkno = pblk * blk2dev;
+	bp->b_iooffset = dbtob(nbp->b_blkno);
+	MPASS(bp->b_iooffset >= 0);
+	BO_STRATEGY(&nandfsdev->nd_devvp->v_bufobj, nbp);
+	nandfs_vblk_set(bp, vblk);
+	DPRINTF(READ, ("read_filebuf : ino %#jx blk %#jx -> "
+	    "%#jx -> %#jx [bp %p]\n", (uintmax_t)node->nn_ino,
+	    (uintmax_t)(from), (uintmax_t)vblk,
+	    (uintmax_t)pblk, nbp));
+}
+
+/*
+ * Issue a write buffer to the underlying device.  b_blkno already
+ * holds a device block address at this point.
+ */
+static void
+nandfs_write_filebuf(struct nandfs_node *node, struct buf *bp)
+{
+	struct bufobj *bo;
+
+	bo = &node->nn_nandfsdev->nd_devvp->v_bufobj;
+	bp->b_iooffset = dbtob(bp->b_blkno);
+	MPASS(bp->b_iooffset >= 0);
+	BO_STRATEGY(bo, bp);
+}
+
+/*
+ * VOP_STRATEGY: route buffer I/O to the read or write path.
+ */
+static int
+nandfs_strategy(struct vop_strategy_args *ap)
+{
+	struct buf *bp = ap->a_bp;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+
+	/* check if we ought to be here */
+	KASSERT((vp->v_type != VBLK && vp->v_type != VCHR),
+	    ("nandfs_strategy on type %d", vp->v_type));
+
+	if (bp->b_iocmd == BIO_READ) {
+		/* Reads are translated and issued immediately. */
+		nandfs_read_filebuf(node, bp);
+		return (0);
+	}
+
+	/* Writes go to the segment collector. */
+	nandfs_write_filebuf(node, bp);
+	return (0);
+}
+
+/*
+ * VOP_READDIR: convert on-disk nandfs directory entries into struct
+ * dirent records and copy them out via uio.  The transfer stops when
+ * the uio can no longer hold a whole dirent; uio_offset is advanced
+ * to the last fully transferred entry.
+ */
+static int
+nandfs_readdir(struct vop_readdir_args *ap)
+{
+	struct uio *uio = ap->a_uio;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_dir_entry *ndirent;
+	struct dirent dirent;
+	struct buf *bp;
+	uint64_t file_size, diroffset, transoffset, blkoff;
+	uint64_t blocknr;
+	uint32_t blocksize = node->nn_nandfsdev->nd_blocksize;
+	uint8_t *pos, name_len;
+	int error;
+
+	DPRINTF(READDIR, ("nandfs_readdir called\n"));
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	file_size = node->nn_inode.i_size;
+	DPRINTF(READDIR, ("nandfs_readdir filesize %jd resid %zd\n",
+	    (uintmax_t)file_size, uio->uio_resid ));
+
+	/* We are called just as long as we keep on pushing data in */
+	error = 0;
+	if ((uio->uio_offset < file_size) &&
+	    (uio->uio_resid >= sizeof(struct dirent))) {
+		diroffset = uio->uio_offset;
+		transoffset = diroffset;
+
+		blocknr = diroffset / blocksize;
+		blkoff = diroffset % blocksize;
+		error = nandfs_bread(node, blocknr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (EIO);
+		}
+		while (diroffset < file_size) {
+			DPRINTF(READDIR, ("readdir : offset = %"PRIu64"\n",
+			    diroffset));
+			/* Crossed a block boundary: fetch the next block. */
+			if (blkoff >= blocksize) {
+				blkoff = 0; blocknr++;
+				brelse(bp);
+				error = nandfs_bread(node, blocknr, NOCRED, 0,
+				    &bp);
+				if (error) {
+					brelse(bp);
+					return (EIO);
+				}
+			}
+
+			/* Read in one dirent */
+			pos = (uint8_t *)bp->b_data + blkoff;
+			ndirent = (struct nandfs_dir_entry *)pos;
+
+			name_len = ndirent->name_len;
+			memset(&dirent, 0, sizeof(struct dirent));
+			/* inode 0 marks an unused (deleted) entry. */
+			dirent.d_fileno = ndirent->inode;
+			if (dirent.d_fileno) {
+				dirent.d_type = ndirent->file_type;
+				dirent.d_namlen = name_len;
+				/* dirent was zeroed, so d_name stays NUL-terminated. */
+				strncpy(dirent.d_name, ndirent->name, name_len);
+				dirent.d_reclen = GENERIC_DIRSIZ(&dirent);
+				DPRINTF(READDIR, ("copying `%*.*s`\n", name_len,
+				    name_len, dirent.d_name));
+			}
+
+			/*
+			 * If there isn't enough space in the uio to return a
+			 * whole dirent, break off read
+			 */
+			if (uio->uio_resid < GENERIC_DIRSIZ(&dirent))
+				break;
+
+			/* Transfer */
+			if (dirent.d_fileno)
+				uiomove(&dirent, GENERIC_DIRSIZ(&dirent), uio);
+
+			/* Advance */
+			diroffset += ndirent->rec_len;
+			blkoff += ndirent->rec_len;
+
+			/* Remember the last entry we transfered */
+			transoffset = diroffset;
+		}
+		brelse(bp);
+
+		/* Pass on last transfered offset */
+		uio->uio_offset = transoffset;
+	}
+
+	if (ap->a_eofflag)
+		*ap->a_eofflag = (uio->uio_offset >= file_size);
+
+	return (error);
+}
+
+static int
+nandfs_dirempty(struct vnode *dvp, uint64_t parentino, struct ucred *cred)
+{
+	struct nandfs_node *dnode = VTON(dvp);
+	struct nandfs_dir_entry *dirent;
+	uint64_t file_size = dnode->nn_inode.i_size;
+	uint64_t blockcount = dnode->nn_inode.i_blocks;
+	uint64_t blocknr;
+	uint32_t blocksize = dnode->nn_nandfsdev->nd_blocksize;
+	uint32_t limit;
+	uint32_t off;
+	uint8_t	*pos;
+	struct buf *bp;
+	int error;
+
+	DPRINTF(LOOKUP, ("%s: dvp %p parentino %#jx cred %p\n", __func__, dvp,
+	    (uintmax_t)parentino, cred));
+
+	KASSERT((file_size != 0), ("nandfs_dirempty for NULL dir %p", dvp));
+
+	blocknr = 0;
+	while (blocknr < blockcount) {
+		error = nandfs_bread(dnode, blocknr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (0);
+		}
+
+		pos = (uint8_t *)bp->b_data;
+		off = 0;
+
+		if (blocknr == (blockcount - 1))
+			limit = file_size % blocksize;
+		else
+			limit = blocksize;
+
+		while (off < limit) {
+			dirent = (struct nandfs_dir_entry *)(pos + off);
+			off += dirent->rec_len;
+
+			if (dirent->inode == 0)
+				continue;
+
+			switch (dirent->name_len) {
+			case 0:
+				break;
+			case 1:
+				if (dirent->name[0] != '.')
+					goto notempty;
+
+				KASSERT(dirent->inode == dnode->nn_ino,
+				    (".'s inode does not match dir"));
+				break;
+			case 2:
+				if (dirent->name[0] != '.' &&
+				    dirent->name[1] != '.')
+					goto notempty;
+
+				KASSERT(dirent->inode == parentino,
+				    ("..'s inode does not match parent"));
+				break;
+			default:
+				goto notempty;
+			}
+		}
+
+		brelse(bp);
+		blocknr++;
+	}
+
+	return (1);
+notempty:
+	brelse(bp);
+	return (0);
+}
+
+static int
+nandfs_link(struct vop_link_args *ap)
+{
+	struct vnode *tdvp = ap->a_tdvp;
+	struct vnode *vp = ap->a_vp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	int error;
+
+	if (tdvp->v_mount != vp->v_mount)
+		return (EXDEV);
+
+	if (inode->i_links_count >= LINK_MAX)
+		return (EMLINK);
+
+	if (inode->i_flags & (IMMUTABLE | APPEND))
+		return (EPERM);
+
+	/* Update link count */
+	inode->i_links_count++;
+
+	/* Add dir entry */
+	error = nandfs_add_dirent(tdvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(inode->i_mode));
+	if (error) {
+		inode->i_links_count--;
+	}
+
+	node->nn_flags |= IN_CHANGE;
+	nandfs_itimes(vp);
+	DPRINTF(VNCALL, ("%s: tdvp %p vp %p cnp %p\n",
+	    __func__, tdvp, vp, cnp));
+
+	return (0);
+}
+
+static int
+nandfs_create(struct vop_create_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	/* Create new vnode/inode */
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	/* Add new dir entry */
+	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode));
+	if (error) {
+		if (nandfs_node_destroy(node)) {
+			nandfs_error("%s: error destroying node %p\n",
+			    __func__, node);
+		}
+		return (error);
+	}
+	*vpp = NTOV(node);
+
+	DPRINTF(VNCALL, ("created file vp %p nandnode %p ino %jx\n", *vpp, node,
+	    (uintmax_t)node->nn_ino));
+	return (0);
+}
+
+static int
+nandfs_remove(struct vop_remove_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vnode *dvp = ap->a_dvp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_node *dnode = VTON(dvp);
+	struct componentname *cnp = ap->a_cnp;
+
+	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx link %d\n",
+	    __func__, dvp, vp, node, (uintmax_t)node->nn_ino,
+	    node->nn_inode.i_links_count));
+
+	if (vp->v_type == VDIR)
+		return (EISDIR);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
+	    (dnode->nn_inode.i_flags & APPEND))
+		return (EPERM);
+
+	nandfs_remove_dirent(dvp, node, cnp);
+	node->nn_inode.i_links_count--;
+	node->nn_flags |= IN_CHANGE;
+
+	return (0);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
static int
nandfs_checkpath(struct nandfs_node *src, struct nandfs_node *dest,
    struct ucred *cred)
{
	struct vnode *vp;
	int error, rootino;
	struct nandfs_dir_entry dirent;

	/* Walk up from dest via ".." until we hit the root or src. */
	vp = NTOV(dest);
	if (src->nn_ino == dest->nn_ino) {
		error = EEXIST;
		goto out;
	}
	rootino = NANDFS_ROOT_INO;
	error = 0;
	if (dest->nn_ino == rootino)
		goto out;

	for (;;) {
		if (vp->v_type != VDIR) {
			error = ENOTDIR;
			break;
		}

		/*
		 * Read the second entry of the directory block; on-disk
		 * layout puts ".." right after "." at offset
		 * NANDFS_DIR_REC_LEN(2) bytes into the first block.
		 */
		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirent,
		    NANDFS_DIR_REC_LEN(2), (off_t)0, UIO_SYSSPACE,
		    IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED,
		    NULL, NULL);
		if (error != 0)
			break;
		if (dirent.name_len != 2 ||
		    dirent.name[0] != '.' ||
		    dirent.name[1] != '.') {
			error = ENOTDIR;
			break;
		}
		/* src is an ancestor of dest: the rename would cycle. */
		if (dirent.inode == src->nn_ino) {
			error = EINVAL;
			break;
		}
		if (dirent.inode == rootino)
			break;
		/*
		 * NOTE(review): vp->v_mount is read after vput(vp) drops
		 * our reference — confirm the mount reference is pinned
		 * by the caller across this walk.
		 */
		vput(vp);
		if ((error = VFS_VGET(vp->v_mount, dirent.inode,
		    LK_EXCLUSIVE, &vp)) != 0) {
			vp = NULL;
			break;
		}
	}

out:
	if (error == ENOTDIR)
		printf("checkpath: .. not a directory\n");
	if (vp != NULL)
		vput(vp);
	return (error);
}
+
/*
 * VOP_RENAME: move fvp (named fcnp, in fdvp) to tcnp in tdvp,
 * replacing tvp if it exists.  This follows the classic 4.4BSD/UFS
 * rename sequence: bump the source link count, enter (or rewrite) the
 * target name, then relookup and remove the source name.  Statement
 * and locking order below is load-bearing; see the UFS equivalent.
 */
static int
nandfs_rename(struct vop_rename_args *ap)
{
	struct vnode *tvp = ap->a_tvp;
	struct vnode *tdvp = ap->a_tdvp;
	struct vnode *fvp = ap->a_fvp;
	struct vnode *fdvp = ap->a_fdvp;
	struct componentname *tcnp = ap->a_tcnp;
	struct componentname *fcnp = ap->a_fcnp;
	int doingdirectory = 0, oldparent = 0, newparent = 0;
	int error = 0;

	struct nandfs_node *fdnode, *fnode, *fnode1;
	struct nandfs_node *tdnode = VTON(tdvp);
	struct nandfs_node *tnode;

	uint32_t tdflags, fflags, fdflags;
	uint16_t mode;

	DPRINTF(VNCALL, ("%s: fdvp:%p fvp:%p tdvp:%p tdp:%p\n", __func__, fdvp,
	    fvp, tdvp, tvp));

	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
abortit:
		/* Common abort path: release every vnode we were handed. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		vrele(fdvp);
		vrele(fvp);
		return (error);
	}

	/* Immutable/append-only targets (or target dir) block the rename. */
	tdflags = tdnode->nn_inode.i_flags;
	if (tvp &&
	    ((VTON(tvp)->nn_inode.i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (tdflags & APPEND))) {
		error = EPERM;
		goto abortit;
	}

	/*
	 * Renaming a file to itself has no effect.  The upper layers should
	 * not call us in that case.  Temporarily just warn if they do.
	 */
	if (fvp == tvp) {
		printf("nandfs_rename: fvp == tvp (can't happen)\n");
		error = 0;
		goto abortit;
	}

	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
		goto abortit;

	fdnode = VTON(fdvp);
	fnode = VTON(fvp);

	if (fnode->nn_inode.i_links_count >= LINK_MAX) {
		VOP_UNLOCK(fvp, 0);
		error = EMLINK;
		goto abortit;
	}

	fflags = fnode->nn_inode.i_flags;
	fdflags = fdnode->nn_inode.i_flags;

	if ((fflags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (fdflags & APPEND)) {
		VOP_UNLOCK(fvp, 0);
		error = EPERM;
		goto abortit;
	}

	mode = fnode->nn_inode.i_mode;
	if ((mode & S_IFMT) == S_IFDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */

		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
		    (fdvp == fvp) ||
		    ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) ||
		    (fnode->nn_flags & IN_RENAME)) {
			VOP_UNLOCK(fvp, 0);
			error = EINVAL;
			goto abortit;
		}
		/* IN_RENAME guards the directory against concurrent renames. */
		fnode->nn_flags |= IN_RENAME;
		doingdirectory = 1;
		DPRINTF(VNCALL, ("%s: doingdirectory dvp %p\n", __func__,
		    tdvp));
		oldparent = fdnode->nn_ino;
	}

	vrele(fdvp);

	tnode = NULL;
	if (tvp)
		tnode = VTON(tvp);

	/*
	 * Bump link count on fvp while we are moving stuff around. If we
	 * crash before completing the work, the link count may be wrong
	 * but correctable.
	 */
	fnode->nn_inode.i_links_count++;

	/* Check for in path moving XXX */
	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
	VOP_UNLOCK(fvp, 0);
	if (oldparent != tdnode->nn_ino)
		newparent = tdnode->nn_ino;
	if (doingdirectory && newparent) {
		if (error)	/* write access check above */
			goto bad;
		if (tnode != NULL)
			vput(tvp);

		/* Make sure fvp is not an ancestor of the new parent. */
		error = nandfs_checkpath(fnode, tdnode, tcnp->cn_cred);
		if (error)
			goto out;

		/* checkpath dropped locks; the target must be re-found. */
		VREF(tdvp);
		error = relookup(tdvp, &tvp, tcnp);
		if (error)
			goto out;
		vrele(tdvp);
		tdnode = VTON(tdvp);
		tnode = NULL;
		if (tvp)
			tnode = VTON(tvp);
	}

	/*
	 * If the target doesn't exist, link the target to the source and
	 * unlink the source. Otherwise, rewrite the target directory to
	 * reference the source and remove the original entry.
	 */

	if (tvp == NULL) {
		/*
		 * Account for ".." in new directory.
		 */
		if (doingdirectory && fdvp != tdvp)
			tdnode->nn_inode.i_links_count++;

		DPRINTF(VNCALL, ("%s: new entry in dvp:%p\n", __func__, tdvp));
		/*
		 * Add name in new directory.
		 */
		error = nandfs_add_dirent(tdvp, fnode->nn_ino, tcnp->cn_nameptr,
		    tcnp->cn_namelen, IFTODT(fnode->nn_inode.i_mode));
		if (error) {
			if (doingdirectory && fdvp != tdvp)
				tdnode->nn_inode.i_links_count--;
			goto bad;
		}

		vput(tdvp);
	} else {
		/*
		 * If the parent directory is "sticky", then the user must
		 * own the parent directory, or the destination of the rename,
		 * otherwise the destination may not be changed (except by
		 * root). This implements append-only directories.
		 */
		if ((tdnode->nn_inode.i_mode & S_ISTXT) &&
		    tcnp->cn_cred->cr_uid != 0 &&
		    tcnp->cn_cred->cr_uid != tdnode->nn_inode.i_uid &&
		    tnode->nn_inode.i_uid != tcnp->cn_cred->cr_uid) {
			error = EPERM;
			goto bad;
		}
		/*
		 * Target must be empty if a directory and have no links
		 * to it. Also, ensure source and target are compatible
		 * (both directories, or both not directories).
		 */
		mode = tnode->nn_inode.i_mode;
		if ((mode & S_IFMT) == S_IFDIR) {
			if (!nandfs_dirempty(tvp, tdnode->nn_ino,
			    tcnp->cn_cred)) {
				error = ENOTEMPTY;
				goto bad;
			}
			if (!doingdirectory) {
				error = ENOTDIR;
				goto bad;
			}
			/*
			 * Update name cache since directory is going away.
			 */
			cache_purge(tdvp);
		} else if (doingdirectory) {
			error = EISDIR;
			goto bad;
		}

		DPRINTF(VNCALL, ("%s: update entry dvp:%p\n", __func__, tdvp));
		/*
		 * Change name tcnp in tdvp to point at fvp.
		 */
		error = nandfs_update_dirent(tdvp, fnode, tnode);
		if (error)
			goto bad;

		/* Replacing a directory drops tdvp's ".." back-link. */
		if (doingdirectory && !newparent)
			tdnode->nn_inode.i_links_count--;

		vput(tdvp);

		tnode->nn_inode.i_links_count--;
		vput(tvp);
		tnode = NULL;
	}

	/*
	 * Unlink the source.
	 */
	fcnp->cn_flags &= ~MODMASK;
	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
	VREF(fdvp);
	error = relookup(fdvp, &fvp, fcnp);
	/*
	 * NOTE(review): the extra fdvp reference is only dropped when
	 * relookup succeeds — confirm the failure path does not leak it.
	 */
	if (error == 0)
		vrele(fdvp);
	if (fvp != NULL) {
		fnode1 = VTON(fvp);
		fdnode = VTON(fdvp);
	} else {
		/*
		 * From name has disappeared.
		 */
		if (doingdirectory)
			panic("nandfs_rename: lost dir entry");
		vrele(ap->a_fvp);
		return (0);
	}

	DPRINTF(VNCALL, ("%s: unlink source fnode:%p\n", __func__, fnode));

	/*
	 * Ensure that the directory entry still exists and has not
	 * changed while the new name has been entered. If the source is
	 * a file then the entry may have been unlinked or renamed. In
	 * either case there is no further work to be done. If the source
	 * is a directory then it cannot have been rmdir'ed; its link
	 * count of three would cause a rmdir to fail with ENOTEMPTY.
	 * The IN_RENAME flag ensures that it cannot be moved by another
	 * rename.
	 */
	if (fnode != fnode1) {
		if (doingdirectory)
			panic("nandfs: lost dir entry");
	} else {
		/*
		 * If the source is a directory with a
		 * new parent, the link count of the old
		 * parent directory must be decremented
		 * and ".." set to point to the new parent.
		 */
		if (doingdirectory && newparent) {
			DPRINTF(VNCALL, ("%s: new parent %#jx -> %#jx\n",
			    __func__, (uintmax_t) oldparent,
			    (uintmax_t) newparent));
			error = nandfs_update_parent_dir(fvp, newparent);
			if (!error) {
				fdnode->nn_inode.i_links_count--;
				fdnode->nn_flags |= IN_CHANGE;
			}
		}
		error = nandfs_remove_dirent(fdvp, fnode, fcnp);
		if (!error) {
			fnode->nn_inode.i_links_count--;
			fnode->nn_flags |= IN_CHANGE;
		}
		fnode->nn_flags &= ~IN_RENAME;
	}
	if (fdnode)
		vput(fdvp);
	if (fnode)
		vput(fvp);
	vrele(ap->a_fvp);
	return (error);

bad:
	/* Error after the source link count was bumped: undo and unwind. */
	DPRINTF(VNCALL, ("%s: error:%d\n", __func__, error));
	if (tnode)
		vput(NTOV(tnode));
	vput(NTOV(tdnode));
out:
	if (doingdirectory)
		fnode->nn_flags &= ~IN_RENAME;
	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
		fnode->nn_inode.i_links_count--;
		fnode->nn_flags |= IN_CHANGE;
		fnode->nn_flags &= ~IN_RENAME;
		vput(fvp);
	} else
		vrele(fvp);
	return (error);
}
+
+static int
+nandfs_mkdir(struct vop_mkdir_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_inode *dir_inode = &dir_node->nn_inode;
+	struct nandfs_node *node;
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	int error;
+
+	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	if (dir_inode->i_links_count >= LINK_MAX)
+		return (EMLINK);
+
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	*vpp = NTOV(node);
+
+	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode));
+	if (error) {
+		vput(*vpp);
+		return (error);
+	}
+
+	dir_node->nn_inode.i_links_count++;
+	dir_node->nn_flags |= IN_CHANGE;
+
+	error = nandfs_init_dir(NTOV(node), node->nn_ino, dir_node->nn_ino);
+	if (error) {
+		vput(NTOV(node));
+		return (error);
+	}
+
+	DPRINTF(VNCALL, ("created dir vp %p nandnode %p ino %jx\n", *vpp, node,
+	    (uintmax_t)node->nn_ino));
+	return (0);
+}
+
+static int
+nandfs_mknod(struct vop_mknod_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct vattr *vap = ap->a_vap;
+	uint16_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int error;
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+	if (vap->va_rdev != VNOVAL)
+		node->nn_inode.i_special = vap->va_rdev;
+
+	*vpp = NTOV(node);
+
+	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode))) {
+		vput(*vpp);
+		return (ENOTDIR);
+	}
+
+	node->nn_flags |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+
+	return (0);
+}
+
+static int
+nandfs_symlink(struct vop_symlink_args *ap)
+{
+	struct vnode **vpp = ap->a_vpp;
+	struct vnode *dvp = ap->a_dvp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int len, error;
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	error = nandfs_node_create(nmp, &node, S_IFLNK | mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	*vpp = NTOV(node);
+
+	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode))) {
+		vput(*vpp);
+		return (ENOTDIR);
+	}
+
+
+	len = strlen(ap->a_target);
+	error = vn_rdwr(UIO_WRITE, *vpp, ap->a_target, len, (off_t)0,
+	    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
+	    cnp->cn_cred, NOCRED, NULL, NULL);
+	if (error)
+		vput(*vpp);
+
+	return (error);
+}
+
+static int
+nandfs_readlink(struct vop_readlink_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+static int
+nandfs_rmdir(struct vop_rmdir_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vnode *dvp = ap->a_dvp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *node, *dnode;
+	uint32_t dflag, flag;
+	int error = 0;
+
+	node = VTON(vp);
+	dnode = VTON(dvp);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
+	    (dnode->nn_inode.i_flags & APPEND))
+		return (EPERM);
+
+	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx\n", __func__,
+	    dvp, vp, node, (uintmax_t)node->nn_ino));
+
+	if (node->nn_inode.i_links_count < 2)
+		return (EINVAL);
+
+	if (!nandfs_dirempty(vp, dnode->nn_ino, cnp->cn_cred))
+		return (ENOTEMPTY);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	dflag = dnode->nn_inode.i_flags;
+	flag = node->nn_inode.i_flags;
+	if ((dflag & APPEND) ||
+	    (flag & (NOUNLINK | IMMUTABLE | APPEND))) {
+		return (EPERM);
+	}
+
+	if (vp->v_mountedhere != 0)
+		return (EINVAL);
+
+	nandfs_remove_dirent(dvp, node, cnp);
+	dnode->nn_inode.i_links_count -= 1;
+	dnode->nn_flags |= IN_CHANGE;
+
+	cache_purge(dvp);
+
+	error = nandfs_truncate(vp, (uint64_t)0);
+	if (error)
+		return (error);
+
+	node->nn_inode.i_links_count -= 2;
+	node->nn_flags |= IN_CHANGE;
+
+	cache_purge(vp);
+
+	return (error);
+}
+
+static int
+nandfs_fsync(struct vop_fsync_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int locked;
+
+	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
+	    node, (uintmax_t)node->nn_ino));
+
+	/*
+	 * Start syncing vnode only if inode was modified or
+	 * there are some dirty buffers
+	 */
+	if (VTON(vp)->nn_flags & IN_MODIFIED ||
+	    vp->v_bufobj.bo_dirty.bv_cnt) {
+		locked = VOP_ISLOCKED(vp);
+		VOP_UNLOCK(vp, 0);
+		nandfs_wakeup_wait_sync(node->nn_nandfsdev, SYNCER_FSYNC);
+		VOP_LOCK(vp, locked | LK_RETRY);
+	}
+
+	return (0);
+}
+
static int
nandfs_bmap(struct vop_bmap_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *nnode = VTON(vp);
	struct nandfs_device *nandfsdev = nnode->nn_nandfsdev;
	nandfs_daddr_t l2vmap, v2pmap;
	int error;
	/* Conversion factor from filesystem blocks to DEV_BSIZE sectors. */
	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;

	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
	    nnode, (uintmax_t)nnode->nn_ino));

	if (ap->a_bop != NULL)
		*ap->a_bop = &nandfsdev->nd_devvp->v_bufobj;
	if (ap->a_bnp == NULL)
		return (0);
	/* No read-ahead/read-behind clustering is offered. */
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	/*
	 * Translate all the block sectors into a series of buffers to read
	 * asynchronously from the nandfs device. Note that this lookup may
	 * induce readin's too.
	 */

	/* Get virtual block numbers for the vnode's buffer span */
	error = nandfs_bmap_lookup(nnode, ap->a_bn, &l2vmap);
	if (error)
		/* NOTE(review): -1 is not an errno; confirm callers only
		 * test for nonzero here. */
		return (-1);

	/* Translate virtual block numbers to physical block numbers */
	error = nandfs_vtop(nnode, l2vmap, &v2pmap);
	if (error)
		return (-1);

	/* Note virtual block 0 marks not mapped */
	if (l2vmap == 0)
		*ap->a_bnp = -1;
	else
		*ap->a_bnp = v2pmap * blk2dev;	/* in DEV_BSIZE */

	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx lblk %jx -> blk %jx\n",
	    __func__, vp, nnode, (uintmax_t)nnode->nn_ino, (uintmax_t)ap->a_bn,
	    (uintmax_t)*ap->a_bnp ));

	return (0);
}
+
/*
 * Request a forced run of the nandfs syncer and block until it has
 * completed.  The flag must be set before the wakeup is issued.
 */
static void
nandfs_force_syncer(struct nandfsmount *nmp)
{

	nmp->nm_flags |= NANDFS_FORCE_SYNCER;
	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_FFORCE);
}
+
/*
 * VOP_IOCTL: nandfs management interface (checkpoint/snapshot control,
 * segment and usage statistics).  Requires PRIV_VFS_MOUNT; on read-only
 * mounts only the informational ioctls are allowed.
 */
static int
nandfs_ioctl(struct vop_ioctl_args *ap)
{
	struct vnode *vp = ap->a_vp;
	u_long command = ap->a_command;
	caddr_t data = ap->a_data;
	struct nandfs_node *node = VTON(vp);
	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
	struct nandfsmount *nmp = node->nn_nmp;
	uint64_t *tab, *cno;
	struct nandfs_seg_stat *nss;
	struct nandfs_cpmode *ncpm;
	struct nandfs_argv *nargv;
	struct nandfs_cpstat *ncp;
	int error;

	DPRINTF(VNCALL, ("%s: %x\n", __func__, (uint32_t)command));

	error = priv_check(ap->a_td, PRIV_VFS_MOUNT);
	if (error)
		return (error);

	/* On a read-only mount, permit only the GET_* queries. */
	if (nmp->nm_ronly) {
		switch (command) {
		case NANDFS_IOCTL_GET_FSINFO:
		case NANDFS_IOCTL_GET_SUSTAT:
		case NANDFS_IOCTL_GET_CPINFO:
		case NANDFS_IOCTL_GET_CPSTAT:
		case NANDFS_IOCTL_GET_SUINFO:
		case NANDFS_IOCTL_GET_VINFO:
		case NANDFS_IOCTL_GET_BDESCS:
			break;
		default:
			return (EROFS);
		}
	}

	switch (command) {
	case NANDFS_IOCTL_GET_FSINFO:
		error = nandfs_get_fsinfo(nmp, (struct nandfs_fsinfo *)data);
		break;
	case NANDFS_IOCTL_GET_SUSTAT:
		nss = (struct nandfs_seg_stat *)data;
		error = nandfs_get_seg_stat(nandfsdev, nss);
		break;
	case NANDFS_IOCTL_CHANGE_CPMODE:
		/* Toggle a checkpoint between plain and snapshot mode. */
		ncpm = (struct nandfs_cpmode *)data;
		error = nandfs_chng_cpmode(nandfsdev->nd_cp_node, ncpm);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_GET_CPINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_cpinfo_ioctl(nandfsdev->nd_cp_node, nargv);
		break;
	case NANDFS_IOCTL_DELETE_CP:
		/* data is a pair of checkpoint numbers: [start, end]. */
		tab = (uint64_t *)data;
		error = nandfs_delete_cp(nandfsdev->nd_cp_node, tab[0], tab[1]);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_GET_CPSTAT:
		ncp = (struct nandfs_cpstat *)data;
		error = nandfs_get_cpstat(nandfsdev->nd_cp_node, ncp);
		break;
	case NANDFS_IOCTL_GET_SUINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_segment_info_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_GET_VINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_dat_vinfo_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_GET_BDESCS:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_dat_bdescs_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_SYNC:
		/* Force a sync and return the resulting checkpoint number. */
		cno = (uint64_t *)data;
		nandfs_force_syncer(nmp);
		*cno = nandfsdev->nd_last_cno;
		error = 0;
		break;
	case NANDFS_IOCTL_MAKE_SNAP:
		cno = (uint64_t *)data;
		error = nandfs_make_snap(nandfsdev, cno);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_DELETE_SNAP:
		cno = (uint64_t *)data;
		error = nandfs_delete_snap(nandfsdev, *cno);
		nandfs_force_syncer(nmp);
		break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
+
+/*
+ * Whiteout vnode call
+ */
+static int
+nandfs_whiteout(struct vop_whiteout_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct componentname *cnp = ap->a_cnp;
+	int error = 0;
+
+	switch (ap->a_flags) {
+	case LOOKUP:
+		return (0);
+	case CREATE:
+		/* Create a new directory whiteout */
+#ifdef INVARIANTS
+		if ((cnp->cn_flags & SAVENAME) == 0)
+			panic("ufs_whiteout: missing name");
+#endif
+		error = nandfs_add_dirent(dvp, NANDFS_WHT_INO, cnp->cn_nameptr,
+		    cnp->cn_namelen, DT_WHT);
+		break;
+
+	case DELETE:
+		/* Remove an existing directory whiteout */
+		cnp->cn_flags &= ~DOWHITEOUT;
+		error = nandfs_remove_dirent(dvp, NULL, cnp);
+		break;
+	default:
+		panic("nandf_whiteout: unknown op: %d", ap->a_flags);
+	}
+
+	return (error);
+}
+
+static int
+nandfs_pathconf(struct vop_pathconf_args *ap)
+{
+	int error;
+
+	error = 0;
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = LINK_MAX;
+		break;
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		break;
+	case _PC_PATH_MAX:
+		*ap->a_retval = PATH_MAX;
+		break;
+	case _PC_PIPE_BUF:
+		*ap->a_retval = PIPE_BUF;
+		break;
+	case _PC_CHOWN_RESTRICTED:
+		*ap->a_retval = 1;
+		break;
+	case _PC_NO_TRUNC:
+		*ap->a_retval = 1;
+		break;
+	case _PC_ACL_EXTENDED:
+		*ap->a_retval = 0;
+		break;
+	case _PC_ALLOC_SIZE_MIN:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
+		break;
+	case _PC_FILESIZEBITS:
+		*ap->a_retval = 64;
+		break;
+	case _PC_REC_INCR_XFER_SIZE:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
+		break;
+	case _PC_REC_MAX_XFER_SIZE:
+		*ap->a_retval = -1; /* means ``unlimited'' */
+		break;
+	case _PC_REC_MIN_XFER_SIZE:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
+
/*
 * VOP_LOCK1: acquire the device-wide nandfs write lock before taking
 * the normal vnode lock, so every locked vnode also holds the
 * filesystem lock.  Paired with nandfs_vnunlock().
 */
static int
nandfs_vnlock1(struct vop_lock1_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *node = VTON(vp);
	int error, vi_locked;

	/*
	 * XXX can vnode go away while we are sleeping?
	 */
	/* Drop the interlock while sleeping on the device lock. */
	vi_locked = mtx_owned(&vp->v_interlock);
	if (vi_locked)
		VI_UNLOCK(vp);
	error = NANDFS_WRITELOCKFLAGS(node->nn_nandfsdev,
	    ap->a_flags & LK_NOWAIT);
	if (vi_locked && !error)
		VI_LOCK(vp);
	if (error)
		return (error);

	error = vop_stdlock(ap);
	if (error) {
		/* Vnode lock failed: give back the device lock too. */
		NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
		return (error);
	}

	return (0);
}
+
+static int
+nandfs_vnunlock(struct vop_unlock_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int error;
+
+	error = vop_stdunlock(ap);
+	if (error)
+		return (error);
+
+	NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
+
+	return (0);
+}
+
+/*
+ * Global vfs data structures
+ */
/*
 * Vnode operations for regular nandfs files and directories.  Locking
 * is overridden (vop_lock1/vop_unlock) so every vnode lock also takes
 * the device-wide nandfs write lock.
 */
struct vop_vector nandfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		nandfs_access,
	.vop_advlock =		nandfs_advlock,
	.vop_bmap =		nandfs_bmap,
	.vop_close =		nandfs_close,
	.vop_create =		nandfs_create,
	.vop_fsync =		nandfs_fsync,
	.vop_getattr =		nandfs_getattr,
	.vop_inactive =		nandfs_inactive,
	.vop_cachedlookup =	nandfs_lookup,
	.vop_ioctl =		nandfs_ioctl,
	.vop_link =		nandfs_link,
	.vop_lookup =		vfs_cache_lookup,
	.vop_mkdir =		nandfs_mkdir,
	.vop_mknod =		nandfs_mknod,
	.vop_open =		nandfs_open,
	.vop_pathconf =		nandfs_pathconf,
	.vop_print =		nandfs_print,
	.vop_read =		nandfs_read,
	.vop_readdir =		nandfs_readdir,
	.vop_readlink =		nandfs_readlink,
	.vop_reclaim =		nandfs_reclaim,
	.vop_remove =		nandfs_remove,
	.vop_rename =		nandfs_rename,
	.vop_rmdir =		nandfs_rmdir,
	.vop_whiteout =		nandfs_whiteout,
	.vop_write =		nandfs_write,
	.vop_setattr =		nandfs_setattr,
	.vop_strategy =		nandfs_strategy,
	.vop_symlink =		nandfs_symlink,
	.vop_lock1 =		nandfs_vnlock1,
	.vop_unlock =		nandfs_vnunlock,
};
+
/*
 * Vnode operations for nandfs internal (system) vnodes.  Only buffer
 * and lifecycle operations are supported; all name-space and attribute
 * operations panic, as they must never be invoked on these vnodes.
 */
struct vop_vector nandfs_system_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_close =		nandfs_close,
	.vop_inactive =		nandfs_inactive,
	.vop_reclaim =		nandfs_reclaim,
	.vop_strategy =		nandfs_strategy,
	.vop_fsync =		nandfs_fsync,
	.vop_bmap =		nandfs_bmap,
	.vop_access =		VOP_PANIC,
	.vop_advlock =		VOP_PANIC,
	.vop_create =		VOP_PANIC,
	.vop_getattr =		VOP_PANIC,
	.vop_cachedlookup =	VOP_PANIC,
	.vop_ioctl =		VOP_PANIC,
	.vop_link =		VOP_PANIC,
	.vop_lookup =		VOP_PANIC,
	.vop_mkdir =		VOP_PANIC,
	.vop_mknod =		VOP_PANIC,
	.vop_open =		VOP_PANIC,
	.vop_pathconf =		VOP_PANIC,
	.vop_print =		VOP_PANIC,
	.vop_read =		VOP_PANIC,
	.vop_readdir =		VOP_PANIC,
	.vop_readlink =		VOP_PANIC,
	.vop_remove =		VOP_PANIC,
	.vop_rename =		VOP_PANIC,
	.vop_rmdir =		VOP_PANIC,
	.vop_whiteout =		VOP_PANIC,
	.vop_write =		VOP_PANIC,
	.vop_setattr =		VOP_PANIC,
	.vop_symlink =		VOP_PANIC,
};
+
/*
 * VOP_CLOSE for nandfs FIFOs: update timestamps if others still hold
 * the vnode open, then let fifo_specops do the actual FIFO close.
 */
static int
nandfsfifo_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *node = VTON(vp);

	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, node));

	/* Interlock protects v_usecount while we test and stamp times. */
	mtx_lock(&vp->v_interlock);
	if (vp->v_usecount > 1)
		nandfs_itimes_locked(vp);
	mtx_unlock(&vp->v_interlock);

	return (fifo_specops.vop_close(ap));
}
+
/*
 * Vnode operations for nandfs FIFOs: defer to fifo_specops for the
 * FIFO machinery, keep nandfs attribute and lifecycle handlers.
 */
struct vop_vector nandfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		VOP_PANIC,
	.vop_access =		nandfs_access,
	.vop_close =		nandfsfifo_close,
	.vop_getattr =		nandfs_getattr,
	.vop_inactive =		nandfs_inactive,
	.vop_print =		nandfs_print,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		nandfs_reclaim,
	.vop_setattr =		nandfs_setattr,
	.vop_write =		VOP_PANIC,
	.vop_lock1 =		nandfs_vnlock1,
	.vop_unlock =		nandfs_vnunlock,
};
+
+int
+nandfs_vinit(struct vnode *vp, uint64_t ino)
+{
+	struct nandfs_node *node;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+
+	node = VTON(vp);
+
+	/* Check if we're fetching the root */
+	if (ino == NANDFS_ROOT_INO)
+		vp->v_vflag |= VV_ROOT;
+
+	if (ino != NANDFS_GC_INO)
+		vp->v_type = IFTOVT(node->nn_inode.i_mode);
+	else
+		vp->v_type = VREG;
+
+	if (vp->v_type == VFIFO)
+		vp->v_op = &nandfs_fifoops;
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfs/nfs_commonacl.c
--- a/head/sys/fs/nfs/nfs_commonacl.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfs/nfs_commonacl.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfs/nfs_commonacl.c 224086 2011-07-16 08:51:09Z zack $");
+__FBSDID("$FreeBSD: head/sys/fs/nfs/nfs_commonacl.c 235568 2012-05-17 21:52:17Z rmacklem $");
 
 #ifndef APPLEKEXT
 #include <fs/nfs/nfsport.h>
@@ -468,9 +468,7 @@
 		error = NFSERR_ATTRNOTSUPP;
 		goto out;
 	}
-	error = VOP_ACLCHECK(vp, ACL_TYPE_NFS4, aclp, cred, p);
-	if (!error)
-		error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
+	error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
 
 out:
 	NFSEXITCODE(error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clbio.c
--- a/head/sys/fs/nfsclient/nfs_clbio.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clbio.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clbio.c 233101 2012-03-17 23:03:20Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clbio.c 237987 2012-07-02 09:53:08Z kib $");
 
 #include "opt_kdtrace.h"
 
@@ -281,7 +281,11 @@
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
-	cred = curthread->td_ucred;		/* XXX */
+	/* Set the cred to n_writecred for the write rpcs. */
+	if (np->n_writecred != NULL)
+		cred = crhold(np->n_writecred);
+	else
+		cred = crhold(curthread->td_ucred);	/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
@@ -345,6 +349,7 @@
 	    iomode = NFSWRITE_FILESYNC;
 
 	error = ncl_writerpc(vp, &uio, cred, &iomode, &must_commit, 0);
+	crfree(cred);
 
 	pmap_qremove(kva, npages);
 	relpbuf(bp, &ncl_pbuf_freecnt);
@@ -717,7 +722,7 @@
 	    };
 
 	    if (n > 0) {
-		    error = uiomove(bp->b_data + on, (int)n, uio);
+		    error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
 	    }
 	    if (vp->v_type == VLNK)
 		n = 0;
@@ -892,8 +897,9 @@
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
 	int bcount;
-	int n, on, error = 0;
-	off_t tmp_off;
+	int bp_cached, n, on, error = 0, error1;
+	size_t orig_resid, local_resid;
+	off_t orig_size, tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
@@ -945,6 +951,11 @@
 			mtx_unlock(&np->n_mtx);
 	}
 
+	orig_resid = uio->uio_resid;
+	mtx_lock(&np->n_mtx);
+	orig_size = np->n_size;
+	mtx_unlock(&np->n_mtx);
+
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
@@ -1122,7 +1133,10 @@
 		 * normally.
 		 */
 
+		bp_cached = 1;
 		if (on == 0 && n == bcount) {
+			if ((bp->b_flags & B_CACHE) == 0)
+				bp_cached = 0;
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
@@ -1173,7 +1187,7 @@
 		 * significant cache coherency problems with multiple clients,
 		 * especially if locking is implemented later on.
 		 *
-		 * as an optimization we could theoretically maintain
+		 * As an optimization we could theoretically maintain
 		 * a linked list of discontinuous areas, but we would still
 		 * have to commit them separately so there isn't much
 		 * advantage to it except perhaps a bit of asynchronization.
@@ -1188,7 +1202,23 @@
 			goto again;
 		}
 
-		error = uiomove((char *)bp->b_data + on, n, uio);
+		local_resid = uio->uio_resid;
+		error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
+
+		if (error != 0 && !bp_cached) {
+			/*
+			 * This block has no other content than what
+			 * possibly was written by the faulty uiomove.
+			 * Release it, forgetting the data pages, to
+			 * prevent the leak of uninitialized data to
+			 * usermode.
+			 */
+			bp->b_ioflags |= BIO_ERROR;
+			brelse(bp);
+			uio->uio_offset -= local_resid - uio->uio_resid;
+			uio->uio_resid = local_resid;
+			break;
+		}
 
 		/*
 		 * Since this block is being modified, it must be written
@@ -1198,17 +1228,18 @@
 		 */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
-		if (error) {
-			bp->b_ioflags |= BIO_ERROR;
-			brelse(bp);
-			break;
-		}
+		/*
+		 * Get the partial update on the progress made from
+		 * uiomove, if an error occurred.
+		 */
+		if (error != 0)
+			n = local_resid - uio->uio_resid;
 
 		/*
 		 * Only update dirtyoff/dirtyend if not a degenerate
 		 * condition.
 		 */
-		if (n) {
+		if (n > 0) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
@@ -1228,17 +1259,34 @@
 		if ((ioflag & IO_SYNC)) {
 			if (ioflag & IO_INVAL)
 				bp->b_flags |= B_NOCACHE;
-			error = bwrite(bp);
-			if (error)
+			error1 = bwrite(bp);
+			if (error1 != 0) {
+				if (error == 0)
+					error = error1;
 				break;
+			}
 		} else if ((n + on) == biosize) {
 			bp->b_flags |= B_ASYNC;
 			(void) ncl_writebp(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
+
+		if (error != 0)
+			break;
 	} while (uio->uio_resid > 0 && n > 0);
 
+	if (error != 0) {
+		if (ioflag & IO_UNIT) {
+			VATTR_NULL(&vattr);
+			vattr.va_size = orig_size;
+			/* IO_SYNC is handled implicitly */
+			(void)VOP_SETATTR(vp, &vattr, cred);
+			uio->uio_offset -= orig_resid - uio->uio_resid;
+			uio->uio_resid = orig_resid;
+		}
+	}
+
 	return (error);
 }
 
@@ -1817,7 +1865,7 @@
 		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
 		 * buffer that now needs to be truncated.
 		 */
-		error = vtruncbuf(vp, cred, td, nsize, biosize);
+		error = vtruncbuf(vp, cred, nsize, biosize);
 		lbn = nsize / biosize;
 		bufsize = nsize & (biosize - 1);
 		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clnode.c
--- a/head/sys/fs/nfsclient/nfs_clnode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clnode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clnode.c 230605 2012-01-27 02:46:12Z rmacklem $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clnode.c 237244 2012-06-18 22:17:28Z rmacklem $");
 
 #include "opt_kdtrace.h"
 
@@ -210,18 +210,28 @@
 	struct nfsnode *np;
 	struct sillyrename *sp;
 	struct vnode *vp = ap->a_vp;
+	boolean_t retv;
 
 	np = VTONFS(vp);
 
 	if (NFS_ISV4(vp) && vp->v_type == VREG) {
 		/*
 		 * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4
-		 * Close operations are delayed until now. Any dirty buffers
-		 * must be flushed before the close, so that the stateid is
-		 * available for the writes.
+		 * Close operations are delayed until now. Any dirty
+		 * buffers/pages must be flushed before the close, so that the
+		 * stateid is available for the writes.
 		 */
-		(void) ncl_flush(vp, MNT_WAIT, NULL, ap->a_td, 1, 0);
-		(void) nfsrpc_close(vp, 1, ap->a_td);
+		if (vp->v_object != NULL) {
+			VM_OBJECT_LOCK(vp->v_object);
+			retv = vm_object_page_clean(vp->v_object, 0, 0,
+			    OBJPC_SYNC);
+			VM_OBJECT_UNLOCK(vp->v_object);
+		} else
+			retv = TRUE;
+		if (retv == TRUE) {
+			(void)ncl_flush(vp, MNT_WAIT, NULL, ap->a_td, 1, 0);
+			(void)nfsrpc_close(vp, 1, ap->a_td);
+		}
 	}
 
 	mtx_lock(&np->n_mtx);
@@ -257,15 +267,6 @@
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsdmap *dp, *dp2;
 
-	if (NFS_ISV4(vp) && vp->v_type == VREG)
-		/*
-		 * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4
-		 * Close operations are delayed until ncl_inactive().
-		 * However, since VOP_INACTIVE() is not guaranteed to be
-		 * called, we need to do it again here.
-		 */
-		(void) nfsrpc_close(vp, 1, ap->a_td);
-
 	/*
 	 * If the NLM is running, give it a chance to abort pending
 	 * locks.
@@ -278,6 +279,15 @@
 	 */
 	vnode_destroy_vobject(vp);
 
+	if (NFS_ISV4(vp) && vp->v_type == VREG)
+		/*
+		 * We can now safely close any remaining NFSv4 Opens for
+		 * this file. Most opens will have already been closed by
+		 * ncl_inactive(), but there are cases where it is not
+		 * called, so we need to do it again here.
+		 */
+		(void) nfsrpc_close(vp, 1, ap->a_td);
+
 	vfs_hash_remove(vp);
 
 	/*
@@ -300,6 +310,8 @@
 			FREE((caddr_t)dp2, M_NFSDIROFF);
 		}
 	}
+	if (np->n_writecred != NULL)
+		crfree(np->n_writecred);
 	FREE((caddr_t)np->n_fhp, M_NFSFH);
 	if (np->n_v4 != NULL)
 		FREE((caddr_t)np->n_v4, M_NFSV4NODE);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clvfsops.c
--- a/head/sys/fs/nfsclient/nfs_clvfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clvfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvfsops.c 234386 2012-04-17 16:28:22Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvfsops.c 237367 2012-06-21 09:26:06Z kib $");
 
 
 #include "opt_bootp.h"
@@ -1136,7 +1136,8 @@
 out:
 	if (!error) {
 		MNT_ILOCK(mp);
-		mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED);
+		mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED |
+		    MNTK_NO_IOPF;
 		MNT_IUNLOCK(mp);
 	}
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clvnops.c
--- a/head/sys/fs/nfsclient/nfs_clvnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clvnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvnops.c 233101 2012-03-17 23:03:20Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvnops.c 235332 2012-05-12 12:02:51Z rmacklem $");
 
 /*
  * vnode op calls for Sun NFS version 2, 3 and 4
@@ -513,6 +513,7 @@
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
+	struct ucred *cred;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
@@ -604,7 +605,22 @@
 		}
 		np->n_directio_opens++;
 	}
+
+	/*
+	 * If this is an open for writing, capture a reference to the
+	 * credentials, so they can be used by ncl_putpages(). Using
+	 * these write credentials is preferable to the credentials of
+	 * whatever thread happens to be doing the VOP_PUTPAGES() since
+	 * the write RPCs are less likely to fail with EACCES.
+	 */
+	if ((fmode & FWRITE) != 0) {
+		cred = np->n_writecred;
+		np->n_writecred = crhold(ap->a_cred);
+	} else
+		cred = NULL;
 	mtx_unlock(&np->n_mtx);
+	if (cred != NULL)
+		crfree(cred);
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
@@ -1546,7 +1562,10 @@
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
-		if (attrflag)
+		if (attrflag == 0)
+			error = nfsrpc_getattr(newvp, cnp->cn_cred,
+			    cnp->cn_thread, &nfsva, NULL);
+		if (error == 0)
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfsnode.h
--- a/head/sys/fs/nfsclient/nfsnode.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfsnode.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/nfsclient/nfsnode.h 230394 2012-01-20 20:02:01Z jhb $
+ * $FreeBSD: head/sys/fs/nfsclient/nfsnode.h 235332 2012-05-12 12:02:51Z rmacklem $
  */
 
 #ifndef _NFSCLIENT_NFSNODE_H_
@@ -123,6 +123,7 @@
 	int                     n_directio_asyncwr;
 	u_int64_t		 n_change;	/* old Change attribute */
 	struct nfsv4node	*n_v4;		/* extra V4 stuff */
+	struct ucred		*n_writecred;	/* Cred. for putpages */
 };
 
 #define	n_atim		n_un1.nf_atim
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsserver/nfs_nfsdport.c
--- a/head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdport.c 234482 2012-04-20 06:50:44Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdport.c 235136 2012-05-08 03:39:44Z jwd $");
 
 #include <sys/capability.h>
 
@@ -505,11 +505,10 @@
 
 out:
 	if (error) {
-		uma_zfree(namei_zone, cnp->cn_pnbuf);
+		nfsvno_relpathbuf(ndp);
 		ndp->ni_vp = NULL;
 		ndp->ni_dvp = NULL;
 		ndp->ni_startdir = NULL;
-		cnp->cn_flags &= ~HASBUF;
 	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
 		ndp->ni_dvp = NULL;
 	}
@@ -1047,6 +1046,8 @@
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
+	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
+		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
@@ -1086,6 +1087,8 @@
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
+	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
+		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsserver/nfs_nfsdstate.c
--- a/head/sys/fs/nfsserver/nfs_nfsdstate.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsserver/nfs_nfsdstate.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdstate.c 231949 2012-02-21 01:05:12Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdstate.c 235381 2012-05-12 22:20:55Z rmacklem $");
 
 #ifndef APPLEKEXT
 #include <fs/nfs/nfsport.h>
@@ -331,11 +331,13 @@
 		 * Must wait until any outstanding callback on the old clp
 		 * completes.
 		 */
+		NFSLOCKSTATE();
 		while (clp->lc_cbref) {
 			clp->lc_flags |= LCL_WAKEUPWANTED;
-			(void) tsleep((caddr_t)clp, PZERO - 1,
+			(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
 			    "nfsd clp", 10 * hz);
 		}
+		NFSUNLOCKSTATE();
 		nfsrv_zapclient(clp, p);
 		*new_clpp = NULL;
 		goto out;
@@ -385,10 +387,13 @@
 	 * Must wait until any outstanding callback on the old clp
 	 * completes.
 	 */
+	NFSLOCKSTATE();
 	while (clp->lc_cbref) {
 		clp->lc_flags |= LCL_WAKEUPWANTED;
-		(void) tsleep((caddr_t)clp, PZERO - 1, "nfsd clp", 10 * hz);
+		(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, "nfsd clp",
+		    10 * hz);
 	}
+	NFSUNLOCKSTATE();
 	nfsrv_zapclient(clp, p);
 	*new_clpp = NULL;
 
@@ -3816,11 +3821,9 @@
 	clp->lc_cbref--;
 	if ((clp->lc_flags & LCL_WAKEUPWANTED) && clp->lc_cbref == 0) {
 		clp->lc_flags &= ~LCL_WAKEUPWANTED;
-		NFSUNLOCKSTATE();
-		wakeup((caddr_t)clp);
-	} else {
-		NFSUNLOCKSTATE();
+		wakeup(clp);
 	}
+	NFSUNLOCKSTATE();
 
 	NFSEXITCODE(error);
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs.h
--- a/head/sys/fs/ntfs/ntfs.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,16 +25,16 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs.h 232100 2012-02-24 07:30:44Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs.h 236140 2012-05-27 09:34:47Z ed $
  */
 
 /*#define NTFS_DEBUG 1*/
 
-typedef u_int64_t cn_t;
-typedef u_int16_t wchar;
+typedef uint64_t cn_t;
+typedef uint16_t wchar;
 
 #pragma pack(1)
-#define BBSIZE			1024
+#define	BBSIZE			1024
 #define	BBOFF			((off_t)(0))
 #define	BBLOCK			0
 #define	NTFS_MFTINO		0
@@ -45,157 +45,157 @@
 #define	NTFS_BOOTINO		7
 #define	NTFS_BADCLUSINO		8
 #define	NTFS_UPCASEINO		10
-#define NTFS_MAXFILENAME	255
+#define	NTFS_MAXFILENAME	255
 
 struct fixuphdr {
-	u_int32_t       fh_magic;
-	u_int16_t       fh_foff;
-	u_int16_t       fh_fnum;
+	uint32_t	fh_magic;
+	uint16_t	fh_foff;
+	uint16_t	fh_fnum;
 };
 
-#define NTFS_AF_INRUN	0x00000001
+#define	NTFS_AF_INRUN	0x00000001
 struct attrhdr {
-	u_int32_t       a_type;
-	u_int32_t       reclen;
-	u_int8_t        a_flag;
-	u_int8_t        a_namelen;
-	u_int8_t        a_nameoff;
-	u_int8_t        reserved1;
-	u_int8_t        a_compression;
-	u_int8_t        reserved2;
-	u_int16_t       a_index;
+	uint32_t	a_type;
+	uint32_t	reclen;
+	uint8_t		a_flag;
+	uint8_t		a_namelen;
+	uint8_t		a_nameoff;
+	uint8_t		reserved1;
+	uint8_t		a_compression;
+	uint8_t		reserved2;
+	uint16_t	a_index;
 };
-#define NTFS_A_STD	0x10
-#define NTFS_A_ATTRLIST	0x20
-#define NTFS_A_NAME	0x30
-#define NTFS_A_VOLUMENAME	0x60
-#define NTFS_A_DATA	0x80
+#define	NTFS_A_STD	0x10
+#define	NTFS_A_ATTRLIST	0x20
+#define	NTFS_A_NAME	0x30
+#define	NTFS_A_VOLUMENAME	0x60
+#define	NTFS_A_DATA	0x80
 #define	NTFS_A_INDXROOT	0x90
 #define	NTFS_A_INDX	0xA0
-#define NTFS_A_INDXBITMAP 0xB0
+#define	NTFS_A_INDXBITMAP 0xB0
 
-#define NTFS_MAXATTRNAME	255
+#define	NTFS_MAXATTRNAME	255
 struct attr {
-	struct attrhdr  a_hdr;
+	struct attrhdr	a_hdr;
 	union {
 		struct {
-			u_int16_t       a_datalen;
-			u_int16_t       reserved1;
-			u_int16_t       a_dataoff;
-			u_int16_t       a_indexed;
-		}               a_S_r;
+			uint16_t	a_datalen;
+			uint16_t	reserved1;
+			uint16_t	a_dataoff;
+			uint16_t	a_indexed;
+		} a_S_r;
 		struct {
-			cn_t            a_vcnstart;
-			cn_t            a_vcnend;
-			u_int16_t       a_dataoff;
-			u_int16_t       a_compressalg;
-			u_int32_t       reserved1;
-			u_int64_t       a_allocated;
-			u_int64_t       a_datalen;
-			u_int64_t       a_initialized;
-		}               a_S_nr;
-	}               a_S;
+			cn_t		a_vcnstart;
+			cn_t		a_vcnend;
+			uint16_t	a_dataoff;
+			uint16_t	a_compressalg;
+			uint32_t	reserved1;
+			uint64_t	a_allocated;
+			uint64_t	a_datalen;
+			uint64_t	a_initialized;
+		} a_S_nr;
+	} a_S;
 };
-#define a_r	a_S.a_S_r
-#define a_nr	a_S.a_S_nr
+#define	a_r	a_S.a_S_r
+#define	a_nr	a_S.a_S_nr
 
 typedef struct {
-	u_int64_t       t_create;
-	u_int64_t       t_write;
-	u_int64_t       t_mftwrite;
-	u_int64_t       t_access;
-}               ntfs_times_t;
+	uint64_t	t_create;
+	uint64_t	t_write;
+	uint64_t	t_mftwrite;
+	uint64_t	t_access;
+} ntfs_times_t;
 
-#define NTFS_FFLAG_RDONLY	0x01LL
-#define NTFS_FFLAG_HIDDEN	0x02LL
-#define NTFS_FFLAG_SYSTEM	0x04LL
-#define NTFS_FFLAG_ARCHIVE	0x20LL
-#define NTFS_FFLAG_COMPRESSED	0x0800LL
-#define NTFS_FFLAG_DIR		0x10000000LL
+#define	NTFS_FFLAG_RDONLY	0x01LL
+#define	NTFS_FFLAG_HIDDEN	0x02LL
+#define	NTFS_FFLAG_SYSTEM	0x04LL
+#define	NTFS_FFLAG_ARCHIVE	0x20LL
+#define	NTFS_FFLAG_COMPRESSED	0x0800LL
+#define	NTFS_FFLAG_DIR		0x10000000LL
 
 struct attr_name {
-	u_int32_t       n_pnumber;	/* Parent ntnode */
-	u_int32_t       reserved;
-	ntfs_times_t    n_times;
-	u_int64_t       n_size;
-	u_int64_t       n_attrsz;
-	u_int64_t       n_flag;
-	u_int8_t        n_namelen;
-	u_int8_t        n_nametype;
-	u_int16_t       n_name[1];
+	uint32_t	n_pnumber;	/* Parent ntnode */
+	uint32_t	reserved;
+	ntfs_times_t	n_times;
+	uint64_t	n_size;
+	uint64_t	n_attrsz;
+	uint64_t	n_flag;
+	uint8_t		n_namelen;
+	uint8_t		n_nametype;
+	uint16_t	n_name[1];
 };
 
-#define NTFS_IRFLAG_INDXALLOC	0x00000001
+#define	NTFS_IRFLAG_INDXALLOC	0x00000001
 struct attr_indexroot {
-	u_int32_t       ir_unkn1;	/* always 0x30 */
-	u_int32_t       ir_unkn2;	/* always 0x1 */
-	u_int32_t       ir_size;/* ??? */
-	u_int32_t       ir_unkn3;	/* number of cluster */
-	u_int32_t       ir_unkn4;	/* always 0x10 */
-	u_int32_t       ir_datalen;	/* sizeof simething */
-	u_int32_t       ir_allocated;	/* same as above */
-	u_int16_t       ir_flag;/* ?? always 1 */
-	u_int16_t       ir_unkn7;
+	uint32_t	ir_unkn1;	/* always 0x30 */
+	uint32_t	ir_unkn2;	/* always 0x1 */
+	uint32_t	ir_size;/* ??? */
+	uint32_t	ir_unkn3;	/* number of cluster */
+	uint32_t	ir_unkn4;	/* always 0x10 */
+	uint32_t	ir_datalen;	/* sizeof something */
+	uint32_t	ir_allocated;	/* same as above */
+	uint16_t	ir_flag;/* ?? always 1 */
+	uint16_t	ir_unkn7;
 };
 
 struct attr_attrlist {
-	u_int32_t       al_type;	/* Attribute type */
-	u_int16_t       reclen;		/* length of this entry */
-	u_int8_t        al_namelen;	/* Attribute name len */
-	u_int8_t        al_nameoff;	/* Name offset from entry start */
-	u_int64_t       al_vcnstart;	/* VCN number */
-	u_int32_t       al_inumber;	/* Parent ntnode */
-	u_int32_t       reserved;
-	u_int16_t       al_index;	/* Attribute index in MFT record */
-	u_int16_t       al_name[1];	/* Name */
+	uint32_t	al_type;	/* Attribute type */
+	uint16_t	reclen;		/* length of this entry */
+	uint8_t		al_namelen;	/* Attribute name len */
+	uint8_t		al_nameoff;	/* Name offset from entry start */
+	uint64_t	al_vcnstart;	/* VCN number */
+	uint32_t	al_inumber;	/* Parent ntnode */
+	uint32_t	reserved;
+	uint16_t	al_index;	/* Attribute index in MFT record */
+	uint16_t	al_name[1];	/* Name */
 };
 
-#define	NTFS_INDXMAGIC	(u_int32_t)(0x58444E49)
+#define	NTFS_INDXMAGIC	(uint32_t)(0x58444E49)
 struct attr_indexalloc {
 	struct fixuphdr ia_fixup;
-	u_int64_t       unknown1;
-	cn_t            ia_bufcn;
-	u_int16_t       ia_hdrsize;
-	u_int16_t       unknown2;
-	u_int32_t       ia_inuse;
-	u_int32_t       ia_allocated;
+	uint64_t	unknown1;
+	cn_t		ia_bufcn;
+	uint16_t	ia_hdrsize;
+	uint16_t	unknown2;
+	uint32_t	ia_inuse;
+	uint32_t	ia_allocated;
 };
 
 #define	NTFS_IEFLAG_SUBNODE	0x00000001
 #define	NTFS_IEFLAG_LAST	0x00000002
 
 struct attr_indexentry {
-	u_int32_t       ie_number;
-	u_int32_t       unknown1;
-	u_int16_t       reclen;
-	u_int16_t       ie_size;
-	u_int32_t       ie_flag;/* 1 - has subnodes, 2 - last */
-	u_int32_t       ie_fpnumber;
-	u_int32_t       unknown2;
-	ntfs_times_t    ie_ftimes;
-	u_int64_t       ie_fallocated;
-	u_int64_t       ie_fsize;
-	u_int64_t       ie_fflag;
-	u_int8_t        ie_fnamelen;
-	u_int8_t        ie_fnametype;
-	wchar           ie_fname[NTFS_MAXFILENAME];
+	uint32_t	ie_number;
+	uint32_t	unknown1;
+	uint16_t	reclen;
+	uint16_t	ie_size;
+	uint32_t	ie_flag; /* 1 - has subnodes, 2 - last */
+	uint32_t	ie_fpnumber;
+	uint32_t	unknown2;
+	ntfs_times_t	ie_ftimes;
+	uint64_t	ie_fallocated;
+	uint64_t	ie_fsize;
+	uint64_t	ie_fflag;
+	uint8_t		ie_fnamelen;
+	uint8_t		ie_fnametype;
+	wchar		ie_fname[NTFS_MAXFILENAME];
 	/* cn_t		ie_bufcn;	 buffer with subnodes */
 };
 
-#define	NTFS_FILEMAGIC	(u_int32_t)(0x454C4946)
+#define	NTFS_FILEMAGIC	(uint32_t)(0x454C4946)
 #define	NTFS_BLOCK_SIZE	512
 #define	NTFS_FRFLAG_DIR	0x0002
 struct filerec {
-	struct fixuphdr fr_fixup;
-	u_int8_t        reserved[8];
-	u_int16_t       fr_seqnum;	/* Sequence number */
-	u_int16_t       fr_nlink;
-	u_int16_t       fr_attroff;	/* offset to attributes */
-	u_int16_t       fr_flags;	/* 1-nonresident attr, 2-directory */
-	u_int32_t       fr_size;/* hdr + attributes */
-	u_int32_t       fr_allocated;	/* allocated length of record */
-	u_int64_t       fr_mainrec;	/* main record */
-	u_int16_t       fr_attrnum;	/* maximum attr number + 1 ??? */
+	struct fixuphdr	fr_fixup;
+	uint8_t		reserved[8];
+	uint16_t	fr_seqnum;	/* Sequence number */
+	uint16_t	fr_nlink;
+	uint16_t	fr_attroff;	/* offset to attributes */
+	uint16_t	fr_flags;	/* 1-nonresident attr, 2-directory */
+	uint32_t	fr_size;/* hdr + attributes */
+	uint32_t	fr_allocated;	/* allocated length of record */
+	uint64_t	fr_mainrec;	/* main record */
+	uint16_t	fr_attrnum;	/* maximum attr number + 1 ??? */
 };
 
 #define	NTFS_ATTRNAME_MAXLEN	0x40
@@ -203,66 +203,66 @@
 #define	NTFS_ADFLAG_INDEX	0x0002	/* Attrib can be indexed */
 struct attrdef {
 	wchar		ad_name[NTFS_ATTRNAME_MAXLEN];
-	u_int32_t	ad_type;
-	u_int32_t	reserved1[2];
-	u_int32_t	ad_flag;
-	u_int64_t	ad_minlen;
-	u_int64_t	ad_maxlen;	/* -1 for nonlimited */
+	uint32_t	ad_type;
+	uint32_t	reserved1[2];
+	uint32_t	ad_flag;
+	uint64_t	ad_minlen;
+	uint64_t	ad_maxlen;	/* -1 for nonlimited */
 };
 
 struct ntvattrdef {
 	char		ad_name[0x40];
 	int		ad_namelen;
-	u_int32_t	ad_type;
+	uint32_t	ad_type;
 };
 
 #define	NTFS_BBID	"NTFS    "
 #define	NTFS_BBIDLEN	8
 struct bootfile {
-	u_int8_t        reserved1[3];	/* asm jmp near ... */
-	u_int8_t        bf_sysid[8];	/* 'NTFS    ' */
-	u_int16_t       bf_bps;		/* bytes per sector */
-	u_int8_t        bf_spc;		/* sectors per cluster */
-	u_int8_t        reserved2[7];	/* unused (zeroed) */
-	u_int8_t        bf_media;	/* media desc. (0xF8) */
-	u_int8_t        reserved3[2];
-	u_int16_t       bf_spt;		/* sectors per track */
-	u_int16_t       bf_heads;	/* number of heads */
-	u_int8_t        reserver4[12];
-	u_int64_t       bf_spv;		/* sectors per volume */
-	cn_t            bf_mftcn;	/* $MFT cluster number */
-	cn_t            bf_mftmirrcn;	/* $MFTMirr cn */
-	u_int8_t        bf_mftrecsz;	/* MFT record size (clust) */
+	uint8_t		reserved1[3];	/* asm jmp near ... */
+	uint8_t		bf_sysid[8];	/* 'NTFS    ' */
+	uint16_t	bf_bps;		/* bytes per sector */
+	uint8_t		bf_spc;		/* sectors per cluster */
+	uint8_t		reserved2[7];	/* unused (zeroed) */
+	uint8_t		bf_media;	/* media desc. (0xF8) */
+	uint8_t		reserved3[2];
+	uint16_t	bf_spt;		/* sectors per track */
+	uint16_t	bf_heads;	/* number of heads */
+	uint8_t		reserver4[12];
+	uint64_t	bf_spv;		/* sectors per volume */
+	cn_t		bf_mftcn;	/* $MFT cluster number */
+	cn_t		bf_mftmirrcn;	/* $MFTMirr cn */
+	uint8_t		bf_mftrecsz;	/* MFT record size (clust) */
 					/* 0xF6 inducates 1/4 */
-	u_int32_t       bf_ibsz;	/* index buffer size */
-	u_int32_t       bf_volsn;	/* volume ser. num. */
+	uint32_t	bf_ibsz;	/* index buffer size */
+	uint32_t	bf_volsn;	/* volume ser. num. */
 };
 
 #define	NTFS_SYSNODESNUM	0x0B
 struct ntfsmount {
 	struct mount   *ntm_mountp;	/* filesystem vfs structure */
-	struct bootfile ntm_bootfile;
+	struct bootfile	ntm_bootfile;
 	struct g_consumer *ntm_cp;
 	struct bufobj  *ntm_bo;
 	struct vnode   *ntm_devvp;	/* block device mounted vnode */
 	struct vnode   *ntm_sysvn[NTFS_SYSNODESNUM];
-	u_int32_t       ntm_bpmftrec;
-	uid_t           ntm_uid;
-	gid_t           ntm_gid;
-	mode_t          ntm_mode;
+	uint32_t	ntm_bpmftrec;
+	uid_t		ntm_uid;
+	gid_t		ntm_gid;
+	mode_t		ntm_mode;
 	uint64_t	ntm_flag;
 	cn_t		ntm_cfree;
 	struct ntvattrdef *ntm_ad;
 	int		ntm_adnum;
- 	wchar *		ntm_82u;	/* 8bit to Unicode */
- 	char **		ntm_u28;	/* Unicode to 8 bit */
+	wchar *		ntm_82u;	/* 8bit to Unicode */
+	char **		ntm_u28;	/* Unicode to 8 bit */
 	void *		ntm_ic_l2u;	/* Local to Unicode (iconv) */
 	void *		ntm_ic_u2l;	/* Unicode to Local (iconv) */
-	u_int8_t	ntm_multiplier; /* NTFS blockno to DEV_BSIZE sectorno */
+	uint8_t		ntm_multiplier; /* NTFS blockno to DEV_BSIZE sectorno */
 };
 
-#define ntm_mftcn	ntm_bootfile.bf_mftcn
-#define ntm_mftmirrcn	ntm_bootfile.bf_mftmirrcn
+#define	ntm_mftcn	ntm_bootfile.bf_mftcn
+#define	ntm_mftmirrcn	ntm_bootfile.bf_mftmirrcn
 #define	ntm_mftrecsz	ntm_bootfile.bf_mftrecsz
 #define	ntm_spc		ntm_bootfile.bf_spc
 #define	ntm_bps		ntm_bootfile.bf_bps
@@ -272,17 +272,17 @@
 #define	NTFS_NEXTREC(s, type) ((type)(((caddr_t) s) + (s)->reclen))
 
 /* Convert mount ptr to ntfsmount ptr. */
-#define VFSTONTFS(mp)	((struct ntfsmount *)((mp)->mnt_data))
-#define VTONT(v)	FTONT(VTOF(v))
+#define	VFSTONTFS(mp)	((struct ntfsmount *)((mp)->mnt_data))
+#define	VTONT(v)	FTONT(VTOF(v))
 #define	VTOF(v)		((struct fnode *)((v)->v_data))
 #define	FTOV(f)		((f)->f_vp)
 #define	FTONT(f)	((f)->f_ip)
-#define ntfs_cntobn(cn)	(daddr_t)((cn) * (ntmp->ntm_spc))
-#define ntfs_cntob(cn)	(off_t)((cn) * (ntmp)->ntm_spc * (ntmp)->ntm_bps)
-#define ntfs_btocn(off)	(cn_t)((off) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_btocl(off)	(cn_t)((off + ntfs_cntob(1) - 1) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_btocnoff(off)	(off_t)((off) % ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_bntob(bn)	(daddr_t)((bn) * (ntmp)->ntm_bps)
+#define	ntfs_cntobn(cn)	(daddr_t)((cn) * (ntmp->ntm_spc))
+#define	ntfs_cntob(cn)	(off_t)((cn) * (ntmp)->ntm_spc * (ntmp)->ntm_bps)
+#define	ntfs_btocn(off)	(cn_t)((off) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_btocl(off)	(cn_t)((off + ntfs_cntob(1) - 1) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_btocnoff(off)	(off_t)((off) % ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_bntob(bn)	(daddr_t)((bn) * (ntmp)->ntm_bps)
 
 #define	ntfs_bpbl	(daddr_t)((ntmp)->ntm_bps)
 
@@ -294,15 +294,15 @@
 #endif
 
 #if defined(NTFS_DEBUG)
-#define dprintf(a) printf a
+#define	dprintf(a)	printf a
 #if NTFS_DEBUG > 1
-#define ddprintf(a) printf a
+#define	ddprintf(a)	printf a
 #else
-#define ddprintf(a)	(void)0
+#define	ddprintf(a)	(void)0
 #endif
 #else
-#define dprintf(a)	(void)0
-#define ddprintf(a)	(void)0
+#define	dprintf(a)	(void)0
+#define	ddprintf(a)	(void)0
 #endif
 
 extern struct vop_vector ntfs_vnodeops;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_subr.c
--- a/head/sys/fs/ntfs/ntfs_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.c 229407 2012-01-03 19:09:01Z pfg $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.c 238315 2012-07-10 00:01:00Z attilio $
  */
 
 #include <sys/param.h>
@@ -1353,174 +1353,6 @@
 }
 
 /*
- * This is one of write routine.
- */
-int
-ntfs_writeattr_plain(
-	struct ntfsmount * ntmp,
-	struct ntnode * ip,
-	u_int32_t attrnum,	
-	char *attrname,
-	off_t roff,
-	size_t rsize,
-	void *rdata,
-	size_t * initp,
-	struct uio *uio)
-{
-	size_t          init;
-	int             error = 0;
-	off_t           off = roff, left = rsize, towrite;
-	caddr_t         data = rdata;
-	struct ntvattr *vap;
-	*initp = 0;
-
-	while (left) {
-		error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname,
-					ntfs_btocn(off), &vap);
-		if (error)
-			return (error);
-		towrite = MIN(left, ntfs_cntob(vap->va_vcnend + 1) - off);
-		ddprintf(("ntfs_writeattr_plain: o: %d, s: %d (%d - %d)\n",
-			 (u_int32_t) off, (u_int32_t) towrite,
-			 (u_int32_t) vap->va_vcnstart,
-			 (u_int32_t) vap->va_vcnend));
-		error = ntfs_writentvattr_plain(ntmp, ip, vap,
-					 off - ntfs_cntob(vap->va_vcnstart),
-					 towrite, data, &init, uio);
-		if (error) {
-			printf("ntfs_writeattr_plain: " \
-			       "ntfs_writentvattr_plain failed: o: %d, s: %d\n",
-			       (u_int32_t) off, (u_int32_t) towrite);
-			printf("ntfs_writeattr_plain: attrib: %d - %d\n",
-			       (u_int32_t) vap->va_vcnstart, 
-			       (u_int32_t) vap->va_vcnend);
-			ntfs_ntvattrrele(vap);
-			break;
-		}
-		ntfs_ntvattrrele(vap);
-		left -= towrite;
-		off += towrite;
-		data = data + towrite;
-		*initp += init;
-	}
-
-	return (error);
-}
-
-/*
- * This is one of write routine.
- *
- * ntnode should be locked.
- */
-int
-ntfs_writentvattr_plain(
-	struct ntfsmount * ntmp,
-	struct ntnode * ip,
-	struct ntvattr * vap,
-	off_t roff,
-	size_t rsize,
-	void *rdata,
-	size_t * initp,
-	struct uio *uio)
-{
-	int             error = 0;
-	off_t           off;
-	int             cnt;
-	cn_t            ccn, ccl, cn, left, cl;
-	caddr_t         data = rdata;
-	struct buf     *bp;
-	size_t          tocopy;
-
-	*initp = 0;
-
-	if ((vap->va_flag & NTFS_AF_INRUN) == 0) {
-		printf("ntfs_writevattr_plain: CAN'T WRITE RES. ATTRIBUTE\n");
-		return ENOTTY;
-	}
-
-	ddprintf(("ntfs_writentvattr_plain: data in run: %ld chains\n",
-		 vap->va_vruncnt));
-
-	off = roff;
-	left = rsize;
-	ccl = 0;
-	ccn = 0;
-	cnt = 0;
-	for (; left && (cnt < vap->va_vruncnt); cnt++) {
-		ccn = vap->va_vruncn[cnt];
-		ccl = vap->va_vruncl[cnt];
-
-		ddprintf(("ntfs_writentvattr_plain: " \
-			 "left %d, cn: 0x%x, cl: %d, off: %d\n", \
-			 (u_int32_t) left, (u_int32_t) ccn, \
-			 (u_int32_t) ccl, (u_int32_t) off));
-
-		if (ntfs_cntob(ccl) < off) {
-			off -= ntfs_cntob(ccl);
-			cnt++;
-			continue;
-		}
-		if (!ccn && ip->i_number != NTFS_BOOTINO)
-			continue; /* XXX */
-
-		ccl -= ntfs_btocn(off);
-		cn = ccn + ntfs_btocn(off);
-		off = ntfs_btocnoff(off);
-
-		while (left && ccl) {
-			/*
-			 * Always read and write single clusters at a time -
-			 * we need to avoid requesting differently-sized
-			 * blocks at the same disk offsets to avoid
-			 * confusing the buffer cache.
-			 */
-			tocopy = MIN(left, ntfs_cntob(1) - off);
-			cl = ntfs_btocl(tocopy + off);
-			KASSERT(cl == 1 && tocopy <= ntfs_cntob(1),
-			    ("single cluster limit mistake"));
-			ddprintf(("ntfs_writentvattr_plain: write: " \
-				"cn: 0x%x cl: %d, off: %d len: %d, left: %d\n",
-				(u_int32_t) cn, (u_int32_t) cl, 
-				(u_int32_t) off, (u_int32_t) tocopy, 
-				(u_int32_t) left));
-			if ((off == 0) && (tocopy == ntfs_cntob(cl)))
-			{
-				bp = getblk(ntmp->ntm_devvp, ntfs_cntobn(cn)
-					    * ntmp->ntm_multiplier,
-					    ntfs_cntob(cl), 0, 0, 0);
-				clrbuf(bp);
-			} else {
-				error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn)
-					      * ntmp->ntm_multiplier,
-					      ntfs_cntob(cl), NOCRED, &bp);
-				if (error) {
-					brelse(bp);
-					return (error);
-				}
-			}
-			if (uio)
-				uiomove(bp->b_data + off, tocopy, uio);
-			else
-				memcpy(bp->b_data + off, data, tocopy);
-			bawrite(bp);
-			data = data + tocopy;
-			*initp += tocopy;
-			off = 0;
-			left -= tocopy;
-			cn += cl;
-			ccl -= cl;
-		}
-	}
-
-	if (left) {
-		printf("ntfs_writentvattr_plain: POSSIBLE RUN ERROR\n");
-		error = EINVAL;
-	}
-
-	return (error);
-}
-
-/*
  * This is one of read routines.
  *
  * ntnode should be locked.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_subr.h
--- a/head/sys/fs/ntfs/ntfs_subr.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_subr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.h 228864 2011-12-24 15:49:52Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.h 238315 2012-07-10 00:01:00Z attilio $
  */
 
 #define	VA_LOADED		0x0001
@@ -99,8 +99,6 @@
 void ntfs_ntrele(struct ntnode *);
 void ntfs_ntput(struct ntnode *);
 int ntfs_loadntnode( struct ntfsmount *, struct ntnode * );
-int ntfs_writentvattr_plain(struct ntfsmount *, struct ntnode *, struct ntvattr *, off_t, size_t, void *, size_t *, struct uio *);
-int ntfs_writeattr_plain(struct ntfsmount *, struct ntnode *, u_int32_t, char *, off_t, size_t, void *, size_t *, struct uio *);
 void ntfs_toupper_init(void);
 void ntfs_toupper_destroy(void);
 int ntfs_toupper_use(struct mount *, struct ntfsmount *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_vfsops.c
--- a/head/sys/fs/ntfs/ntfs_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_vfsops.c 232483 2012-03-04 09:38:20Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_vfsops.c 238320 2012-07-10 00:23:25Z attilio $
  */
 
 
@@ -152,7 +152,6 @@
 ntfs_mount(struct mount *mp)
 {
 	int err = 0, error;
-	accmode_t accmode;
 	struct vnode *devvp;
 	struct nameidata ndp;
 	struct thread *td;
@@ -162,6 +161,11 @@
 	if (vfs_filteropt(mp->mnt_optnew, ntfs_opts))
 		return (EINVAL);
 
+	/* Force mount as read-only. */
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_RDONLY;
+	MNT_IUNLOCK(mp);
+
 	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)	
 		return (error);
@@ -173,11 +177,10 @@
 	if (mp->mnt_flag & MNT_UPDATE) {
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			/* Process export requests in vfs_mount.c */
-			goto success;
+			return (0);
 		} else {
 			printf("ntfs_mount(): MNT_UPDATE not supported\n");
-			err = EINVAL;
-			goto error_1;
+			return (EINVAL);
 		}
 	}
 
@@ -187,10 +190,8 @@
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	err = namei(&ndp);
-	if (err) {
-		/* can't get devvp!*/
-		goto error_1;
-	}
+	if (err)
+		return (err);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 
@@ -203,10 +204,7 @@
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
-	accmode = VREAD;
-	if ((mp->mnt_flag & MNT_RDONLY) == 0)
-		accmode |= VWRITE;
-	err = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
+	err = VOP_ACCESS(devvp, VREAD, td->td_ucred, td);
 	if (err)
 		err = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (err) {
@@ -214,52 +212,23 @@
 		return (err);
 	}
 
-	if (mp->mnt_flag & MNT_UPDATE) {
-#if 0
-		/*
-		 ********************
-		 * UPDATE
-		 ********************
-		 */
 
-		if (devvp != ntmp->um_devvp)
-			err = EINVAL;	/* needs translation */
-		vput(devvp);
-		if (err)
-			return (err);
-#endif
-	} else {
-		/*
-		 ********************
-		 * NEW MOUNT
-		 ********************
-		 */
+	/*
+	 * Since this is a new mount, we want the names for the device and
+	 * the mount point copied in.  If an error occurs, the mountpoint is
+	 * discarded by the upper level code.  Note that vfs_mount() handles
+	 * copying the mountpoint f_mntonname for us, so we don't have to do
+	 * it here unless we want to set it to something other than "path"
+	 * for some reason.
+	 */
 
-		/*
-		 * Since this is a new mount, we want the names for
-		 * the device and the mount point copied in.  If an
-		 * error occurs, the mountpoint is discarded by the
-		 * upper level code.  Note that vfs_mount() handles
-		 * copying the mountpoint f_mntonname for us, so we
-		 * don't have to do it here unless we want to set it
-		 * to something other than "path" for some rason.
-		 */
-		/* Save "mounted from" info for mount point (NULL pad)*/
+	err = ntfs_mountfs(devvp, mp, td);
+	if (err == 0) {
+
+		/* Save "mounted from" info for mount point. */
 		vfs_mountedfrom(mp, from);
-
-		err = ntfs_mountfs(devvp, mp, td);
-	}
-	if (err) {
+	} else
 		vrele(devvp);
-		return (err);
-	}
-
-	goto success;
-
-error_1:	/* no state to back out*/
-	/* XXX: missing NDFREE(&ndp, ...) */
-
-success:
 	return (err);
 }
 
@@ -275,13 +244,12 @@
 	struct buf *bp;
 	struct ntfsmount *ntmp;
 	struct cdev *dev = devvp->v_rdev;
-	int error, ronly, i, v;
+	int error, i, v;
 	struct vnode *vp;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	char *cs_ntfs, *cs_local;
 
-	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	DROP_GIANT();
 	g_topology_lock();
 
@@ -296,7 +264,7 @@
  	if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) 
 		error = EPERM;
 	else 
-		error = g_vfs_open(devvp, &cp, "ntfs", ronly ? 0 : 1);
+		error = g_vfs_open(devvp, &cp, "ntfs", 0);
 
 	g_topology_unlock();
 	PICKUP_GIANT();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_vnops.c
--- a/head/sys/fs/ntfs/ntfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_vnops.c 228864 2011-12-24 15:49:52Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_vnops.c 238315 2012-07-10 00:01:00Z attilio $
  *
  */
 
@@ -67,7 +67,6 @@
 #include <sys/unistd.h> /* for pathconf(2) constants */
 
 static vop_read_t	ntfs_read;
-static vop_write_t	ntfs_write;
 static vop_getattr_t	ntfs_getattr;
 static vop_inactive_t	ntfs_inactive;
 static vop_reclaim_t	ntfs_reclaim;
@@ -78,7 +77,6 @@
 static vop_close_t	ntfs_close;
 static vop_readdir_t	ntfs_readdir;
 static vop_cachedlookup_t	ntfs_lookup;
-static vop_fsync_t	ntfs_fsync;
 static vop_pathconf_t	ntfs_pathconf;
 static vop_vptofh_t	ntfs_vptofh;
 
@@ -272,6 +270,7 @@
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	struct ntfsmount *ntmp = ip->i_mp;
+	u_int32_t toread;
 	int error;
 
 	dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n",
@@ -281,99 +280,33 @@
 	dprintf(("strategy: bcount: %d flags: 0x%x\n", 
 		(u_int32_t)bp->b_bcount,bp->b_flags));
 
-	if (bp->b_iocmd == BIO_READ) {
-		u_int32_t toread;
+	KASSERT(bp->b_iocmd == BIO_READ, ("Invalid buffer\n"));
 
-		if (ntfs_cntob(bp->b_blkno) >= fp->f_size) {
-			clrbuf(bp);
-			error = 0;
-		} else {
-			toread = MIN(bp->b_bcount,
-				 fp->f_size-ntfs_cntob(bp->b_blkno));
-			dprintf(("ntfs_strategy: toread: %d, fsize: %d\n",
-				toread,(u_int32_t)fp->f_size));
+	if (ntfs_cntob(bp->b_blkno) >= fp->f_size) {
+		clrbuf(bp);
+		error = 0;
+	} else {
+		toread = MIN(bp->b_bcount,
+			 fp->f_size-ntfs_cntob(bp->b_blkno));
+		dprintf(("ntfs_strategy: toread: %d, fsize: %d\n",
+			toread,(u_int32_t)fp->f_size));
 
-			error = ntfs_readattr(ntmp, ip, fp->f_attrtype,
-				fp->f_attrname, ntfs_cntob(bp->b_blkno),
-				toread, bp->b_data, NULL);
+		error = ntfs_readattr(ntmp, ip, fp->f_attrtype,
+			fp->f_attrname, ntfs_cntob(bp->b_blkno),
+			toread, bp->b_data, NULL);
 
-			if (error) {
-				printf("ntfs_strategy: ntfs_readattr failed\n");
-				bp->b_error = error;
-				bp->b_ioflags |= BIO_ERROR;
-			}
+		if (error) {
+			printf("ntfs_strategy: ntfs_readattr failed\n");
+			bp->b_error = error;
+			bp->b_ioflags |= BIO_ERROR;
+		}
 
-			bzero(bp->b_data + toread, bp->b_bcount - toread);
-		}
-	} else {
-		size_t tmp;
-		u_int32_t towrite;
-
-		if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) {
-			printf("ntfs_strategy: CAN'T EXTEND FILE\n");
-			bp->b_error = error = EFBIG;
-			bp->b_ioflags |= BIO_ERROR;
-		} else {
-			towrite = MIN(bp->b_bcount,
-				fp->f_size-ntfs_cntob(bp->b_blkno));
-			dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n",
-				towrite,(u_int32_t)fp->f_size));
-
-			error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,	
-				fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite,
-				bp->b_data, &tmp, NULL);
-
-			if (error) {
-				printf("ntfs_strategy: ntfs_writeattr fail\n");
-				bp->b_error = error;
-				bp->b_ioflags |= BIO_ERROR;
-			}
-		}
+		bzero(bp->b_data + toread, bp->b_bcount - toread);
 	}
 	bufdone(bp);
 	return (0);
 }
 
-static int
-ntfs_write(ap)
-	struct vop_write_args /* {
-		struct vnode *a_vp;
-		struct uio *a_uio;
-		int  a_ioflag;
-		struct ucred *a_cred;
-	} */ *ap;
-{
-	register struct vnode *vp = ap->a_vp;
-	register struct fnode *fp = VTOF(vp);
-	register struct ntnode *ip = FTONT(fp);
-	struct uio *uio = ap->a_uio;
-	struct ntfsmount *ntmp = ip->i_mp;
-	u_int64_t towrite;
-	size_t written;
-	int error;
-
-	dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg));
-	dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size));
-
-	if (uio->uio_resid + uio->uio_offset > fp->f_size) {
-		printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n");
-		return (EFBIG);
-	}
-
-	towrite = MIN(uio->uio_resid, fp->f_size - uio->uio_offset);
-
-	dprintf((", towrite: %d\n",(u_int32_t)towrite));
-
-	error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,
-		fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio);
-#ifdef NTFS_DEBUG
-	if (error)
-		printf("ntfs_write: ntfs_writeattr failed: %d\n", error);
-#endif
-
-	return (error);
-}
-
 int
 ntfs_access(ap)
 	struct vop_access_args /* {
@@ -390,7 +323,7 @@
 	dprintf(("ntfs_access: %d\n",ip->i_number));
 
 	/*
-	 * Disallow write attempts on read-only filesystems;
+	 * Disallow write attempts as we assume read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
@@ -399,8 +332,8 @@
 		case VDIR:
 		case VLNK:
 		case VREG:
-			if (vp->v_mount->mnt_flag & MNT_RDONLY)
-				return (EROFS);
+			return (EROFS);
+		default:
 			break;
 		}
 	}
@@ -493,8 +426,13 @@
 
 	/* Simulate . in every dir except ROOT */
 	if( ip->i_number != NTFS_ROOTINO ) {
-		struct dirent dot = { NTFS_ROOTINO,
-				sizeof(struct dirent), DT_DIR, 1, "." };
+		struct dirent dot = {
+			.d_fileno = NTFS_ROOTINO,
+			.d_reclen = sizeof(struct dirent),
+			.d_type = DT_DIR,
+			.d_namlen = 1,
+			.d_name = "."
+		};
 
 		if( uio->uio_offset < sizeof(struct dirent) ) {
 			dot.d_fileno = ip->i_number;
@@ -508,8 +446,13 @@
 
 	/* Simulate .. in every dir including ROOT */
 	if( uio->uio_offset < 2 * sizeof(struct dirent) ) {
-		struct dirent dotdot = { NTFS_ROOTINO,
-				sizeof(struct dirent), DT_DIR, 2, ".." };
+		struct dirent dotdot = {
+			.d_fileno = NTFS_ROOTINO,
+			.d_reclen = sizeof(struct dirent),
+			.d_type = DT_DIR,
+			.d_namlen = 2,
+			.d_name = ".."
+		};
 
 		error = uiomove((char *)&dotdot,sizeof(struct dirent),uio);
 		if(error)
@@ -620,7 +563,6 @@
 		return (error);
 
 	if ((cnp->cn_flags & ISLASTCN) &&
-	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
@@ -669,24 +611,6 @@
 }
 
 /*
- * Flush the blocks of a file to disk.
- *
- * This function is worthless for vnodes that represent directories. Maybe we
- * could just do a sync if they try an fsync on a directory file.
- */
-static int
-ntfs_fsync(ap)
-	struct vop_fsync_args /* {
-		struct vnode *a_vp;
-		struct ucred *a_cred;
-		int a_waitfor;
-		struct thread *a_td;
-	} */ *ap;
-{
-	return (0);
-}
-
-/*
  * Return POSIX pathconf information applicable to NTFS filesystem
  */
 int
@@ -746,7 +670,6 @@
 	.vop_bmap =		ntfs_bmap,
 	.vop_cachedlookup =	ntfs_lookup,
 	.vop_close =		ntfs_close,
-	.vop_fsync =		ntfs_fsync,
 	.vop_getattr =		ntfs_getattr,
 	.vop_inactive =		ntfs_inactive,
 	.vop_lookup =		vfs_cache_lookup,
@@ -756,6 +679,5 @@
 	.vop_readdir =		ntfs_readdir,
 	.vop_reclaim =		ntfs_reclaim,
 	.vop_strategy =		ntfs_strategy,
-	.vop_write =		ntfs_write,
 	.vop_vptofh =		ntfs_vptofh,
 };
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nullfs/null_vnops.c
--- a/head/sys/fs/nullfs/null_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nullfs/null_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  *	...and...
  *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
  *
- * $FreeBSD: head/sys/fs/nullfs/null_vnops.c 232303 2012-02-29 15:15:36Z kib $
+ * $FreeBSD: head/sys/fs/nullfs/null_vnops.c 234607 2012-04-23 14:10:34Z trasz $
  */
 
 /*
@@ -678,7 +678,6 @@
 null_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	struct thread *td = ap->a_td;
 
 	vp->v_object = NULL;
 
@@ -686,7 +685,7 @@
 	 * If this is the last reference, then free up the vnode
 	 * so as not to tie up the lower vnodes.
 	 */
-	vrecycle(vp, td);
+	vrecycle(vp);
 
 	return (0);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/portalfs/portal_vnops.c
--- a/head/sys/fs/portalfs/portal_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/portalfs/portal_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  *
  *	@(#)portal_vnops.c	8.14 (Berkeley) 5/21/95
  *
- * $FreeBSD: head/sys/fs/portalfs/portal_vnops.c 226497 2011-10-18 07:31:49Z des $
+ * $FreeBSD: head/sys/fs/portalfs/portal_vnops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 /*
@@ -110,7 +110,7 @@
 	char *pname = cnp->cn_nameptr;
 	struct portalnode *pt;
 	int error;
-	struct vnode *fvp = 0;
+	struct vnode *fvp = NULL;
 	char *path;
 	int size;
 
@@ -217,14 +217,14 @@
 		struct thread *a_td;
 	} */ *ap;
 {
-	struct socket *so = 0;
+	struct socket *so = NULL;
 	struct portalnode *pt;
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct uio auio;
 	struct iovec aiov[2];
 	int res;
-	struct mbuf *cm = 0;
+	struct mbuf *cm = NULL;
 	struct cmsghdr *cmsg;
 	int newfds;
 	int *ip;
@@ -356,7 +356,7 @@
 
 	len = auio.uio_resid = sizeof(int);
 	do {
-		struct mbuf *m = 0;
+		struct mbuf *m = NULL;
 		int flags = MSG_WAITALL;
 		error = soreceive(so, (struct sockaddr **) 0, &auio,
 					&m, &cm, &flags);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/smbfs/smbfs_node.c
--- a/head/sys/fs/smbfs/smbfs_node.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/smbfs/smbfs_node.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/smbfs/smbfs_node.c 227293 2011-11-07 06:44:47Z ed $
+ * $FreeBSD: head/sys/fs/smbfs/smbfs_node.c 238539 2012-07-16 22:07:29Z brueffer $
  */
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -223,19 +223,16 @@
 	if (fap == NULL)
 		return ENOENT;
 
-	np = malloc(sizeof *np, M_SMBNODE, M_WAITOK);
 	error = getnewvnode("smbfs", mp, &smbfs_vnodeops, &vp);
-	if (error) {
-		free(np, M_SMBNODE);
-		return error;
-	}
+	if (error != 0)
+		return (error);
 	error = insmntque(vp, mp);	/* XXX: Too early for mpsafe fs */
-	if (error != 0) {
-		free(np, M_SMBNODE);
+	if (error != 0)
 		return (error);
-	}
+
+	np = malloc(sizeof *np, M_SMBNODE, M_WAITOK | M_ZERO);
+
 	vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG;
-	bzero(np, sizeof(*np));
 	vp->v_data = np;
 	np->n_vnode = vp;
 	np->n_mount = VFSTOSMBFS(mp);
@@ -373,7 +370,7 @@
 		smbfs_attr_cacheremove(vp);
 	}
 	if (np->n_flag & NGONE)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (0);
 }
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/tmpfs/tmpfs_vnops.c
--- a/head/sys/fs/tmpfs/tmpfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/tmpfs/tmpfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -34,7 +34,7 @@
  * tmpfs vnode interface.
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/tmpfs/tmpfs_vnops.c 234064 2012-04-09 17:05:18Z attilio $");
+__FBSDID("$FreeBSD: head/sys/fs/tmpfs/tmpfs_vnops.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
@@ -1577,7 +1577,6 @@
 tmpfs_inactive(struct vop_inactive_args *v)
 {
 	struct vnode *vp = v->a_vp;
-	struct thread *l = v->a_td;
 
 	struct tmpfs_node *node;
 
@@ -1586,7 +1585,7 @@
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (node->tn_links == 0)
-		vrecycle(vp, l);
+		vrecycle(vp);
 
 	return 0;
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/udf/udf_vfsops.c
--- a/head/sys/fs/udf/udf_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/udf/udf_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/udf/udf_vfsops.c 222167 2011-05-22 01:07:54Z rmacklem $
+ * $FreeBSD: head/sys/fs/udf/udf_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 /* udf_vfsops.c */
@@ -190,7 +190,7 @@
 {
 	struct vnode *devvp;	/* vnode of the mount device */
 	struct thread *td;
-	struct udf_mnt *imp = 0;
+	struct udf_mnt *imp = NULL;
 	struct vfsoptlist *opts;
 	char *fspec, *cs_disk, *cs_local;
 	int error, len, *udf_flags;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_subr.c
--- a/head/sys/fs/unionfs/union_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * Copyright (c) 1994 Jan-Simon Pendry
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
- * $FreeBSD: head/sys/fs/unionfs/union_subr.c 232701 2012-03-08 20:27:20Z jhb $
+ * $FreeBSD: head/sys/fs/unionfs/union_subr.c 235503 2012-05-16 10:44:09Z gleb $
  */
 
 #include <sys/param.h>
@@ -350,19 +350,22 @@
 	uvp = unp->un_uppervp;
 	dvp = unp->un_dvp;
 	unp->un_lowervp = unp->un_uppervp = NULLVP;
-
 	vp->v_vnlock = &(vp->v_lock);
 	vp->v_data = NULL;
-	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(vp));
+	vp->v_object = NULL;
+	VI_UNLOCK(vp);
+
 	if (lvp != NULLVP)
-		VOP_UNLOCK(lvp, 0);
+		VOP_UNLOCK(lvp, LK_RELEASE);
 	if (uvp != NULLVP)
-		VOP_UNLOCK(uvp, 0);
-	vp->v_object = NULL;
+		VOP_UNLOCK(uvp, LK_RELEASE);
 
 	if (dvp != NULLVP && unp->un_hash.le_prev != NULL)
 		unionfs_rem_cached_vnode(unp, dvp);
 
+	if (lockmgr(vp->v_vnlock, LK_EXCLUSIVE, VI_MTX(vp)) != 0)
+		panic("the lock for deletion is unacquirable.");
+
 	if (lvp != NULLVP) {
 		vfslocked = VFS_LOCK_GIANT(lvp->v_mount);
 		vrele(lvp);
@@ -550,7 +553,7 @@
 		cn->cn_flags |= (cnp->cn_flags & SAVESTART);
 
 	vref(dvp);
-	VOP_UNLOCK(dvp, 0);
+	VOP_UNLOCK(dvp, LK_RELEASE);
 
 	if ((error = relookup(dvp, vpp, cn))) {
 		uma_zfree(namei_zone, cn->cn_pnbuf);
@@ -957,7 +960,7 @@
 	*vpp = vp;
 
 unionfs_vn_create_on_upper_free_out1:
-	VOP_UNLOCK(udvp, 0);
+	VOP_UNLOCK(udvp, LK_RELEASE);
 
 unionfs_vn_create_on_upper_free_out2:
 	if (cn.cn_flags & HASBUF) {
@@ -1181,7 +1184,7 @@
 		edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
 		for (dp = (struct dirent*)buf; !error && dp < edp;
 		     dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
-			if (dp->d_type == DT_WHT ||
+			if (dp->d_type == DT_WHT || dp->d_fileno == 0 ||
 			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
 			    (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
 				continue;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_vfsops.c
--- a/head/sys/fs/unionfs/union_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,8 +1,8 @@
 /*-
  * Copyright (c) 1994, 1995 The Regents of the University of California.
  * Copyright (c) 1994, 1995 Jan-Simon Pendry.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_vfsops.c	8.20 (Berkeley) 5/20/95
- * $FreeBSD: head/sys/fs/unionfs/union_vfsops.c 232918 2012-03-13 10:04:13Z kevlo $
+ * $FreeBSD: head/sys/fs/unionfs/union_vfsops.c 234867 2012-05-01 07:46:30Z daichi $
  */
 
 #include <sys/param.h>
@@ -165,7 +165,7 @@
 		uid = va.va_uid;
 		gid = va.va_gid;
 	}
-	VOP_UNLOCK(mp->mnt_vnodecovered, 0);
+	VOP_UNLOCK(mp->mnt_vnodecovered, LK_RELEASE);
 	if (error)
 		return (error);
 
@@ -250,7 +250,7 @@
 	 * Save reference
 	 */
 	if (below) {
-		VOP_UNLOCK(upperrootvp, 0);
+		VOP_UNLOCK(upperrootvp, LK_RELEASE);
 		vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY);
 		ump->um_lowervp = upperrootvp;
 		ump->um_uppervp = lowerrootvp;
@@ -281,7 +281,7 @@
 	/*
 	 * Unlock the node
 	 */
-	VOP_UNLOCK(ump->um_uppervp, 0);
+	VOP_UNLOCK(ump->um_uppervp, LK_RELEASE);
 
 	/*
 	 * Get the unionfs root vnode.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_vnops.c
--- a/head/sys/fs/unionfs/union_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry.
  * Copyright (c) 1992, 1993, 1994, 1995
  *      The Regents of the University of California.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_vnops.c	8.32 (Berkeley) 6/23/95
- * $FreeBSD: head/sys/fs/unionfs/union_vnops.c 226234 2011-10-10 21:32:08Z trasz $
+ * $FreeBSD: head/sys/fs/unionfs/union_vnops.c 234944 2012-05-03 07:22:29Z daichi $
  *
  */
 
@@ -75,21 +75,6 @@
 	KASSERT(((vp)->v_op == &unionfs_vnodeops), \
 	    ("unionfs: it is not unionfs-vnode"))
 
-/* lockmgr lock <-> reverse table */
-struct lk_lr_table {
-	int	lock;
-	int	revlock;
-};
-
-static struct lk_lr_table un_llt[] = {
-	{LK_SHARED, LK_RELEASE},
-	{LK_EXCLUSIVE, LK_RELEASE},
-	{LK_UPGRADE, LK_DOWNGRADE},
-	{LK_DOWNGRADE, LK_UPGRADE},
-	{0, 0}
-};
-
-
 static int
 unionfs_lookup(struct vop_cachedlookup_args *ap)
 {
@@ -141,7 +126,7 @@
 		if (udvp != NULLVP) {
 			dtmpvp = udvp;
 			if (ldvp != NULLVP)
-				VOP_UNLOCK(ldvp, 0);
+				VOP_UNLOCK(ldvp, LK_RELEASE);
 		}
 		else
 			dtmpvp = ldvp;
@@ -149,7 +134,7 @@
 		error = VOP_LOOKUP(dtmpvp, &vp, cnp);
 
 		if (dtmpvp == udvp && ldvp != NULLVP) {
-			VOP_UNLOCK(udvp, 0);
+			VOP_UNLOCK(udvp, LK_RELEASE);
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		}
 
@@ -161,10 +146,10 @@
 			 */
 			if (nameiop == DELETE  || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
-				VOP_UNLOCK(vp, 0);
+				VOP_UNLOCK(vp, LK_RELEASE);
 			vrele(vp);
 
-			VOP_UNLOCK(dvp, 0);
+			VOP_UNLOCK(dvp, LK_RELEASE);
 			*(ap->a_vpp) = dunp->un_dvp;
 			vref(dunp->un_dvp);
 
@@ -202,7 +187,7 @@
 			}
 			if (nameiop == DELETE || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
-				VOP_UNLOCK(uvp, 0);
+				VOP_UNLOCK(uvp, LK_RELEASE);
 		}
 
 		/* check whiteout */
@@ -246,7 +231,7 @@
 				return (lerror);
 			}
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
-				VOP_UNLOCK(lvp, 0);
+				VOP_UNLOCK(lvp, LK_RELEASE);
 		}
 	}
 
@@ -281,7 +266,7 @@
 			goto unionfs_lookup_out;
 
 		if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK))
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 		if (LK_EXCLUSIVE != VOP_ISLOCKED(vp)) {
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			lockflag = 1;
@@ -289,7 +274,7 @@
 		error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount),
 		    udvp, VTOUNIONFS(vp), cnp, td);
 		if (lockflag != 0)
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir.");
 			if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE)
@@ -386,7 +371,7 @@
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
@@ -460,7 +445,7 @@
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
@@ -564,6 +549,7 @@
 	struct unionfs_node_status *unsp;
 	struct ucred   *cred;
 	struct thread  *td;
+	struct vnode   *vp;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n");
@@ -571,12 +557,14 @@
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	locked = 0;
-	unp = VTOUNIONFS(ap->a_vp);
+	vp = ap->a_vp;
+	unp = VTOUNIONFS(vp);
 	cred = ap->a_cred;
 	td = ap->a_td;
 
-	if (VOP_ISLOCKED(ap->a_vp) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
+	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		locked = 1;
 	}
 	unionfs_get_node_status(unp, td, &unsp);
@@ -599,7 +587,7 @@
 	if (error != 0)
 		goto unionfs_close_abort;
 
-	ap->a_vp->v_object = ovp->v_object;
+	vp->v_object = ovp->v_object;
 
 	if (ovp == unp->un_uppervp) {
 		unsp->uns_upper_opencnt--;
@@ -610,7 +598,7 @@
 				unsp->uns_lower_opencnt--;
 			}
 			if (unsp->uns_lower_opencnt > 0)
-				ap->a_vp->v_object = unp->un_lowervp->v_object;
+				vp->v_object = unp->un_lowervp->v_object;
 		}
 	} else
 		unsp->uns_lower_opencnt--;
@@ -619,7 +607,7 @@
 	unionfs_tryrem_node_status(unp, unsp);
 
 	if (locked != 0)
-		VOP_UNLOCK(ap->a_vp, 0);
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error);
 
@@ -914,7 +902,7 @@
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
-	VOP_UNLOCK(ap->a_vp, 0);
+	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
@@ -941,7 +929,7 @@
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
-	VOP_UNLOCK(ap->a_vp, 0);
+	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
@@ -1001,7 +989,7 @@
 		ump = NULL;
 		vp = uvp = lvp = NULLVP;
 		/* search vnode */
-		VOP_UNLOCK(ap->a_vp, 0);
+		VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 		error = unionfs_relookup(udvp, &vp, cnp, &cn, td,
 		    cnp->cn_nameptr, strlen(cnp->cn_nameptr), DELETE);
 		if (error != 0 && error != ENOENT) {
@@ -1204,7 +1192,7 @@
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td);
-			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
@@ -1212,7 +1200,7 @@
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td);
-			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
@@ -1269,13 +1257,13 @@
 		if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0)
 			goto unionfs_rename_abort;
 		error = unionfs_relookup_for_delete(fdvp, fcnp, td);
-		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(fdvp, LK_RELEASE);
 		if (error != 0)
 			goto unionfs_rename_abort;
 
 		/* Locke of tvp is canceled in order to avoid recursive lock. */
 		if (tvp != NULLVP && tvp != tdvp)
-			VOP_UNLOCK(tvp, 0);
+			VOP_UNLOCK(tvp, LK_RELEASE);
 		error = unionfs_relookup_for_rename(tdvp, tcnp, td);
 		if (tvp != NULLVP && tvp != tdvp)
 			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
@@ -1293,11 +1281,11 @@
 	}
 
 	if (ltdvp != NULLVP)
-		VOP_UNLOCK(ltdvp, 0);
+		VOP_UNLOCK(ltdvp, LK_RELEASE);
 	if (tdvp != rtdvp)
 		vrele(tdvp);
 	if (ltvp != NULLVP)
-		VOP_UNLOCK(ltvp, 0);
+		VOP_UNLOCK(ltvp, LK_RELEASE);
 	if (tvp != rtvp && tvp != NULLVP) {
 		if (rtvp == NULLVP)
 			vput(tvp);
@@ -1371,7 +1359,7 @@
 		}
 
 		if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) {
-			VOP_UNLOCK(uvp, 0);
+			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
@@ -1427,7 +1415,9 @@
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
-		error = VOP_RMDIR(udvp, uvp, cnp);
+		error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td);
+		if (!error)
+			error = VOP_RMDIR(udvp, uvp, cnp);
 	}
 	else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path);
@@ -1467,7 +1457,7 @@
 	if (udvp != NULLVP) {
 		error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target);
 		if (error == 0) {
-			VOP_UNLOCK(uvp, 0);
+			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
@@ -1487,9 +1477,11 @@
 	int		error;
 	int		eofflag;
 	int		locked;
+	int		uio_offset_bk;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct uio     *uio;
+	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
@@ -1505,17 +1497,42 @@
 	error = 0;
 	eofflag = 0;
 	locked = 0;
-	unp = VTOUNIONFS(ap->a_vp);
+	uio_offset_bk = 0;
 	uio = ap->a_uio;
-	uvp = unp->un_uppervp;
-	lvp = unp->un_lowervp;
+	uvp = NULLVP;
+	lvp = NULLVP;
 	td = uio->uio_td;
 	ncookies_bk = 0;
 	cookies_bk = NULL;
 
-	if (ap->a_vp->v_type != VDIR)
+	vp = ap->a_vp;
+	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
+	/* check the open count. unionfs needs to open before readdir. */
+	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		locked = 1;
+	}
+	unp = VTOUNIONFS(vp);
+	if (unp == NULL)
+		error = EBADF;
+	else {
+		uvp = unp->un_uppervp;
+		lvp = unp->un_lowervp;
+		unionfs_get_node_status(unp, td, &unsp);
+		if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
+			(lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
+			unionfs_tryrem_node_status(unp, unsp);
+			error = EBADF;
+		}
+	}
+	if (locked)
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
+	if (error != 0)
+		goto unionfs_readdir_exit;
+
 	/* check opaque */
 	if (uvp != NULLVP && lvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, &va, ap->a_cred)) != 0)
@@ -1524,22 +1541,6 @@
 			lvp = NULLVP;
 	}
 
-	/* check the open count. unionfs needs to open before readdir. */
-	if (VOP_ISLOCKED(ap->a_vp) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY);
-		locked = 1;
-	}
-	unionfs_get_node_status(unp, td, &unsp);
-	if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
-	    (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
-		unionfs_tryrem_node_status(unp, unsp);
-		error = EBADF;
-	}
-	if (locked == 1)
-		vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY);
-	if (error != 0)
-		goto unionfs_readdir_exit;
-
 	/* upper only */
 	if (uvp != NULLVP && lvp == NULLVP) {
 		error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag,
@@ -1576,7 +1577,7 @@
 		unsp->uns_readdir_status = 1;
 
 		/*
-		 * ufs(and other fs) needs size of uio_resid larger than
+		 * UFS(and other FS) needs size of uio_resid larger than
 		 * DIRBLKSIZ.
 		 * size of DIRBLKSIZ equals DEV_BSIZE.
 		 * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h)
@@ -1585,7 +1586,7 @@
 			goto unionfs_readdir_exit;
 
 		/*
-		 * backup cookies
+		 * Backup cookies.
 		 * It prepares to readdir in lower.
 		 */
 		if (ap->a_ncookies != NULL) {
@@ -1601,6 +1602,11 @@
 	/* initialize for readdir in lower */
 	if (unsp->uns_readdir_status == 1) {
 		unsp->uns_readdir_status = 2;
+		/*
+		 * Backup uio_offset. See the comment after the
+		 * VOP_READDIR call on the lower layer.
+		 */
+		uio_offset_bk = uio->uio_offset;
 		uio->uio_offset = 0;
 	}
 
@@ -1612,6 +1618,19 @@
 	error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 			    ap->a_ncookies, ap->a_cookies);
 
+	/*
+	 * We can't return an uio_offset of 0: this would trigger an
+	 * infinite loop, because the next call to unionfs_readdir would
+	 * always restart with the upper layer (uio_offset == 0) and
+	 * always return some data.
+	 *
+	 * This happens when the lower layer root directory is removed.
+	 * (A root directory deleting of unionfs should not be permitted.
+	 *  But current VFS can not do it.)
+	 */
+	if (uio->uio_offset == 0)
+		uio->uio_offset = uio_offset_bk;
+
 	if (cookies_bk != NULL) {
 		/* merge cookies */
 		int		size;
@@ -1623,7 +1642,7 @@
 		pos = newcookies;
 
 		memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long));
-		pos += ncookies_bk * sizeof(u_long);
+		pos += ncookies_bk;
 		memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long));
 		free(cookies_bk, M_TEMP);
 		free(*(ap->a_cookies), M_TEMP);
@@ -1702,7 +1721,7 @@
 unionfs_inactive(struct vop_inactive_args *ap)
 {
 	ap->a_vp->v_object = NULL;
-	vrecycle(ap->a_vp, ap->a_td);
+	vrecycle(ap->a_vp);
 	return (0);
 }
 
@@ -1743,18 +1762,66 @@
 }
 
 static int
-unionfs_get_llt_revlock(int flags)
+unionfs_islocked(struct vop_islocked_args *ap)
 {
-	int count;
-
-	flags &= LK_TYPE_MASK;
-	for (count = 0; un_llt[count].lock != 0; count++) {
-		if (flags == un_llt[count].lock) {
-			return un_llt[count].revlock;
-		}
+	struct unionfs_node *unp;
+
+	KASSERT_UNIONFS_VNODE(ap->a_vp);
+
+	unp = VTOUNIONFS(ap->a_vp);
+	if (unp == NULL)
+		return (vop_stdislocked(ap));
+
+	if (unp->un_uppervp != NULLVP)
+		return (VOP_ISLOCKED(unp->un_uppervp));
+	if (unp->un_lowervp != NULLVP)
+		return (VOP_ISLOCKED(unp->un_lowervp));
+	return (vop_stdislocked(ap));
+}
+
+static int
+unionfs_get_llt_revlock(struct vnode *vp, int flags)
+{
+	int revlock;
+
+	revlock = 0;
+
+	switch (flags & LK_TYPE_MASK) {
+	case LK_SHARED:
+		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+			revlock = LK_UPGRADE;
+		else
+			revlock = LK_RELEASE;
+		break;
+	case LK_EXCLUSIVE:
+	case LK_UPGRADE:
+		revlock = LK_RELEASE;
+		break;
+	case LK_DOWNGRADE:
+		revlock = LK_UPGRADE;
+		break;
+	default:
+		break;
 	}
 
-	return 0;
+	return (revlock);
+}
+
+/*
+ * The state of an acquired lock is adjusted similarly to
+ * the time of error generating. 
+ * flags: LK_RELEASE or LK_UPGRADE
+ */
+static void
+unionfs_revlock(struct vnode *vp, int flags)
+{
+	if (flags & LK_RELEASE)
+		VOP_UNLOCK(vp, flags);
+	else {
+		/* UPGRADE */
+		if (vn_lock(vp, flags) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	}
 }
 
 static int
@@ -1763,6 +1830,7 @@
 	int		error;
 	int		flags;
 	int		revlock;
+	int		interlock;
 	int		uhold;
 	struct mount   *mp;
 	struct unionfs_mount *ump;
@@ -1774,15 +1842,13 @@
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
+	interlock = 1;
 	uhold = 0;
 	flags = ap->a_flags;
 	vp = ap->a_vp;
 
 	if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK))
-		return (VOP_UNLOCK(vp, flags));
-
-	if ((revlock = unionfs_get_llt_revlock(flags)) == 0)
-		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
+		return (VOP_UNLOCK(vp, flags | LK_RELEASE));
 
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
@@ -1798,6 +1864,9 @@
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
+	if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0)
+		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
+
 	if ((mp->mnt_kern_flag & MNTK_MPSAFE) != 0 &&
 	    (vp->v_iflag & VI_OWEINACT) != 0)
 		flags |= LK_NOWAIT;
@@ -1811,6 +1880,23 @@
 		flags |= LK_CANRECURSE;
 
 	if (lvp != NULLVP) {
+		if (uvp != NULLVP && flags & LK_UPGRADE) {
+			/* Share Lock is once released and a deadlock is avoided.  */
+			VI_LOCK_FLAGS(uvp, MTX_DUPOK);
+			vholdl(uvp);
+			uhold = 1;
+			VI_UNLOCK(vp);
+			VOP_UNLOCK(uvp, LK_RELEASE | LK_INTERLOCK);
+			VI_LOCK(vp);
+			unp = VTOUNIONFS(vp);
+			if (unp == NULL) {
+				/* vnode is released. */
+				VI_UNLOCK(vp);
+				VOP_UNLOCK(lvp, LK_RELEASE);
+				vdrop(uvp);
+				return (EBUSY);
+			}
+		}
 		VI_LOCK_FLAGS(lvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(lvp);
@@ -1823,19 +1909,28 @@
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
+			/* vnode is released. */
 			VI_UNLOCK(vp);
 			if (error == 0)
-				VOP_UNLOCK(lvp, 0);
+				VOP_UNLOCK(lvp, LK_RELEASE);
 			vdrop(lvp);
+			if (uhold != 0)
+				vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
 	}
 
 	if (error == 0 && uvp != NULLVP) {
+		if (uhold && flags & LK_UPGRADE) {
+			flags &= ~LK_TYPE_MASK;
+			flags |= LK_EXCLUSIVE;
+		}
 		VI_LOCK_FLAGS(uvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
-		vholdl(uvp);
-		uhold = 1;
+		if (uhold == 0) {
+			vholdl(uvp);
+			uhold = 1;
+		}
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
@@ -1845,30 +1940,27 @@
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
+			/* vnode is released. */
 			VI_UNLOCK(vp);
-			if (error == 0) {
-				VOP_UNLOCK(uvp, 0);
-				if (lvp != NULLVP)
-					VOP_UNLOCK(lvp, 0);
+			if (error == 0)
+				VOP_UNLOCK(uvp, LK_RELEASE);
+			vdrop(uvp);
+			if (lvp != NULLVP) {
+				VOP_UNLOCK(lvp, LK_RELEASE);
+				vdrop(lvp);
 			}
-			if (lvp != NULLVP)
-				vdrop(lvp);
-			vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
-
 		if (error != 0 && lvp != NULLVP) {
+			/* rollback */
 			VI_UNLOCK(vp);
-			if ((revlock & LK_TYPE_MASK) == LK_RELEASE)
-				VOP_UNLOCK(lvp, revlock);
-			else
-				vn_lock(lvp, revlock | LK_RETRY);
-			goto unionfs_lock_abort;
+			unionfs_revlock(lvp, revlock);
+			interlock = 0;
 		}
 	}
 
-	VI_UNLOCK(vp);
-unionfs_lock_abort:
+	if (interlock)
+		VI_UNLOCK(vp);
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
@@ -2013,7 +2105,7 @@
 			unionfs_tryrem_node_status(unp, unsp);
 	}
 
-	VOP_UNLOCK(vp, 0);
+	VOP_UNLOCK(vp, LK_RELEASE);
 
 	error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags);
 
@@ -2022,7 +2114,7 @@
 	return error;
 
 unionfs_advlock_abort:
-	VOP_UNLOCK(vp, 0);
+	VOP_UNLOCK(vp, LK_RELEASE);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
@@ -2150,7 +2242,8 @@
 	error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
-		vn_lock(vp, LK_UPGRADE | LK_RETRY);
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag |= UNIONFS_OPENEXTU;
 		else
@@ -2186,7 +2279,8 @@
 	error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
-		vn_lock(vp, LK_UPGRADE | LK_RETRY);
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag &= ~UNIONFS_OPENEXTU;
 		else
@@ -2435,6 +2529,7 @@
 	.vop_getextattr =	unionfs_getextattr,
 	.vop_getwritemount =	unionfs_getwritemount,
 	.vop_inactive =		unionfs_inactive,
+	.vop_islocked =		unionfs_islocked,
 	.vop_ioctl =		unionfs_ioctl,
 	.vop_link =		unionfs_link,
 	.vop_listextattr =	unionfs_listextattr,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_machdep.c
--- a/head/sys/i386/acpica/acpi_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/acpica/acpi_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/acpica/acpi_machdep.c 235556 2012-05-17 17:58:53Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,8 +44,6 @@
 
 #include <machine/nexusvar.h>
 
-SYSCTL_DECL(_debug_acpi);
-
 uint32_t acpi_resume_beep;
 TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep);
 SYSCTL_UINT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_wakecode.S
--- a/head/sys/i386/acpica/acpi_wakecode.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/acpica/acpi_wakecode.S	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,8 @@
 /*-
  * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2001-2012 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -24,11 +26,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/acpica/acpi_wakecode.S 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #include <machine/asmacros.h>
+#include <machine/ppireg.h>
 #include <machine/specialreg.h>
+#include <machine/timerreg.h>
 
 #include "assym.s"
 
@@ -39,221 +43,166 @@
  * Depending on the previous sleep state, we may need to initialize more
  * of the system (i.e., S3 suspend-to-RAM vs. S4 suspend-to-disk).
  */
-	.align 4
+
+	.data				/* So we can modify it */
+
+	ALIGN_TEXT
 	.code16
-wakeup_16:
-	nop
-	cli
-	cld
-
+wakeup_start:
 	/*
 	 * Set up segment registers for real mode, a small stack for
 	 * any calls we make, and clear any flags.
 	 */
-	movw	%cs,%ax
-	movw	%ax,%ds
-	movw	%ax,%ss
-	movw	$PAGE_SIZE,%sp
-	pushl	$0
-	popfl
+	cli				/* make sure no interrupts */
+	mov	%cs, %ax		/* copy %cs to %ds.  Remember these */
+	mov	%ax, %ds		/* are offsets rather than selectors */
+	mov	%ax, %ss
+	movw	$PAGE_SIZE, %sp
+	xorw	%ax, %ax
+	pushw	%ax
+	popfw
 
 	/* To debug resume hangs, beep the speaker if the user requested. */
-	cmpl	$1,resume_beep
-	jne	nobeep
-	movb	$0xc0,%al
-	outb	%al,$0x42
-	movb	$0x04,%al
-	outb	%al,$0x42
-	inb	$0x61,%al
-	orb	$0x3,%al
-	outb	%al,$0x61
-nobeep:
+	testb	$~0, resume_beep - wakeup_start
+	jz	1f
+	movb	$0, resume_beep - wakeup_start
+
+	/* Set PIC timer2 to beep. */
+	movb	$(TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT), %al
+	outb	%al, $TIMER_MODE
+
+	/* Turn on speaker. */
+	inb	$IO_PPI, %al
+	orb	$PIT_SPKR, %al
+	outb	%al, $IO_PPI
+
+	/* Set frequency. */
+	movw	$0x4c0, %ax
+	outb	%al, $TIMER_CNTR2
+	shrw	$8, %ax
+	outb	%al, $TIMER_CNTR2
+1:
 
 	/* Re-initialize video BIOS if the reset_video tunable is set. */
-	cmpl	$1,reset_video
-	jne	nobiosreset
-	lcall	$0xc000,$3
+	testb	$~0, reset_video - wakeup_start
+	jz	1f
+	movb	$0, reset_video - wakeup_start
+	lcall	$0xc000, $3
 
-	/*
-	 * Set up segment registers for real mode again in case the
-	 * previous BIOS call clobbers them.
-	 */
-	movw	%cs,%ax
-	movw	%ax,%ds
-	movw	%ax,%ss
-nobiosreset:
+	/* When we reach here, int 0x10 should be ready.  Hide cursor. */
+	movb	$0x01, %ah
+	movb	$0x20, %ch
+	int	$0x10
 
-	/* Load GDT for real mode.  Use 32 bit prefix for addresses >16 MB. */
-	lgdtl	physical_gdt
-
-	/* Restore CR2, CR3 and CR4 */
-	movl	previous_cr2,%eax
-	movl	%eax,%cr2
-	movl	previous_cr3,%eax
-	movl	%eax,%cr3
-	movl	previous_cr4,%eax
-	movl	%eax,%cr4
-
-	/* Transfer some values to protected mode with an inline stack */
-#define NVALUES	9
-#define TRANSFER_STACK32(val, idx)	\
-	movl	val,%eax;		\
-	movl	%eax,wakeup_32stack+(idx+1)+(idx*4)
-
-	TRANSFER_STACK32(previous_ss,		(NVALUES - 9))
-	TRANSFER_STACK32(previous_fs,		(NVALUES - 8))
-	TRANSFER_STACK32(previous_ds,		(NVALUES - 7))
-	TRANSFER_STACK32(physical_gdt+2,	(NVALUES - 6))
-	TRANSFER_STACK32(where_to_recover,	(NVALUES - 5))
-	TRANSFER_STACK32(previous_idt+2,	(NVALUES - 4))
-	TRANSFER_STACK32(previous_ldt,		(NVALUES - 3))
-	TRANSFER_STACK32(previous_gdt+2,	(NVALUES - 2))
-	TRANSFER_STACK32(previous_tr,		(NVALUES - 1))
-	TRANSFER_STACK32(previous_cr0,		(NVALUES - 0))
-
-	mov	physical_esp,%esi	/* to be used in 32bit code */
-
-	/* Enable protected mode */
-	movl	%cr0,%eax
-	orl	$(CR0_PE),%eax
-	movl	%eax,%cr0
-
-wakeup_sw32:
-	/* Switch to protected mode by intersegmental jump */
-	ljmpl	$KCSEL,$0x12345678	/* Code location, to be replaced */
-
-	/*
-	 * Now switched to protected mode without paging enabled.
-	 *	%esi: KERNEL stack pointer (physical address)
-	 */
-	.code32
-wakeup_32:
-	nop
-
-	/* Set up segment registers for protected mode */
-	movw	$KDSEL,%ax		/* KDSEL to segment registers */
-	movw	%ax,%ds
-	movw	%ax,%es
-	movw	%ax,%gs
-	movw	%ax,%ss
-	movw	$KPSEL,%ax		/* KPSEL to %fs */
-	movw	%ax,%fs
-	movl	%esi,%esp		/* physical address stack pointer */
-
-wakeup_32stack:
-	/* Operands are overwritten in 16 bit code by TRANSFER_STACK32 macro */
-	pushl	$0xabcdef09		/* ss + dummy */
-	pushl	$0xabcdef08		/* fs + gs */
-	pushl	$0xabcdef07		/* ds + es */
-	pushl	$0xabcdef06		/* gdt:base (physical address) */
-	pushl	$0xabcdef05		/* recover address */
-	pushl	$0xabcdef04		/* idt:base */
-	pushl	$0xabcdef03		/* ldt + idt:limit */
-	pushl	$0xabcdef02		/* gdt:base */
-	pushl	$0xabcdef01		/* TR + gdt:limit */
-	pushl	$0xabcdef00		/* CR0 */
-
-	movl	%esp,%ebp
-#define CR0_REGISTER		0(%ebp)
-#define TASK_REGISTER		4(%ebp)
-#define PREVIOUS_GDT		6(%ebp)
-#define PREVIOUS_LDT		12(%ebp)
-#define PREVIOUS_IDT		14(%ebp)
-#define RECOVER_ADDR		20(%ebp)
-#define PHYSICAL_GDT_BASE	24(%ebp)
-#define PREVIOUS_DS		28(%ebp)
-#define PREVIOUS_ES		30(%ebp)
-#define PREVIOUS_FS		32(%ebp)
-#define PREVIOUS_GS		34(%ebp)
-#define PREVIOUS_SS		36(%ebp)
-
-	/* Fixup TSS type field */
-#define TSS_TYPEFIX_MASK	0xf9
-	xorl	%esi,%esi
-	movl	PHYSICAL_GDT_BASE,%ebx
-	movw	TASK_REGISTER,%si
-	leal	(%ebx,%esi),%eax	/* get TSS segment descriptor */
-	andb	$TSS_TYPEFIX_MASK,5(%eax)
-
-	/* Prepare to return to sleep/wakeup code point */
-	lgdtl	PREVIOUS_GDT
-	lidtl	PREVIOUS_IDT
-
-	/* Pack values from the GDT to be loaded into segment registers. */
-	movl	PREVIOUS_DS,%ebx
-	movl	PREVIOUS_FS,%ecx
-	movl	PREVIOUS_SS,%edx
-	movw	TASK_REGISTER,%si
-	shll	$16,%esi
-	movw	PREVIOUS_LDT,%si
-	movl	RECOVER_ADDR,%edi
-
-	/* Enable paging and etc. */
-	movl	CR0_REGISTER,%eax
-	movl	%eax,%cr0
-
-	/* Flush the prefetch queue */
-	jmp	1f
-1:	jmp	1f
+	/* Re-start in case the previous BIOS call clobbers them. */
+	jmp	wakeup_start
 1:
 
 	/*
-	 * Now we are in kernel virtual memory addressing with the following
-	 * original register values:
-	 *	%ebx: ds + es
-	 *	%ecx: fs + gs
-	 *	%edx: ss + dummy
-	 *	%esi: LDTR + TR
-	 *	%edi: recover address
-	 * We'll load these back into the segment registers now.
+	 * Find relocation base and patch the gdt descript and ljmp targets
 	 */
-	nop
+	xorl	%ebx, %ebx
+	mov	%cs, %bx
+	sall	$4, %ebx		/* %ebx is now our relocation base */
 
-	movl	%esi,%eax		/* LDTR + TR */
-	lldt	%ax			/* load LDT register */
-	shrl	$16,%eax
-	ltr	%ax			/* load task register */
+	/*
+	 * Load the descriptor table pointer.  We'll need it when running
+	 * in 16-bit protected mode.
+	 */
+	lgdtl	bootgdtdesc - wakeup_start
 
-	/* Restore segment registers */
-	movl	%ebx,%eax		/* ds + es */
-	movw	%ax,%ds
-	shrl	$16,%eax
-	movw	%ax,%es
-	movl	%ecx,%eax		/* fs + gs */
-	movw	%ax,%fs
-	shrl	$16,%eax
-	movw	%ax,%gs
-	movl	%edx,%eax		/* ss */
-	movw	%ax,%ss
+	/* Enable protected mode */
+	movl	$CR0_PE, %eax
+	mov	%eax, %cr0
 
-	/* Jump to acpi_restorecpu() */
-	jmp	*%edi
+	/*
+	 * Now execute a far jump to turn on protected mode.  This
+	 * causes the segment registers to turn into selectors and causes
+	 * %cs to be loaded from the gdt.
+	 *
+	 * The following instruction is:
+	 * ljmpl $bootcode32 - bootgdt, $wakeup_32 - wakeup_start
+	 * but gas cannot assemble that.  And besides, we patch the targets
+	 * in early startup and its a little clearer what we are patching.
+	 */
+wakeup_sw32:
+	.byte	0x66			/* size override to 32 bits */
+	.byte	0xea			/* opcode for far jump */
+	.long	wakeup_32 - wakeup_start /* offset in segment */
+	.word	bootcode32 - bootgdt	/* index in gdt for 32 bit code */
 
-/* used in real mode */
-physical_gdt:		.word 0
-			.long 0
-physical_esp:		.long 0
-previous_cr2:		.long 0
-previous_cr3:		.long 0
-previous_cr4:		.long 0
-resume_beep:		.long 0
-reset_video:		.long 0
+	/*
+	 * At this point, we are running in 32 bit legacy protected mode.
+	 */
+	ALIGN_TEXT
+	.code32
+wakeup_32:
 
-/*
- * Transfer from real mode to protected mode.  The order of these variables
- * is very important, DO NOT INSERT OR CHANGE unless you know why.
- */
-previous_cr0:		.long 0
-previous_tr:		.word 0
-previous_gdt:		.word 0
-			.long 0
-previous_ldt:		.word 0
-previous_idt:		.word 0
-			.long 0
-where_to_recover:	.long 0
-previous_ds:		.word 0
-previous_es:		.word 0
-previous_fs:		.word 0
-previous_gs:		.word 0
-previous_ss:		.word 0
-dummy:			.word 0
+	mov	$bootdata32 - bootgdt, %eax
+	mov	%ax, %ds
+
+	/* Get PCB and return address. */
+	movl	wakeup_pcb - wakeup_start(%ebx), %ecx
+	movl	wakeup_ret - wakeup_start(%ebx), %edx
+
+	/* Restore CR4 and CR3. */
+	movl	wakeup_cr4 - wakeup_start(%ebx), %eax
+	mov	%eax, %cr4
+	movl	wakeup_cr3 - wakeup_start(%ebx), %eax
+	mov	%eax, %cr3
+
+	/*
+	 * Finally, switch to long bit mode by enabling paging.  We have
+	 * to be very careful here because all the segmentation disappears
+	 * out from underneath us.  The spec says we can depend on the
+	 * subsequent pipelined branch to execute, but *only if* everthing
+	 * is still identity mapped.  If any mappings change, the pipeline
+	 * will flush.
+	 */
+	mov	%cr0, %eax
+	orl	$CR0_PG, %eax
+	mov	%eax, %cr0
+
+	jmp	1f
+1:
+	/* Jump to return address. */
+	jmp	*%edx
+
+	.data
+
+resume_beep:
+	.byte	0
+reset_video:
+	.byte	0
+
+	ALIGN_DATA
+bootgdt:
+	.long	0x00000000
+	.long	0x00000000
+
+bootcode32:
+	.long	0x0000ffff
+	.long	0x00cf9b00
+
+bootdata32:
+	.long	0x0000ffff
+	.long	0x00cf9300
+bootgdtend:
+
+bootgdtdesc:
+	.word	bootgdtend - bootgdt	/* Length */
+	.long	bootgdt - wakeup_start	/* Offset plus %ds << 4 */
+
+	ALIGN_DATA
+wakeup_cr4:
+	.long	0
+wakeup_cr3:
+	.long	0
+wakeup_pcb:
+	.long	0
+wakeup_ret:
+	.long	0
+wakeup_gdt:		/* not used */
+	.word	0
+	.long	0
+dummy:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_wakeup.c
--- a/head/sys/i386/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/acpica/acpi_wakeup.c 233250 2012-03-20 21:37:52Z jkim $");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/bus.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/memrange.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-#include <vm/vm_object.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-
-#include <machine/bus.h>
-#include <machine/cpufunc.h>
-#include <machine/intr_machdep.h>
-#include <x86/mca.h>
-#include <machine/segments.h>
-
-#include <contrib/dev/acpica/include/acpi.h>
-
-#include <dev/acpica/acpivar.h>
-
-#include "acpi_wakecode.h"
-#include "acpi_wakedata.h"
-
-/* Make sure the code is less than one page and leave room for the stack. */
-CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
-
-#ifndef _SYS_CDEFS_H_
-#error this file needs sys/cdefs.h as a prerequisite
-#endif
-
-extern uint32_t	acpi_resume_beep;
-extern uint32_t	acpi_reset_video;
-extern void	initializecpu(void);
-
-static struct region_descriptor __used	saved_idt, saved_gdt;
-static struct region_descriptor	*p_gdt;
-static uint16_t __used 	saved_ldt;
-
-static uint32_t	__used	r_eax, r_ebx, r_ecx, r_edx, r_ebp, r_esi, r_edi,
-			r_efl, r_cr0, r_cr2, r_cr3, r_cr4, ret_addr;
-
-static uint16_t	__used	r_cs, r_ds, r_es, r_fs, r_gs, r_ss, r_tr;
-static uint32_t	__used	r_esp;
-
-static void		acpi_printcpu(void);
-static void		acpi_realmodeinst(void *arg, bus_dma_segment_t *segs,
-					  int nsegs, int error);
-static void		acpi_alloc_wakeup_handler(void);
-
-/* XXX shut gcc up */
-extern int		acpi_savecpu(void);
-extern int		acpi_restorecpu(void);
-
-#ifdef __GNUCLIKE_ASM
-__asm__("				\n\
-	.text				\n\
-	.p2align 2, 0x90		\n\
-	.type acpi_restorecpu, @function\n\
-acpi_restorecpu:			\n\
-	.align 4			\n\
-	movl	r_eax,%eax		\n\
-	movl	r_ebx,%ebx		\n\
-	movl	r_ecx,%ecx		\n\
-	movl	r_edx,%edx		\n\
-	movl	r_ebp,%ebp		\n\
-	movl	r_esi,%esi		\n\
-	movl	r_edi,%edi		\n\
-	movl	r_esp,%esp		\n\
-					\n\
-	pushl	r_efl			\n\
-	popfl				\n\
-					\n\
-	movl	ret_addr,%eax		\n\
-	movl	%eax,(%esp)		\n\
-	xorl	%eax,%eax		\n\
-	ret				\n\
-					\n\
-	.text				\n\
-	.p2align 2, 0x90		\n\
-	.type acpi_savecpu, @function	\n\
-acpi_savecpu:				\n\
-	movw	%cs,r_cs		\n\
-	movw	%ds,r_ds		\n\
-	movw	%es,r_es		\n\
-	movw	%fs,r_fs		\n\
-	movw	%gs,r_gs		\n\
-	movw	%ss,r_ss		\n\
-					\n\
-	movl	%eax,r_eax		\n\
-	movl	%ebx,r_ebx		\n\
-	movl	%ecx,r_ecx		\n\
-	movl	%edx,r_edx		\n\
-	movl	%ebp,r_ebp		\n\
-	movl	%esi,r_esi		\n\
-	movl	%edi,r_edi		\n\
-					\n\
-	movl	%cr0,%eax		\n\
-	movl	%eax,r_cr0		\n\
-	movl	%cr2,%eax		\n\
-	movl	%eax,r_cr2		\n\
-	movl	%cr3,%eax		\n\
-	movl	%eax,r_cr3		\n\
-	movl	%cr4,%eax		\n\
-	movl	%eax,r_cr4		\n\
-					\n\
-	pushfl				\n\
-	popl	r_efl			\n\
-					\n\
-	movl	%esp,r_esp		\n\
-					\n\
-	sgdt	saved_gdt		\n\
-	sidt	saved_idt		\n\
-	sldt	saved_ldt		\n\
-	str	r_tr			\n\
-					\n\
-	movl	(%esp),%eax		\n\
-	movl	%eax,ret_addr		\n\
-	movl	$1,%eax			\n\
-	ret				\n\
-");
-#endif /* __GNUCLIKE_ASM */
-
-static void
-acpi_printcpu(void)
-{
-	printf("======== acpi_printcpu() debug dump ========\n");
-	printf("gdt[%04x:%08x] idt[%04x:%08x] ldt[%04x] tr[%04x] efl[%08x]\n",
-		saved_gdt.rd_limit, saved_gdt.rd_base,
-		saved_idt.rd_limit, saved_idt.rd_base,
-		saved_ldt, r_tr, r_efl);
-	printf("eax[%08x] ebx[%08x] ecx[%08x] edx[%08x]\n",
-		r_eax, r_ebx, r_ecx, r_edx);
-	printf("esi[%08x] edi[%08x] ebp[%08x] esp[%08x]\n",
-		r_esi, r_edi, r_ebp, r_esp);
-	printf("cr0[%08x] cr2[%08x] cr3[%08x] cr4[%08x]\n",
-		r_cr0, r_cr2, r_cr3, r_cr4);
-	printf("cs[%04x] ds[%04x] es[%04x] fs[%04x] gs[%04x] ss[%04x]\n",
-		r_cs, r_ds, r_es, r_fs, r_gs, r_ss);
-}
-
-#define WAKECODE_FIXUP(offset, type, val) do	{		\
-	type	*addr;						\
-	addr = (type *)(sc->acpi_wakeaddr + offset);		\
-	*addr = val;						\
-} while (0)
-
-#define WAKECODE_BCOPY(offset, type, val) do	{		\
-	void	*addr;						\
-	addr = (void *)(sc->acpi_wakeaddr + offset);		\
-	bcopy(&(val), addr, sizeof(type));			\
-} while (0)
-
-/* Turn off bits 1&2 of the PIT, stopping the beep. */
-static void
-acpi_stop_beep(void *arg)
-{
-	outb(0x61, inb(0x61) & ~0x3);
-}
-
-int
-acpi_sleep_machdep(struct acpi_softc *sc, int state)
-{
-	ACPI_STATUS		status;
-	struct pmap		*pm;
-	int			ret;
-	uint32_t		cr3;
-	u_long			ef;
-
-	ret = -1;
-	if (sc->acpi_wakeaddr == 0)
-		return (ret);
-
-	AcpiSetFirmwareWakingVector(sc->acpi_wakephys);
-
-	ef = intr_disable();
-	intr_suspend();
-
-	/*
-	 * Temporarily switch to the kernel pmap because it provides an
-	 * identity mapping (setup at boot) for the low physical memory
-	 * region containing the wakeup code.
-	 */
-	pm = kernel_pmap;
-	cr3 = rcr3();
-#ifdef PAE
-	load_cr3(vtophys(pm->pm_pdpt));
-#else
-	load_cr3(vtophys(pm->pm_pdir));
-#endif
-
-	ret_addr = 0;
-	if (acpi_savecpu()) {
-		/* Execute Sleep */
-
-		p_gdt = (struct region_descriptor *)
-				(sc->acpi_wakeaddr + physical_gdt);
-		p_gdt->rd_limit = saved_gdt.rd_limit;
-		p_gdt->rd_base = vtophys(saved_gdt.rd_base);
-
-		WAKECODE_FIXUP(physical_esp, uint32_t, vtophys(r_esp));
-		WAKECODE_FIXUP(previous_cr0, uint32_t, r_cr0);
-		WAKECODE_FIXUP(previous_cr2, uint32_t, r_cr2);
-		WAKECODE_FIXUP(previous_cr3, uint32_t, r_cr3);
-		WAKECODE_FIXUP(previous_cr4, uint32_t, r_cr4);
-
-		WAKECODE_FIXUP(resume_beep, uint32_t, acpi_resume_beep);
-		WAKECODE_FIXUP(reset_video, uint32_t, acpi_reset_video);
-
-		WAKECODE_FIXUP(previous_tr,  uint16_t, r_tr);
-		WAKECODE_BCOPY(previous_gdt, struct region_descriptor, saved_gdt);
-		WAKECODE_FIXUP(previous_ldt, uint16_t, saved_ldt);
-		WAKECODE_BCOPY(previous_idt, struct region_descriptor, saved_idt);
-
-		WAKECODE_FIXUP(where_to_recover, void *, acpi_restorecpu);
-
-		WAKECODE_FIXUP(previous_ds,  uint16_t, r_ds);
-		WAKECODE_FIXUP(previous_es,  uint16_t, r_es);
-		WAKECODE_FIXUP(previous_fs,  uint16_t, r_fs);
-		WAKECODE_FIXUP(previous_gs,  uint16_t, r_gs);
-		WAKECODE_FIXUP(previous_ss,  uint16_t, r_ss);
-
-		if (bootverbose)
-			acpi_printcpu();
-
-		/* Call ACPICA to enter the desired sleep state */
-		if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
-			status = AcpiEnterSleepStateS4bios();
-		else
-			status = AcpiEnterSleepState(state, acpi_sleep_flags);
-
-		if (status != AE_OK) {
-			device_printf(sc->acpi_dev,
-				"AcpiEnterSleepState failed - %s\n",
-				AcpiFormatException(status));
-			goto out;
-		}
-
-		for (;;)
-			ia32_pause();
-	} else {
-		pmap_init_pat();
-		PCPU_SET(switchtime, 0);
-		PCPU_SET(switchticks, ticks);
-		if (bootverbose) {
-			acpi_savecpu();
-			acpi_printcpu();
-		}
-		ret = 0;
-	}
-
-out:
-	load_cr3(cr3);
-	mca_resume();
-	intr_resume();
-	intr_restore(ef);
-
-	if (ret == 0 && mem_range_softc.mr_op != NULL &&
-	    mem_range_softc.mr_op->reinit != NULL)
-		mem_range_softc.mr_op->reinit(&mem_range_softc);
-
-	/* If we beeped, turn it off after a delay. */
-	if (acpi_resume_beep)
-		timeout(acpi_stop_beep, NULL, 3 * hz);
-
-	return (ret);
-}
-
-static bus_dma_tag_t	acpi_waketag;
-static bus_dmamap_t	acpi_wakemap;
-static vm_offset_t	acpi_wakeaddr;
-
-static void
-acpi_alloc_wakeup_handler(void)
-{
-	void *wakeaddr;
-
-	if (!cold)
-		return;
-
-	/*
-	 * Specify the region for our wakeup code.  We want it in the low 1 MB
-	 * region, excluding video memory and above (0xa0000).  We ask for
-	 * it to be page-aligned, just to be safe.
-	 */
-	if (bus_dma_tag_create(/*parent*/ NULL,
-	    /*alignment*/ PAGE_SIZE, /*no boundary*/ 0,
-	    /*lowaddr*/ 0x9ffff, /*highaddr*/ BUS_SPACE_MAXADDR, NULL, NULL,
-	    /*maxsize*/ PAGE_SIZE, /*segments*/ 1, /*maxsegsize*/ PAGE_SIZE,
-	    0, busdma_lock_mutex, &Giant, &acpi_waketag) != 0) {
-		printf("acpi_alloc_wakeup_handler: can't create wake tag\n");
-		return;
-	}
-	if (bus_dmamem_alloc(acpi_waketag, &wakeaddr, BUS_DMA_NOWAIT,
-	    &acpi_wakemap) != 0) {
-		printf("acpi_alloc_wakeup_handler: can't alloc wake memory\n");
-		return;
-	}
-	acpi_wakeaddr = (vm_offset_t)wakeaddr;
-}
-
-SYSINIT(acpiwakeup, SI_SUB_KMEM, SI_ORDER_ANY, acpi_alloc_wakeup_handler, 0);
-
-static void
-acpi_realmodeinst(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
-{
-	struct acpi_softc *sc;
-	uint32_t *addr;
-
-	/* Overwrite the ljmp target with the real address */
-	sc = arg;
-	sc->acpi_wakephys = segs[0].ds_addr;
-	addr = (uint32_t *)&wakecode[wakeup_sw32 + 2];
-	*addr = sc->acpi_wakephys + wakeup_32;
-
-	/* Copy the wake code into our low page and save its physical addr. */
-	bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode));
-	if (bootverbose) {
-		device_printf(sc->acpi_dev, "wakeup code va %#x pa %#jx\n",
-		    acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys);
-	}
-}
-
-void
-acpi_install_wakeup_handler(struct acpi_softc *sc)
-{
-	if (acpi_wakeaddr == 0)
-		return;
-
-	sc->acpi_waketag = acpi_waketag;
-	sc->acpi_wakeaddr = acpi_wakeaddr;
-	sc->acpi_wakemap = acpi_wakemap;
-
-	bus_dmamap_load(sc->acpi_waketag, sc->acpi_wakemap,
-	    (void *)sc->acpi_wakeaddr, PAGE_SIZE, acpi_realmodeinst, sc, 0);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/conf/GENERIC
--- a/head/sys/i386/conf/GENERIC	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/conf/GENERIC	Wed Jul 25 16:40:53 2012 +0300
@@ -16,7 +16,7 @@
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
-# $FreeBSD: head/sys/i386/conf/GENERIC 234504 2012-04-20 21:37:42Z brooks $
+# $FreeBSD: head/sys/i386/conf/GENERIC 237263 2012-06-19 07:34:13Z np $
 
 cpu		I486_CPU
 cpu		I586_CPU
@@ -30,6 +30,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
@@ -46,6 +47,7 @@
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_PART_GPT		# GUID Partition Tables.
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
 options 	COMPAT_FREEBSD5		# Compatible with FreeBSD5
@@ -66,6 +68,7 @@
 options 	CAPABILITIES		# Capsicum capabilities
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
+options 	DDB_CTF			# Kernel ELF linker loads CTF data
 options 	INCLUDE_CONFIG_FILE     # Include this file in kernel
 
 # Debugging support.  Always need this:
@@ -75,7 +78,6 @@
 # For full debugger support use this instead:
 options 	DDB			# Support DDB.
 options 	GDB			# Support remote GDB.
-options 	DDB_CTF			# kernel ELF linker loads CTF data
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
@@ -284,6 +286,8 @@
 device		ath_pci		# Atheros pci/cardbus glue
 device		ath_hal		# pci/cardbus chip support
 options 	AH_SUPPORT_AR5416	# enable AR5416 tx/rx descriptors
+options 	AH_AR5416_INTERRUPT_MITIGATION	# AR5416 interrupt mitigation
+options 	ATH_ENABLE_11N	# Enable 802.11n support for AR5416 and later
 device		ath_rate_sample	# SampleRate tx rate control for ath
 #device		bwi		# Broadcom BCM430x/BCM431x wireless NICs.
 #device		bwn		# Broadcom BCM43xx wireless NICs.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/conf/XEN
--- a/head/sys/i386/conf/XEN	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/conf/XEN	Wed Jul 25 16:40:53 2012 +0300
@@ -1,13 +1,13 @@
 #
 # XEN -- Kernel configuration for i386 XEN DomU
 #
-# $FreeBSD: head/sys/i386/conf/XEN 233271 2012-03-21 08:38:42Z ed $
+# $FreeBSD: head/sys/i386/conf/XEN 237263 2012-06-19 07:34:13Z np $
 
 cpu		I686_CPU
 ident		XEN
 
 makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
-makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm hptmv ida malo mps mwl nve sound sym trm xfs"
+makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve rdma sound sym trm xfs"
 
 options 	SCHED_ULE		# ULE scheduler
 options 	PREEMPTION		# Enable kernel thread preemption
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/apic_vector.s
--- a/head/sys/i386/i386/apic_vector.s	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/apic_vector.s	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/i386/apic_vector.s 235683 2012-05-20 08:17:20Z iwasaki $
  */
 
 /*
@@ -334,6 +334,26 @@
 	iret
 
 /*
+ * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
+ */
+#ifndef XEN
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(cpususpend)
+	PUSH_FRAME
+	SET_KERNEL_SREGS
+	cld
+
+	movl	lapic, %eax
+	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
+
+	call	cpususpend_handler
+
+	POP_FRAME
+	jmp	doreti_iret
+#endif
+
+/*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/bios.c
--- a/head/sys/i386/i386/bios.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/bios.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/bios.c 236213 2012-05-29 01:48:06Z kevlo $");
 
 /*
  * Code for dealing with the BIOS in x86 PC systems.
@@ -372,9 +372,11 @@
 	    break;
 
 	default:
+	    va_end(ap);
 	    return (EINVAL);
 	}
     }
+    va_end(ap);
 
     if (flags & BIOSARGS_FLAG) {
 	if (arg_end - arg_start > ctob(16))
@@ -448,9 +450,11 @@
 	    break;
 
 	default:
+	    va_end(ap);
 	    return (EINVAL);
 	}
     }
+    va_end(ap);
 
     set_bios_selectors(&args->seg, flags);
     bioscall_vector.vec16.offset = (u_short)args->entry;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/elf_machdep.c
--- a/head/sys/i386/i386/elf_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/elf_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/elf_machdep.c 237435 2012-06-22 07:16:29Z kib $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -74,12 +74,15 @@
 	.sv_setregs	= exec_setregs,
 	.sv_fixlimit	= NULL,
 	.sv_maxssiz	= NULL,
-	.sv_flags	= SV_ABI_FREEBSD | SV_IA32 | SV_ILP32,
+	.sv_flags	= SV_ABI_FREEBSD | SV_IA32 | SV_ILP32 | SV_SHP,
 	.sv_set_syscall_retval = cpu_set_syscall_retval,
 	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
 	.sv_syscallnames = syscallnames,
+	.sv_shared_page_base = SHAREDPAGE,
+	.sv_shared_page_len = PAGE_SIZE,
 	.sv_schedtail	= NULL,
 };
+INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec);
 
 static Elf32_Brandinfo freebsd_brand_info = {
 	.brand		= ELFOSABI_FREEBSD,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/genassym.c
--- a/head/sys/i386/i386/genassym.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/genassym.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/genassym.c 224187 2011-07-18 15:19:40Z attilio $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/genassym.c 235622 2012-05-18 18:55:58Z iwasaki $");
 
 #include "opt_apic.h"
 #include "opt_compat.h"
@@ -121,7 +121,10 @@
 ASSYM(KERNBASE, KERNBASE);
 ASSYM(KERNLOAD, KERNLOAD);
 ASSYM(MCLBYTES, MCLBYTES);
+ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
+ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
+ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi));
 ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi));
 ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp));
@@ -130,7 +133,11 @@
 ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip));
 ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0));
 
+ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds));
+ASSYM(PCB_ES, offsetof(struct pcb, pcb_es));
+ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs));
 ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs));
+ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
@@ -143,6 +150,7 @@
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 
 ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd));
+ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd));
 ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
@@ -152,6 +160,11 @@
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_VM86CALL, PCB_VM86CALL);
 
+ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
+ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
+ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
+ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
+
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/initcpu.c
--- a/head/sys/i386/i386/initcpu.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/initcpu.c	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/initcpu.c 230767 2012-01-30 07:56:00Z kib $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/initcpu.c 235622 2012-05-18 18:55:58Z iwasaki $");
 
 #include "opt_cpu.h"
 
@@ -48,7 +48,6 @@
 #define CPU_ENABLE_SSE
 #endif
 
-void initializecpu(void);
 #if defined(I586_CPU) && defined(CPU_WT_ALLOC)
 void	enable_K5_wt_alloc(void);
 void	enable_K6_wt_alloc(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/machdep.c
--- a/head/sys/i386/i386/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 234105 2012-04-10 16:08:46Z marius $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 238310 2012-07-09 20:42:08Z jhb $");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
@@ -75,6 +75,7 @@
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
@@ -180,7 +181,6 @@
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
-extern void initializecpu(void);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
@@ -248,6 +248,8 @@
 
 struct mtx icu_lock;
 
+struct mem_range_softc mem_range_softc;
+
 static void
 cpu_startup(dummy)
 	void *dummy;
@@ -337,12 +339,10 @@
 	cpu_setregs();
 #endif
 
-#ifdef SMP
 	/*
 	 * Add BSP as an interrupt target.
 	 */
 	intr_add_cpu(0);
-#endif
 }
 
 /*
@@ -472,7 +472,13 @@
 	}
 
 	regs->tf_esp = (int)fp;
-	regs->tf_eip = PS_STRINGS - szosigcode;
+	if (p->p_sysent->sv_sigcode_base != 0) {
+		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
+		    szosigcode;
+	} else {
+		/* a.out sysentvec does not use shared page */
+		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
+	}
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -599,7 +605,8 @@
 	}
 
 	regs->tf_esp = (int)sfp;
-	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
+	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
+	    szfreebsd4_sigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -750,7 +757,7 @@
 	}
 
 	regs->tf_esp = (int)sfp;
-	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
+	regs->tf_eip = p->p_sysent->sv_sigcode_base;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -2178,7 +2185,7 @@
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 #ifndef XEN
-	int hasbrokenint12, i;
+	int hasbrokenint12, i, res;
 	u_int extmem;
 	struct vm86frame vmf;
 	struct vm86context vmc;
@@ -2263,7 +2270,8 @@
 	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 	vmc.npages = 0;
 	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
-	vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
+	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
+	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
 
 	vmf.vmf_ebx = 0;
 	do {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/mem.c
--- a/head/sys/i386/i386/mem.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/mem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/mem.c 238310 2012-07-09 20:42:08Z jhb $");
 
 /*
  * Memory special file
@@ -72,8 +72,6 @@
  */
 MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors");
 
-struct mem_range_softc mem_range_softc;
-
 static struct sx memsxlock;
 SX_SYSINIT(memsxlockinit, &memsxlock, "/dev/mem lock");
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/minidump_machdep.c
--- a/head/sys/i386/i386/minidump_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/minidump_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/minidump_machdep.c 221173 2011-04-28 16:02:05Z attilio $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/minidump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_watchdog.h"
 
@@ -36,9 +36,7 @@
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/msgbuf.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <machine/atomic.h>
@@ -143,9 +141,9 @@
 			printf(" %lld", PG2MB(progress >> PAGE_SHIFT));
 			counter &= (1<<24) - 1;
 		}
-#ifdef SW_WATCHDOG
+
 		wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		if (ptr) {
 			error = dump_write(di, ptr, 0, dumplo, len);
 			if (error)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/mp_machdep.c
--- a/head/sys/i386/i386/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 234208 2012-04-13 07:18:19Z avg $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 236938 2012-06-12 00:14:54Z iwasaki $");
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
@@ -146,6 +146,7 @@
 static void *dpcpu;
 
 struct pcb stoppcbs[MAXCPU];
+struct pcb **susppcbs = NULL;
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1;
@@ -587,6 +588,9 @@
 	setidt(IPI_STOP, IDTVEC(cpustop),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
+	/* Install an inter-CPU IPI for CPU suspend/resume */
+	setidt(IPI_SUSPEND, IDTVEC(cpususpend),
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
@@ -1077,6 +1081,60 @@
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
+	ipi_startup(apic_id, vector);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return 1;	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return 0;		/* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
 	/*
 	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
@@ -1127,52 +1185,8 @@
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (mp_naps > cpus)
-			return 1;	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return 0;		/* return FAILURE */
 }
 
-#ifdef COUNT_XINVLTLB_HITS
-u_int xhits_gbl[MAXCPU];
-u_int xhits_pg[MAXCPU];
-u_int xhits_rng[MAXCPU];
-static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
-    sizeof(xhits_gbl), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
-    sizeof(xhits_pg), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
-    sizeof(xhits_rng), "IU", "");
-
-u_int ipi_global;
-u_int ipi_page;
-u_int ipi_range;
-u_int ipi_range_size;
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
-    0, "");
-
-u_int ipi_masked_global;
-u_int ipi_masked_page;
-u_int ipi_masked_range;
-u_int ipi_masked_range_size;
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
-    &ipi_masked_global, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
-    &ipi_masked_page, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
-    &ipi_masked_range, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
-    &ipi_masked_range_size, 0, "");
-#endif /* COUNT_XINVLTLB_HITS */
-
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  */
@@ -1498,6 +1512,39 @@
 }
 
 /*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+	u_int cpu;
+
+	cpu = PCPU_GET(cpuid);
+
+	if (savectx(susppcbs[cpu])) {
+		wbinvd();
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
+	} else {
+		pmap_init_pat();
+		PCPU_SET(switchtime, 0);
+		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we are resumed */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	}
+
+	/* Wait for resume */
+	while (!CPU_ISSET(cpu, &started_cpus))
+		ia32_pause();
+
+	CPU_CLR_ATOMIC(cpu, &started_cpus);
+
+	/* Resume MCA and local APIC */
+	mca_resume();
+	lapic_setup(0);
+}
+/*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
  */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/pmap.c
--- a/head/sys/i386/i386/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -75,7 +75,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 233433 2012-03-24 19:43:49Z alc $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 237623 2012-06-27 03:45:25Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -118,6 +118,7 @@
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
@@ -231,8 +232,20 @@
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 /*
+ * Isolate the global pv list lock from data and other locks to prevent false
+ * sharing within the cache.
+ */
+static struct {
+	struct rwlock	lock;
+	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
+} pvh_global __aligned(CACHE_LINE_SIZE);
+
+#define	pvh_global_lock	pvh_global.lock
+
+/*
  * Data for the pv entry allocation mechanism
  */
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static struct md_page *pv_table;
 static int shpgperproc = PMAP_SHPGPERPROC;
@@ -283,8 +296,9 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
+static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
@@ -391,6 +405,12 @@
 	kernel_pmap->pm_root = NULL;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+
+ 	/*
+	 * Initialize the global pv list lock.
+	 */
+	rw_init(&pvh_global_lock, "pmap pv global");
+
 	LIST_INIT(&allpmaps);
 
 	/*
@@ -1275,7 +1295,7 @@
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
- * If the given pmap is not the current pmap, vm_page_queue_mtx
+ * If the given pmap is not the current pmap, pvh_global_lock
  * must be held and curthread pinned to a CPU.
  */
 static pt_entry_t *
@@ -1291,7 +1311,7 @@
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
-		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+		rw_assert(&pvh_global_lock, RA_WLOCKED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
@@ -1840,9 +1860,9 @@
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			rw_wunlock(&pvh_global_lock);
 			VM_WAIT;
-			vm_page_lock_queues();
+			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
@@ -2143,6 +2163,7 @@
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
@@ -2156,7 +2177,7 @@
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
-static uint32_t pc_freemask[11] = {
+static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
@@ -2187,83 +2208,155 @@
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+pmap_pv_reclaim(pmap_t locked_pmap)
 {
+	struct pch newtail;
+	struct pv_chunk *pc;
+	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
+	vm_page_t free, m, m_pc;
+	uint32_t inuse;
+	int bit, field, freed;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&newtail);
 	sched_pin();
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
-			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
+	    free == NULL)) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
-			pmap->pm_stats.resident_count--;
-			pde = pmap_pde(pmap, va);
-			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
-			    " a 4mpage in page %p's pv list", m));
-			pte = pmap_pte_quick(pmap, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
+			}
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list) &&
-		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfl(inuse);
+				pv = &pc->pc_pventry[field * 32 + bit];
+				va = pv->pv_va;
+				pde = pmap_pde(pmap, va);
+				if ((*pde & PG_PS) != 0)
+					continue;
+				pte = pmap_pte_quick(pmap, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list) &&
+				    (m->flags & PG_FICTITIOUS) == 0) {
+					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+					if (TAILQ_EMPTY(&pvh->pv_list)) {
+						vm_page_aflag_clear(m,
+						    PGA_WRITEABLE);
+					}
+				}
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, &free);
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+			continue;
+		}
+		/* Every freed mapping is for a 4 KB page. */
+		pmap->pm_stats.resident_count -= freed;
+		PV_STAT(pv_entry_frees += freed);
+		PV_STAT(pv_entry_spare += freed);
+		pv_entry_count -= freed;
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		for (field = 0; field < _NPCM; field++)
+			if (pc->pc_map[field] != pc_freemask[field]) {
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+
+				/*
+				 * One freed pv entry in locked_pmap is
+				 * sufficient.
+				 */
+				if (pmap == locked_pmap)
+					goto out;
+				break;
+			}
+		if (field == _NPCM) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+			pmap_qremove((vm_offset_t)pc, 1);
+			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			break;
+		}
 	}
+out:
 	sched_unpin();
+	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
-
 /*
  * free the pv_entry back to the free list
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
@@ -2273,13 +2366,30 @@
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
-	/* move to head of list */
-	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx]) {
-			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+			/*
+			 * 98% of the time, pc is already at the head of the
+			 * list.  If it isn't already, move it to the head.
+			 */
+			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
+			    pc)) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
 			return;
 		}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+ 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
@@ -2296,18 +2406,17 @@
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
@@ -2315,7 +2424,6 @@
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
-	pq = NULL;
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2341,33 +2449,20 @@
 		}
 	}
 	/*
-	 * Access to the ptelist "pv_vafree" is synchronized by the page
-	 * queues lock.  If "pv_vafree" is currently non-empty, it will
+	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
+	 * global lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
-	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, (pq ==
-	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
+	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to
-		 * inactive pages.  After that, if a pv chunk entry
-		 * is still needed, destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = pmap_pv_reclaim(pmap);
+		if (m == NULL)
+			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2377,6 +2472,7 @@
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
@@ -2388,7 +2484,7 @@
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2406,7 +2502,7 @@
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
 
@@ -2439,7 +2535,7 @@
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
 
@@ -2480,7 +2576,7 @@
 {
 	struct md_page *pvh;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	pmap_pvh_free(&m->md, pmap, va);
 	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2498,8 +2594,8 @@
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	pv = get_pv_entry(pmap, FALSE);
 	pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
@@ -2513,8 +2609,8 @@
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
@@ -2533,7 +2629,7 @@
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
@@ -2611,7 +2707,7 @@
 	 */
 	if (va >= KERNBASE)
 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
-	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
+	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
 		if ((*PMAP1 & PG_FRAME) != mptepa) {
 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
@@ -2770,7 +2866,7 @@
 	pt_entry_t oldpte;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	if (oldpte & PG_W)
@@ -2801,7 +2897,7 @@
 {
 	pt_entry_t *pte;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
@@ -2833,7 +2929,7 @@
 
 	anyvalid = 0;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	PMAP_LOCK(pmap);
 
@@ -2922,7 +3018,7 @@
 	sched_unpin();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -2954,7 +3050,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	free = NULL;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -2995,7 +3091,7 @@
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_free_zero_pages(free);
 }
 
@@ -3050,7 +3146,7 @@
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
-	int anychanged;
+	boolean_t anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
@@ -3066,10 +3162,16 @@
 		return;
 #endif
 
-	anychanged = 0;
-
-	vm_page_lock_queues();
-	sched_pin();
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	anychanged = FALSE;
+
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pt_entry_t obits, pbits;
@@ -3104,12 +3206,27 @@
 				 */
 				if (pmap_protect_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, prot))
-					anychanged = 1;
+					anychanged = TRUE;
 				continue;
-			} else if (!pmap_demote_pde(pmap,
-			    &pmap->pm_pdir[pdirindex], sva)) {
-				/* The large page mapping was destroyed. */
-				continue;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = TRUE;
+					if (!rw_try_wlock(&pvh_global_lock)) {
+						if (anychanged)
+							pmap_invalidate_all(
+							    pmap);
+						PMAP_UNLOCK(pmap);
+						goto resume;
+					}
+				}
+				if (!pmap_demote_pde(pmap,
+				    &pmap->pm_pdir[pdirindex], sva)) {
+					/*
+					 * The large page mapping was
+					 * destroyed.
+					 */
+					continue;
+				}
 			}
 		}
 
@@ -3155,14 +3272,16 @@
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
-					anychanged = 1;
+					anychanged = TRUE;
 			}
 		}
 	}
-	sched_unpin();
 	if (anychanged)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3332,7 +3451,7 @@
 
 	mpte = NULL;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 
@@ -3502,7 +3621,7 @@
 		pmap_promote_pde(pmap, pde, va);
 
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3517,7 +3636,7 @@
 {
 	pd_entry_t *pde, newpde;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pde = pmap_pde(pmap, va);
 	if (*pde != 0) {
@@ -3586,7 +3705,7 @@
 	psize = atop(end - start);
 	mpte = NULL;
 	m = m_start;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		va = start + ptoa(diff);
@@ -3600,7 +3719,7 @@
 			    mpte);
 		m = TAILQ_NEXT(m, listq);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3617,10 +3736,10 @@
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3635,7 +3754,7 @@
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
@@ -3841,9 +3960,9 @@
 		if (!wired != ((*pde & PG_W) == 0)) {
 			if (!are_queues_locked) {
 				are_queues_locked = TRUE;
-				if (!mtx_trylock(&vm_page_queue_mtx)) {
+				if (!rw_try_wlock(&pvh_global_lock)) {
 					PMAP_UNLOCK(pmap);
-					vm_page_lock_queues();
+					rw_wlock(&pvh_global_lock);
 					goto retry;
 				}
 			}
@@ -3867,7 +3986,7 @@
 	pmap_pte_release(pte);
 out:
 	if (are_queues_locked)
-		vm_page_unlock_queues();
+		rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3896,7 +4015,7 @@
 	if (!pmap_is_current(src_pmap))
 		return;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
@@ -3986,7 +4105,7 @@
 	}
 out:
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
@@ -4128,7 +4247,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
@@ -4150,7 +4269,7 @@
 				break;
 		}
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4168,13 +4287,13 @@
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	count = pmap_pvh_wired_mappings(&m->md, count);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 	        count);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -4190,7 +4309,7 @@
 	pt_entry_t *pte;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4215,11 +4334,11 @@
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4249,13 +4368,13 @@
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
@@ -4347,20 +4466,13 @@
 			}
 		}
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
-			pmap_qremove((vm_offset_t)pc, 1);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
-			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			free_pv_chunk(pc);
 		}
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -4388,11 +4500,11 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_modified_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4409,7 +4521,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
@@ -4462,11 +4574,11 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_referenced_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4482,7 +4594,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
@@ -4523,7 +4635,7 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4564,7 +4676,7 @@
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4593,7 +4705,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4652,7 +4764,7 @@
 	}
 out:
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
@@ -4682,7 +4794,7 @@
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4743,7 +4855,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4763,7 +4875,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4810,7 +4922,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/ptrace_machdep.c
--- a/head/sys/i386/i386/ptrace_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/ptrace_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/ptrace_machdep.c 238675 2012-07-21 21:39:02Z kib $");
 
 #include "opt_cpu.h"
 
@@ -54,10 +54,12 @@
 	fpstate = &td->td_pcb->pcb_user_save.sv_xmm;
 	switch (req) {
 	case PT_GETXMMREGS:
+		npxgetregs(td);
 		error = copyout(fpstate, addr, sizeof(*fpstate));
 		break;
 
 	case PT_SETXMMREGS:
+		npxgetregs(td);
 		error = copyin(addr, fpstate, sizeof(*fpstate));
 		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 		break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/swtch.s
--- a/head/sys/i386/i386/swtch.s	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/swtch.s	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/i386/swtch.s 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #include "opt_npx.h"
@@ -386,6 +386,36 @@
 	pushfl
 	popl	PCB_PSL(%ecx)
 
+	movl	%cr0,%eax
+	movl	%eax,PCB_CR0(%ecx)
+	movl	%cr2,%eax
+	movl	%eax,PCB_CR2(%ecx)
+	movl	%cr4,%eax
+	movl	%eax,PCB_CR4(%ecx)
+
+	movl	%dr0,%eax
+	movl	%eax,PCB_DR0(%ecx)
+	movl	%dr1,%eax
+	movl	%eax,PCB_DR1(%ecx)
+	movl	%dr2,%eax
+	movl	%eax,PCB_DR2(%ecx)
+	movl	%dr3,%eax
+	movl	%eax,PCB_DR3(%ecx)
+	movl	%dr6,%eax
+	movl	%eax,PCB_DR6(%ecx)
+	movl	%dr7,%eax
+	movl	%eax,PCB_DR7(%ecx)
+
+	mov	%ds,PCB_DS(%ecx)
+	mov	%es,PCB_ES(%ecx)
+	mov	%fs,PCB_FS(%ecx)
+	mov	%ss,PCB_SS(%ecx)
+	
+	sgdt	PCB_GDT(%ecx)
+	sidt	PCB_IDT(%ecx)
+	sldt	PCB_LDT(%ecx)
+	str	PCB_TR(%ecx)
+
 #ifdef DEV_NPX
 	/*
 	 * If fpcurthread == NULL, then the npx h/w state is irrelevant and the
@@ -425,5 +455,84 @@
 	popfl
 #endif	/* DEV_NPX */
 
+	movl	$1,%eax
 	ret
 END(savectx)
+
+/*
+ * resumectx(pcb) __fastcall
+ * Resuming processor state from pcb.
+ */
+ENTRY(resumectx)
+	/* Restore GDT. */
+	lgdt	PCB_GDT(%ecx)
+
+	/* Restore segment registers */
+	movzwl	PCB_DS(%ecx),%eax
+	mov	%ax,%ds
+	movzwl	PCB_ES(%ecx),%eax
+	mov	%ax,%es
+	movzwl	PCB_FS(%ecx),%eax
+	mov	%ax,%fs
+	movzwl	PCB_GS(%ecx),%eax
+	movw	%ax,%gs
+	movzwl	PCB_SS(%ecx),%eax
+	mov	%ax,%ss
+
+	/* Restore CR2, CR4, CR3 and CR0 */
+	movl	PCB_CR2(%ecx),%eax
+	movl	%eax,%cr2
+	movl	PCB_CR4(%ecx),%eax
+	movl	%eax,%cr4
+	movl	PCB_CR3(%ecx),%eax
+	movl	%eax,%cr3
+	movl	PCB_CR0(%ecx),%eax
+	movl	%eax,%cr0
+	jmp	1f
+1:
+
+	/* Restore descriptor tables */
+	lidt	PCB_IDT(%ecx)
+	lldt	PCB_LDT(%ecx)
+
+#define SDT_SYS386TSS	9
+#define SDT_SYS386BSY	11
+	/* Clear "task busy" bit and reload TR */
+	movl	PCPU(TSS_GDT),%eax
+	andb	$(~SDT_SYS386BSY | SDT_SYS386TSS),5(%eax)
+	movzwl	PCB_TR(%ecx),%eax
+	ltr	%ax
+#undef SDT_SYS386TSS
+#undef SDT_SYS386BSY
+
+	/* Restore debug registers */
+	movl	PCB_DR0(%ecx),%eax
+	movl	%eax,%dr0
+	movl	PCB_DR1(%ecx),%eax
+	movl	%eax,%dr1
+	movl	PCB_DR2(%ecx),%eax
+	movl	%eax,%dr2
+	movl	PCB_DR3(%ecx),%eax
+	movl	%eax,%dr3
+	movl	PCB_DR6(%ecx),%eax
+	movl	%eax,%dr6
+	movl	PCB_DR7(%ecx),%eax
+	movl	%eax,%dr7
+
+#ifdef DEV_NPX
+	/* XXX FIX ME */
+#endif
+
+	/* Restore other registers */
+	movl	PCB_EDI(%ecx),%edi
+	movl	PCB_ESI(%ecx),%esi
+	movl	PCB_EBP(%ecx),%ebp
+	movl	PCB_ESP(%ecx),%esp
+	movl	PCB_EBX(%ecx),%ebx
+
+	/* reload code selector by turning return into intersegmental return */
+	pushl	PCB_EIP(%ecx)
+	movl	$KCSEL,4(%esp)
+	xorl	%eax,%eax
+	lret
+END(resumectx)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/trap.c
--- a/head/sys/i386/i386/trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/trap.c 233781 2012-04-02 15:07:22Z jhb $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/trap.c 238678 2012-07-21 21:52:48Z kib $");
 
 /*
  * 386 Trap and System call handling
@@ -369,7 +369,7 @@
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 #ifdef DEV_NPX
-			ucode = npxtrap();
+			ucode = npxtrap_x87();
 			if (ucode == -1)
 				goto userout;
 #else
@@ -532,7 +532,13 @@
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
-			ucode = 0; /* XXX */
+#if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
+			ucode = npxtrap_sse();
+			if (ucode == -1)
+				goto userout;
+#else
+			ucode = 0;
+#endif
 			i = SIGFPE;
 			break;
 		}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/vm86.c
--- a/head/sys/i386/i386/vm86.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/vm86.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/vm86.c 234350 2012-04-16 19:31:44Z jkim $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/vm86.c 237924 2012-07-01 12:59:00Z brueffer $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -650,7 +650,6 @@
 			return (1);
 		}
 	return (0);
-	panic("vm86_getptr: address not found");
 }
 	
 int
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/apicvar.h
--- a/head/sys/i386/include/apicvar.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/apicvar.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/apicvar.h 232230 2012-02-27 17:30:21Z jhb $
+ * $FreeBSD: head/sys/i386/include/apicvar.h 235622 2012-05-18 18:55:58Z iwasaki $
  */
 
 #ifndef _MACHINE_APICVAR_H_
@@ -126,7 +126,8 @@
 #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
 
 #define	IPI_STOP	(APIC_IPI_INTS + 7)	/* Stop CPU until restarted. */
-#define	IPI_STOP_HARD	(APIC_IPI_INTS + 8)	/* Stop CPU with a NMI. */
+#define	IPI_SUSPEND	(APIC_IPI_INTS + 8)	/* Suspend CPU until restarted. */
+#define	IPI_STOP_HARD	(APIC_IPI_INTS + 9)	/* Stop CPU with a NMI. */
 
 /*
  * The spurious interrupt can share the priority class with the IPIs since
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/atomic.h
--- a/head/sys/i386/include/atomic.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/atomic.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/atomic.h 220404 2011-04-06 23:59:59Z jkim $
+ * $FreeBSD: head/sys/i386/include/atomic.h 236456 2012-06-02 18:10:16Z kib $
  */
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
@@ -32,9 +32,9 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
-#define	mb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
-#define	wmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
-#define	rmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
+#define	mb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
+#define	wmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
+#define	rmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
 
 /*
  * Various simple operations on memory, each of which is atomic in the
@@ -79,8 +79,9 @@
 int	atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)			\
-u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p);	\
+#define	ATOMIC_LOAD(TYPE, LOP)					\
+u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
+#define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 #else /* !KLD_MODULE && __GNUCLIKE_ASM */
@@ -280,16 +281,29 @@
 	return (v);
 }
 
+/*
+ * We assume that a = b will do atomic loads and stores.  Due to the
+ * IA32 memory model, a simple store guarantees release semantics.
+ *
+ * However, loads may pass stores, so for atomic_load_acq we have to
+ * ensure a Store/Load barrier to do the load in SMP kernels.  We use
+ * "lock cmpxchg" as recommended by the AMD Software Optimization
+ * Guide, and not mfence.  For UP kernels, however, the cache of the
+ * single processor is always consistent, so we only need to take care
+ * of the compiler.
+ */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{							\
+	__asm __volatile("" : : : "memory");		\
+	*p = v;						\
+}							\
+struct __hack
+
 #if defined(_KERNEL) && !defined(SMP)
 
-/*
- * We assume that a = b will do atomic loads and stores.  However, on a
- * PentiumPro or higher, reads may pass writes, so for that case we have
- * to use a serializing instruction (i.e. with LOCK) to do the load in
- * SMP kernels.  For UP kernels, however, the cache of the single processor
- * is always consistent, so we only need to take care of compiler.
- */
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -299,18 +313,11 @@
 	__asm __volatile("" : : : "memory");		\
 	return (tmp);					\
 }							\
-							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile("" : : : "memory");		\
-	*p = v;						\
-}							\
 struct __hack
 
 #else /* !(_KERNEL && !SMP) */
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -324,19 +331,6 @@
 							\
 	return (res);					\
 }							\
-							\
-/*							\
- * The XCHG instruction asserts LOCK automagically.	\
- */							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile(SOP				\
-	: "=m" (*p),			/* 0 */		\
-	  "+r" (v)			/* 1 */		\
-	: "m" (*p)			/* 2 */		\
-	: "memory");					\
-}							\
 struct __hack
 
 #endif /* _KERNEL && !SMP */
@@ -363,13 +357,19 @@
 ATOMIC_ASM(add,	     long,  "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  "ir",  v);
 
-ATOMIC_STORE_LOAD(char,	"cmpxchgb %b0,%1", "xchgb %b1,%0");
-ATOMIC_STORE_LOAD(short,"cmpxchgw %w0,%1", "xchgw %w1,%0");
-ATOMIC_STORE_LOAD(int,	"cmpxchgl %0,%1",  "xchgl %1,%0");
-ATOMIC_STORE_LOAD(long,	"cmpxchgl %0,%1",  "xchgl %1,%0");
+ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
+ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
+ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
+ATOMIC_LOAD(long,  "cmpxchgl %0,%1");
+
+ATOMIC_STORE(char);
+ATOMIC_STORE(short);
+ATOMIC_STORE(int);
+ATOMIC_STORE(long);
 
 #undef ATOMIC_ASM
-#undef ATOMIC_STORE_LOAD
+#undef ATOMIC_LOAD
+#undef ATOMIC_STORE
 
 #ifndef WANT_FUNCTIONS
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/bootinfo.h
--- a/head/sys/i386/include/bootinfo.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/bootinfo.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/bootinfo.h 235391 2012-05-13 09:25:39Z avg $
  */
 
 #ifndef	_MACHINE_BOOTINFO_H_
@@ -65,13 +65,13 @@
 	u_int32_t	bi_kernend;		/* end of kernel space */
 	u_int32_t	bi_envp;		/* environment */
 	u_int32_t	bi_modulep;		/* preloaded modules */
+	uint32_t	bi_memdesc_version;	/* EFI memory desc version */
+	uint64_t	bi_memdesc_size;	/* sizeof EFI memory desc */
+	uint64_t	bi_memmap;		/* pa of EFI memory map */
+	uint64_t	bi_memmap_size;		/* size of EFI memory map */
 	uint64_t	bi_hcdp;		/* DIG64 HCDP table */
 	uint64_t	bi_fpswa;		/* FPSWA interface */
 	uint64_t	bi_systab;		/* pa of EFI system table */
-	uint64_t	bi_memmap;		/* pa of EFI memory map */
-	uint64_t	bi_memmap_size;		/* size of EFI memory map */
-	uint64_t	bi_memdesc_size;	/* sizeof EFI memory desc */
-	uint32_t	bi_memdesc_version;	/* EFI memory desc version */
 };
 
 #ifdef _KERNEL
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/cpufunc.h
--- a/head/sys/i386/include/cpufunc.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/cpufunc.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/cpufunc.h 223796 2011-07-05 18:42:10Z jkim $
+ * $FreeBSD: head/sys/i386/include/cpufunc.h 238311 2012-07-09 20:55:39Z jhb $
  */
 
 /*
@@ -97,6 +97,13 @@
 }
 
 static __inline void
+clts(void)
+{
+
+	__asm __volatile("clts");
+}
+
+static __inline void
 disable_intr(void)
 {
 #ifdef XEN
@@ -688,6 +695,9 @@
 int	breakpoint(void);
 u_int	bsfl(u_int mask);
 u_int	bsrl(u_int mask);
+void	clflush(u_long addr);
+void	clts(void);
+void	cpuid_count(u_int ax, u_int cx, u_int *p);
 void	disable_intr(void);
 void	do_cpuid(u_int ax, u_int *p);
 void	enable_intr(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/elf.h
--- a/head/sys/i386/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -96,6 +96,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/in_cksum.h
--- a/head/sys/i386/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -54,6 +54,7 @@
  * therefore always exactly five 32-bit words.
  */
 #if defined(__GNUCLIKE_ASM) && !defined(__INTEL_COMPILER)
+#if defined(IPVERSION) && (IPVERSION == 4)
 static __inline u_int
 in_cksum_hdr(const struct ip *ip)
 {
@@ -88,6 +89,7 @@
 	__tmpsum = (int)ntohs(ip->ip_sum) + 256;
 	ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16));
 }
+#endif
 
 static __inline u_short
 in_addword(u_short sum, u_short b)
@@ -121,6 +123,7 @@
 }
 
 #else
+#if defined(IPVERSION) && (IPVERSION == 4)
 #define	in_cksum_update(ip) \
 	do { \
 		int __tmpsum; \
@@ -129,10 +132,13 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
 #if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short in_addword(u_short sum, u_short b);
 u_short in_pseudo(u_int sum, u_int b, u_int c);
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/intr_machdep.h
--- a/head/sys/i386/include/intr_machdep.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/intr_machdep.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/intr_machdep.h 234207 2012-04-13 07:15:40Z avg $
+ * $FreeBSD: head/sys/i386/include/intr_machdep.h 234989 2012-05-03 21:44:01Z attilio $
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
@@ -131,9 +131,7 @@
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
-#ifdef SMP
 void	intr_add_cpu(u_int cpu);
-#endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep);
 #ifdef SMP
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/md_var.h
--- a/head/sys/i386/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/md_var.h 235622 2012-05-18 18:55:58Z iwasaki $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -91,6 +91,7 @@
 void	doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault));
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
+void	initializecpu(void);
 void	enable_sse(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 void	i686_pagezero(void *addr);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/npx.h
--- a/head/sys/i386/include/npx.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/npx.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.h	5.3 (Berkeley) 1/18/91
- * $FreeBSD: head/sys/i386/include/npx.h 233044 2012-03-16 20:24:30Z tijl $
+ * $FreeBSD: head/sys/i386/include/npx.h 238678 2012-07-21 21:52:48Z kib $
  */
 
 /*
@@ -55,7 +55,8 @@
 void	npxinit(void);
 void	npxsave(union savefpu *addr);
 void	npxsetregs(struct thread *td, union savefpu *addr);
-int	npxtrap(void);
+int	npxtrap_x87(void);
+int	npxtrap_sse(void);
 void	npxuserinited(struct thread *);
 struct fpu_kern_ctx *fpu_kern_alloc_ctx(u_int flags);
 void	fpu_kern_free_ctx(struct fpu_kern_ctx *ctx);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/pcb.h
--- a/head/sys/i386/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)pcb.h	5.10 (Berkeley) 5/12/91
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/pcb.h 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #ifndef _I386_PCB_H_
@@ -45,7 +45,10 @@
 #include <machine/npx.h>
 
 struct pcb {
+	int	pcb_cr0;
+	int	pcb_cr2;
 	int	pcb_cr3;
+	int	pcb_cr4;
 	int	pcb_edi;
 	int	pcb_esi;
 	int	pcb_ebp;
@@ -71,20 +74,30 @@
 #define	PCB_KERNNPX	0x40	/* kernel uses npx */
 
 	caddr_t	pcb_onfault;	/* copyin/out fault recovery */
+	int	pcb_ds;
+	int	pcb_es;
+	int	pcb_fs;
 	int	pcb_gs;
+	int	pcb_ss;
 	struct segment_descriptor pcb_fsd;
 	struct segment_descriptor pcb_gsd;
 	struct	pcb_ext	*pcb_ext;	/* optional pcb extension */
 	int	pcb_psl;	/* process status long */
 	u_long	pcb_vm86[2];	/* vm86bios scratch space */
 	union	savefpu *pcb_save;
+
+	struct region_descriptor pcb_gdt;
+	struct region_descriptor pcb_idt;
+	uint16_t	pcb_ldt;
+	uint16_t	pcb_tr;
 };
 
 #ifdef _KERNEL
 struct trapframe;
 
 void	makectx(struct trapframe *, struct pcb *);
-void	savectx(struct pcb *);
+int	savectx(struct pcb *) __returns_twice;
+void	resumectx(struct pcb *) __fastcall;
 #endif
 
 #endif /* _I386_PCB_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/pmap.h
--- a/head/sys/i386/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  *
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/i386/include/pmap.h 222813 2011-06-07 08:46:13Z attilio $
+ * $FreeBSD: head/sys/i386/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -481,7 +481,7 @@
 	pmap_t			pc_pmap;
 	TAILQ_ENTRY(pv_chunk)	pc_list;
 	uint32_t		pc_map[_NPCM];	/* bitmap; 1 = free */
-	uint32_t		pc_spare[2];
+	TAILQ_ENTRY(pv_chunk)	pc_lru;
 	struct pv_entry		pc_pventry[_NPCPV];
 };
 
@@ -498,6 +498,7 @@
 extern vm_offset_t virtual_end;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/smp.h
--- a/head/sys/i386/include/smp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/smp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -6,7 +6,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: head/sys/i386/include/smp.h 222853 2011-06-08 08:12:15Z avg $
+ * $FreeBSD: head/sys/i386/include/smp.h 236938 2012-06-12 00:14:54Z iwasaki $
  *
  */
 
@@ -53,13 +53,18 @@
 	IDTVEC(invlcache),	/* Write back and invalidate cache */
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
+	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
 	IDTVEC(rendezvous),	/* handle CPU rendezvous */
 	IDTVEC(lazypmap);	/* handle lazy pmap release */
 
 /* functions in mp_machdep.c */
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
+#ifndef XEN
+void	cpususpend_handler(void);
+#endif
 void	init_secondary(void);
+void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 #ifndef XEN
 void 	ipi_bitmap_handler(struct trapframe frame);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/i386/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/i386/include/vdso.h 237433 2012-06-22 07:06:40Z kib $ */
+
+#include <x86/vdso.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/vmparam.h
--- a/head/sys/i386/include/vmparam.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/vmparam.h	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)vmparam.h	5.9 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/i386/include/vmparam.h 228398 2011-12-10 18:42:00Z alc $
+ * $FreeBSD: head/sys/i386/include/vmparam.h 237435 2012-06-22 07:16:29Z kib $
  */
 
 
@@ -165,7 +165,8 @@
 
 #define VM_MAXUSER_ADDRESS	VADDR(PTDPTDI, 0)
 
-#define USRSTACK		VM_MAXUSER_ADDRESS
+#define	SHAREDPAGE		(VM_MAXUSER_ADDRESS - PAGE_SIZE)
+#define	USRSTACK		SHAREDPAGE
 
 #define VM_MAX_ADDRESS		VADDR(PTDPTDI, PTDPTDI)
 #define VM_MIN_ADDRESS		((vm_offset_t)0)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/isa/npx.c
--- a/head/sys/i386/isa/npx.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/isa/npx.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/isa/npx.c 230426 2012-01-21 17:45:27Z kib $");
+__FBSDID("$FreeBSD: head/sys/i386/isa/npx.c 238678 2012-07-21 21:52:48Z kib $");
 
 #include "opt_cpu.h"
 #include "opt_isa.h"
@@ -99,15 +99,7 @@
 #ifdef CPU_ENABLE_SSE
 #define	fxrstor(addr)		__asm __volatile("fxrstor %0" : : "m" (*(addr)))
 #define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
-#endif
-#ifdef XEN
-#define	start_emulating()	(HYPERVISOR_fpu_taskswitch(1))
-#define	stop_emulating()	(HYPERVISOR_fpu_taskswitch(0))
-#else
-#define	start_emulating()	__asm __volatile( \
-				    "smsw %%ax; orb %0,%%al; lmsw %%ax" \
-				    : : "n" (CR0_TS) : "ax")
-#define	stop_emulating()	__asm __volatile("clts")
+#define	stmxcsr(addr)		__asm __volatile("stmxcsr %0" : : "m" (*(addr)))
 #endif
 #else	/* !(__GNUCLIKE_ASM && !lint) */
 
@@ -122,12 +114,19 @@
 #ifdef CPU_ENABLE_SSE
 void	fxsave(caddr_t addr);
 void	fxrstor(caddr_t addr);
+void	stmxcsr(u_int csr);
 #endif
-void	start_emulating(void);
-void	stop_emulating(void);
 
 #endif	/* __GNUCLIKE_ASM && !lint */
 
+#ifdef XEN
+#define	start_emulating()	(HYPERVISOR_fpu_taskswitch(1))
+#define	stop_emulating()	(HYPERVISOR_fpu_taskswitch(0))
+#else
+#define	start_emulating()	load_cr0(rcr0() | CR0_TS)
+#define	stop_emulating()	clts()
+#endif
+
 #ifdef CPU_ENABLE_SSE
 #define GET_FPU_CW(thread) \
 	(cpu_fxsr ? \
@@ -584,29 +583,30 @@
 };
 
 /*
- * Preserve the FP status word, clear FP exceptions, then generate a SIGFPE.
+ * Read the FP status and control words, then generate si_code value
+ * for SIGFPE.  The error code chosen will be one of the
+ * FPE_... macros.  It will be sent as the second argument to old
+ * BSD-style signal handlers and as "siginfo_t->si_code" (second
+ * argument) to SA_SIGINFO signal handlers.
  *
- * Clearing exceptions is necessary mainly to avoid IRQ13 bugs.  We now
- * depend on longjmp() restoring a usable state.  Restoring the state
- * or examining it might fail if we didn't clear exceptions.
+ * Some time ago, we cleared the x87 exceptions with FNCLEX there.
+ * Clearing exceptions was necessary mainly to avoid IRQ13 bugs.  The
+ * usermode code which understands the FPU hardware enough to enable
+ * the exceptions, can also handle clearing the exception state in the
+ * handler.  The only consequence of not clearing the exception is the
+ * rethrow of the SIGFPE on return from the signal handler and
+ * reexecution of the corresponding instruction.
  *
- * The error code chosen will be one of the FPE_... macros. It will be
- * sent as the second argument to old BSD-style signal handlers and as
- * "siginfo_t->si_code" (second argument) to SA_SIGINFO signal handlers.
- *
- * XXX the FP state is not preserved across signal handlers.  So signal
- * handlers cannot afford to do FP unless they preserve the state or
- * longjmp() out.  Both preserving the state and longjmp()ing may be
- * destroyed by IRQ13 bugs.  Clearing FP exceptions is not an acceptable
- * solution for signals other than SIGFPE.
+ * For XMM traps, the exceptions were never cleared.
  */
 int
-npxtrap()
+npxtrap_x87(void)
 {
 	u_short control, status;
 
 	if (!hw_float) {
-		printf("npxtrap: fpcurthread = %p, curthread = %p, hw_float = %d\n",
+		printf(
+	"npxtrap_x87: fpcurthread = %p, curthread = %p, hw_float = %d\n",
 		       PCPU_GET(fpcurthread), curthread, hw_float);
 		panic("npxtrap from nowhere");
 	}
@@ -624,13 +624,32 @@
 		fnstcw(&control);
 		fnstsw(&status);
 	}
-
-	if (PCPU_GET(fpcurthread) == curthread)
-		fnclex();
 	critical_exit();
 	return (fpetable[status & ((~control & 0x3f) | 0x40)]);
 }
 
+#ifdef CPU_ENABLE_SSE
+int
+npxtrap_sse(void)
+{
+	u_int mxcsr;
+
+	if (!hw_float) {
+		printf(
+	"npxtrap_sse: fpcurthread = %p, curthread = %p, hw_float = %d\n",
+		       PCPU_GET(fpcurthread), curthread, hw_float);
+		panic("npxtrap from nowhere");
+	}
+	critical_enter();
+	if (PCPU_GET(fpcurthread) != curthread)
+		mxcsr = curthread->td_pcb->pcb_save->sv_xmm.sv_env.en_mxcsr;
+	else
+		stmxcsr(&mxcsr);
+	critical_exit();
+	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
+}
+#endif
+
 /*
  * Implement device not available (DNA) exception
  *
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/linux/linux.h
--- a/head/sys/i386/linux/linux.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/linux/linux.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/linux/linux.h 230132 2012-01-15 13:23:18Z uqs $
+ * $FreeBSD: head/sys/i386/linux/linux.h 235063 2012-05-05 19:42:38Z netchild $
  */
 
 #ifndef _I386_LINUX_H_
@@ -42,6 +42,7 @@
 #define	ldebug(name)	isclr(linux_debug_map, LINUX_SYS_linux_ ## name)
 #define	ARGS(nm, fmt)	"linux(%ld): "#nm"("fmt")\n", (long)td->td_proc->p_pid
 #define	LMSG(fmt)	"linux(%ld): "fmt"\n", (long)td->td_proc->p_pid
+#define	LINUX_DTRACE	linuxulator
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_LINUX);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/linux/linux_dummy.c
--- a/head/sys/i386/linux/linux_dummy.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/linux/linux_dummy.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,16 +27,25 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/linux/linux_dummy.c 234352 2012-04-16 21:22:02Z jkim $");
+__FBSDID("$FreeBSD: head/sys/i386/linux/linux_dummy.c 235063 2012-05-05 19:42:38Z netchild $");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <i386/linux/linux.h>
 #include <i386/linux/linux_proto.h>
+#include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
+/* DTrace init */
+LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
+
 DUMMY(stime);
 DUMMY(fstat);
 DUMMY(olduname);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/xen/pmap.c
--- a/head/sys/i386/xen/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/xen/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -75,7 +75,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/xen/pmap.c 229007 2011-12-30 18:16:15Z alc $");
+__FBSDID("$FreeBSD: head/sys/i386/xen/pmap.c 236534 2012-06-04 03:51:08Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -179,7 +179,6 @@
 #define PMAP_INLINE
 #endif
 
-#define PV_STATS
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
@@ -230,6 +229,7 @@
 /*
  * Data for the pv entry allocation mechanism
  */
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
@@ -277,8 +277,9 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
+static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
@@ -1914,6 +1915,7 @@
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
@@ -1927,7 +1929,7 @@
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
-static uint32_t pc_freemask[11] = {
+static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
@@ -1958,74 +1960,140 @@
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+pmap_pv_reclaim(pmap_t locked_pmap)
 {
+	struct pch newtail;
+	struct pv_chunk *pc;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
+	vm_page_t free, m, m_pc;
+	uint32_t inuse;
+	int bit, field, freed;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&newtail);
 	sched_pin();
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
-			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
+	    free == NULL)) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
-			pmap->pm_stats.resident_count--;
-			pte = pmap_pte_quick(pmap, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
+			}
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfl(inuse);
+				pv = &pc->pc_pventry[field * 32 + bit];
+				va = pv->pv_va;
+				pte = pmap_pte_quick(pmap, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_aflag_clear(m, PGA_WRITEABLE);
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, &free);
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+			continue;
+		}
+		/* Every freed mapping is for a 4 KB page. */
+		pmap->pm_stats.resident_count -= freed;
+		PV_STAT(pv_entry_frees += freed);
+		PV_STAT(pv_entry_spare += freed);
+		pv_entry_count -= freed;
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		for (field = 0; field < _NPCM; field++)
+			if (pc->pc_map[field] != pc_freemask[field]) {
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+
+				/*
+				 * One freed pv entry in locked_pmap is
+				 * sufficient.
+				 */
+				if (pmap == locked_pmap)
+					goto out;
+				break;
+			}
+		if (field == _NPCM) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+			pmap_qremove((vm_offset_t)pc, 1);
+			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			break;
+		}
 	}
+out:
 	sched_unpin();
+	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
-
 /*
  * free the pv_entry back to the free list
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
@@ -2039,13 +2107,30 @@
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
-	/* move to head of list */
-	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx]) {
-			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+			/*
+			 * 98% of the time, pc is already at the head of the
+			 * list.  If it isn't already, move it to the head.
+			 */
+			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
+			    pc)) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
 			return;
 		}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+ 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
@@ -2062,11 +2147,10 @@
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
@@ -2081,7 +2165,6 @@
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
-	pq = NULL;
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2111,29 +2194,16 @@
 	 * queues lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
-	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, (pq ==
-	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
+	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to
-		 * inactive pages.  After that, if a pv chunk entry
-		 * is still needed, destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = pmap_pv_reclaim(pmap);
+		if (m == NULL)
+			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2145,6 +2215,7 @@
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
@@ -3470,7 +3541,7 @@
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
@@ -3531,15 +3602,8 @@
 		}
 		PT_UPDATES_FLUSH();
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
-			pmap_qremove((vm_offset_t)pc, 1);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
-			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			free_pv_chunk(pc);
 		}
 	}
 	PT_UPDATES_FLUSH();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/acpica/acpi_wakeup.c
--- a/head/sys/ia64/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/acpica/acpi_wakeup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/acpica/acpi_wakeup.c 236409 2012-06-01 17:07:52Z jkim $
  */
 
 #include <sys/param.h>
@@ -39,6 +39,13 @@
 	return (0);
 }
 
+int
+acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
+    int intr_enabled)
+{
+	return (0);
+}
+
 void
 acpi_install_wakeup_handler(struct acpi_softc *sc)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/busdma_machdep.c
--- a/head/sys/ia64/ia64/busdma_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/busdma_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/busdma_machdep.c 232356 2012-03-01 19:58:34Z jhb $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/busdma_machdep.c 238184 2012-07-07 00:25:17Z marcel $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -262,7 +262,7 @@
 			atomic_add_int(&parent->ref_count, 1);
 	}
 
-	if (newtag->lowaddr < ptoa(Maxmem) && (flags & BUS_DMA_ALLOCNOW) != 0) {
+	if (newtag->lowaddr < paddr_max && (flags & BUS_DMA_ALLOCNOW) != 0) {
 		/* Must bounce */
 
 		if (ptoa(total_bpages) < maxsize) {
@@ -340,7 +340,7 @@
 	 * exclusion region, a data alignment that is stricter than 1, and/or
 	 * an active address boundary.
 	 */
-	if (dmat->lowaddr < ptoa(Maxmem)) {
+	if (dmat->lowaddr < paddr_max) {
 		/* Must bounce */
 		int maxpages;
 
@@ -356,7 +356,7 @@
 		 * Attempt to add pages to our pool on a per-instance
 		 * basis up to a sane limit.
 		 */
-		maxpages = MIN(MAX_BPAGES, Maxmem - atop(dmat->lowaddr));
+		maxpages = MIN(MAX_BPAGES, atop(paddr_max - dmat->lowaddr));
 		if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0
 		 || (dmat->map_count > 0 && total_bpages < maxpages)) {
 			int pages;
@@ -438,7 +438,7 @@
 	 */
 	if ((dmat->maxsize <= PAGE_SIZE) &&
 	   (dmat->alignment < dmat->maxsize) &&
-	    dmat->lowaddr >= ptoa(Maxmem)) {
+	    dmat->lowaddr >= paddr_max) {
 		*vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
 	} else {
 		/*
@@ -473,7 +473,7 @@
 		panic("bus_dmamem_free: Invalid map freed\n");
 	if ((dmat->maxsize <= PAGE_SIZE) &&
 	   (dmat->alignment < dmat->maxsize) &&
-	    dmat->lowaddr >= ptoa(Maxmem))
+	    dmat->lowaddr >= paddr_max)
 		free(vaddr, M_DEVBUF);
 	else {
 		contigfree(vaddr, dmat->maxsize, M_DEVBUF);
@@ -506,7 +506,7 @@
 	else
 		pmap = NULL;
 
-	if ((dmat->lowaddr < ptoa(Maxmem) || dmat->boundary > 0 ||
+	if ((dmat->lowaddr < paddr_max || dmat->boundary > 0 ||
 	    dmat->alignment > 1) && map != &nobounce_dmamap &&
 	    map->pagesneeded == 0) {
 		vm_offset_t vendaddr;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/machdep.c
--- a/head/sys/ia64/ia64/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/machdep.c 232250 2012-02-28 13:19:34Z gavin $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/machdep.c 238257 2012-07-08 18:00:22Z marcel $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -152,22 +152,11 @@
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
-
 struct msgbuf *msgbufp = NULL;
 
 /* Other subsystems (e.g., ACPI) can hook this later. */
 void (*cpu_idle_hook)(void) = NULL;
 
-long Maxmem = 0;
-long realmem = 0;
-
-#define	PHYSMAP_SIZE	(2 * VM_PHYSSEG_MAX)
-
-vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
-
-/* must be 2 less so 0 0 can signal end of chunks */
-#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
-
 struct kva_md_info kmi;
 
 #define	Mhz	1000000L
@@ -270,25 +259,8 @@
 #ifdef PERFMON
 	perfmon_init();
 #endif
-	printf("real memory  = %ld (%ld MB)\n", ia64_ptob(Maxmem),
-	    ia64_ptob(Maxmem) / 1048576);
-	realmem = Maxmem;
-
-	/*
-	 * Display any holes after the first chunk of extended memory.
-	 */
-	if (bootverbose) {
-		int indx;
-
-		printf("Physical memory chunk(s):\n");
-		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
-			long size1 = phys_avail[indx + 1] - phys_avail[indx];
-
-			printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n",
-			    phys_avail[indx], phys_avail[indx + 1] - 1, size1,
-			    size1 >> PAGE_SHIFT);
-		}
-	}
+	printf("real memory  = %ld (%ld MB)\n", ptoa(realmem),
+	    ptoa(realmem) / 1048576);
 
 	vm_ksubmap_init(&kmi);
 
@@ -534,6 +506,14 @@
 }
 
 void
+cpu_pcpu_setup(struct pcpu *pc, u_int acpi_id, u_int sapic_id)
+{
+
+	pc->pc_acpi_id = acpi_id;
+	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
+}
+ 
+void
 spinlock_enter(void)
 {
 	struct thread *td;
@@ -700,43 +680,86 @@
 ia64_init(void)
 {
 	struct ia64_init_return ret;
-	int phys_avail_cnt;
-	vm_offset_t kernstart, kernend;
-	vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1;
+	struct efi_md *md;
+	pt_entry_t *pbvm_pgtbl_ent, *pbvm_pgtbl_lim;
 	char *p;
-	struct efi_md *md;
+	vm_size_t mdlen;
 	int metadata_missing;
 
-	/* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */
+	/*
+	 * NO OUTPUT ALLOWED UNTIL FURTHER NOTICE.
+	 */
 
-	/*
-	 * TODO: Disable interrupts, floating point etc.
-	 * Maybe flush cache and tlb
-	 */
 	ia64_set_fpsr(IA64_FPSR_DEFAULT);
 
 	/*
-	 * TODO: Get critical system information (if possible, from the
-	 * information provided by the boot program).
+	 * Region 6 is direct mapped UC and region 7 is direct mapped
+	 * WC. The details of this is controlled by the Alt {I,D}TLB
+	 * handlers. Here we just make sure that they have the largest
+	 * possible page size to minimise TLB usage.
 	 */
+	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
+	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
+	ia64_srlz_d();
+
+	/* Initialize/setup physical memory datastructures */
+	ia64_physmem_init();
 
 	/*
-	 * Look for the I/O ports first - we need them for console
-	 * probing.
+	 * Process the memory map. This gives us the PAL locations,
+	 * the I/O port base address, the available memory regions
+	 * for initializing the physical memory map.
 	 */
 	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
+		mdlen = md->md_pages * EFI_PAGE_SIZE;
 		switch (md->md_type) {
 		case EFI_MD_TYPE_IOPORT:
 			ia64_port_base = (uintptr_t)pmap_mapdev(md->md_phys,
-			    md->md_pages * EFI_PAGE_SIZE);
+			    mdlen);
 			break;
 		case EFI_MD_TYPE_PALCODE:
-			ia64_pal_size = md->md_pages * EFI_PAGE_SIZE;
 			ia64_pal_base = md->md_phys;
+			ia64_pal_size = mdlen;
+			/*FALLTHROUGH*/
+		case EFI_MD_TYPE_BAD:
+		case EFI_MD_TYPE_FIRMWARE:
+		case EFI_MD_TYPE_RECLAIM:
+		case EFI_MD_TYPE_RT_CODE:
+		case EFI_MD_TYPE_RT_DATA:
+			/* Don't use these memory regions. */
+			ia64_physmem_track(md->md_phys, mdlen);
+			break;
+		case EFI_MD_TYPE_BS_CODE:
+		case EFI_MD_TYPE_BS_DATA:
+		case EFI_MD_TYPE_CODE:
+		case EFI_MD_TYPE_DATA:
+		case EFI_MD_TYPE_FREE:
+			/* These are ok to use. */
+			ia64_physmem_add(md->md_phys, mdlen);
 			break;
 		}
 	}
 
+	/*
+	 * Remove the PBVM and its page table from phys_avail. The loader
+	 * passes the physical address of the page table to us. The virtual
+	 * address of the page table is fixed.
+	 * Track and the PBVM limit for later use.
+	 */
+	ia64_physmem_delete(bootinfo->bi_pbvm_pgtbl, bootinfo->bi_pbvm_pgtblsz);
+	pbvm_pgtbl_ent = (void *)IA64_PBVM_PGTBL;
+	pbvm_pgtbl_lim = (void *)(IA64_PBVM_PGTBL + bootinfo->bi_pbvm_pgtblsz);
+	while (pbvm_pgtbl_ent < pbvm_pgtbl_lim) {
+		if ((*pbvm_pgtbl_ent & PTE_PRESENT) == 0)
+			break;
+		ia64_physmem_delete(*pbvm_pgtbl_ent & PTE_PPN_MASK,
+		    IA64_PBVM_PAGE_SIZE);
+		pbvm_pgtbl_ent++;
+	}
+
+	/* Finalize physical memory datastructures */
+	ia64_physmem_fini();
+
 	metadata_missing = 0;
 	if (bootinfo->bi_modulep)
 		preload_metadata = (caddr_t)bootinfo->bi_modulep;
@@ -757,31 +780,6 @@
 		bootverbose = 1;
 
 	/*
-	 * Find the beginning and end of the kernel.
-	 */
-	kernstart = trunc_page(kernel_text);
-#ifdef DDB
-	ksym_start = bootinfo->bi_symtab;
-	ksym_end = bootinfo->bi_esymtab;
-	kernend = (vm_offset_t)round_page(ksym_end);
-#else
-	kernend = (vm_offset_t)round_page(_end);
-#endif
-	/* But if the bootstrap tells us otherwise, believe it! */
-	if (bootinfo->bi_kernend)
-		kernend = round_page(bootinfo->bi_kernend);
-
-	/*
-	 * Region 6 is direct mapped UC and region 7 is direct mapped
-	 * WC. The details of this is controlled by the Alt {I,D}TLB
-	 * handlers. Here we just make sure that they have the largest
-	 * possible page size to minimise TLB usage.
-	 */
-	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
-	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
-	ia64_srlz_d();
-
-	/*
 	 * Wire things up so we can call the firmware.
 	 */
 	map_pal_code();
@@ -800,9 +798,8 @@
 	pcpup = &pcpu0;
 	ia64_set_k4((u_int64_t)pcpup);
 	pcpu_init(pcpup, 0, sizeof(pcpu0));
-	dpcpu_init((void *)kernend, 0);
-	PCPU_SET(md.lid, ia64_get_lid());
-	kernend += DPCPU_SIZE;
+	dpcpu_init(ia64_physmem_alloc(DPCPU_SIZE, PAGE_SIZE), 0);
+	cpu_pcpu_setup(pcpup, ~0U, ia64_get_lid());
 	PCPU_SET(curthread, &thread0);
 
 	/*
@@ -828,105 +825,20 @@
 		freeenv(p);
 	}
 
-	kernstartpfn = atop(IA64_RR_MASK(kernstart));
-	kernendpfn = atop(IA64_RR_MASK(kernend));
-
-	/*
-	 * Size the memory regions and load phys_avail[] with the results.
-	 */
-
-	/*
-	 * Find out how much memory is available, by looking at
-	 * the memory descriptors.
-	 */
-
-#ifdef DEBUG_MD
-	printf("Memory descriptor count: %d\n", mdcount);
-#endif
-
-	phys_avail_cnt = 0;
-	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
-#ifdef DEBUG_MD
-		printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md,
-		    md->md_type, md->md_phys, md->md_pages);
-#endif
-
-		pfn0 = ia64_btop(round_page(md->md_phys));
-		pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096));
-		if (pfn1 <= pfn0)
-			continue;
-
-		if (md->md_type != EFI_MD_TYPE_FREE)
-			continue;
-
-		/*
-		 * We have a memory descriptor that describes conventional
-		 * memory that is for general use. We must determine if the
-		 * loader has put the kernel in this region.
-		 */
-		physmem += (pfn1 - pfn0);
-		if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) {
-			/*
-			 * Must compute the location of the kernel
-			 * within the segment.
-			 */
-#ifdef DEBUG_MD
-			printf("Descriptor %p contains kernel\n", mp);
-#endif
-			if (pfn0 < kernstartpfn) {
-				/*
-				 * There is a chunk before the kernel.
-				 */
-#ifdef DEBUG_MD
-				printf("Loading chunk before kernel: "
-				       "0x%lx / 0x%lx\n", pfn0, kernstartpfn);
-#endif
-				phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
-				phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn);
-				phys_avail_cnt += 2;
-			}
-			if (kernendpfn < pfn1) {
-				/*
-				 * There is a chunk after the kernel.
-				 */
-#ifdef DEBUG_MD
-				printf("Loading chunk after kernel: "
-				       "0x%lx / 0x%lx\n", kernendpfn, pfn1);
-#endif
-				phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn);
-				phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
-				phys_avail_cnt += 2;
-			}
-		} else {
-			/*
-			 * Just load this cluster as one chunk.
-			 */
-#ifdef DEBUG_MD
-			printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i,
-			       pfn0, pfn1);
-#endif
-			phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
-			phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
-			phys_avail_cnt += 2;
-			
-		}
-	}
-	phys_avail[phys_avail_cnt] = 0;
-
-	Maxmem = physmem;
 	init_param2(physmem);
 
 	/*
 	 * Initialize error message buffer (at end of core).
 	 */
-	msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize);
+	msgbufp = ia64_physmem_alloc(msgbufsize, PAGE_SIZE);
 	msgbufinit(msgbufp, msgbufsize);
 
 	proc_linkup0(&proc0, &thread0);
 	/*
 	 * Init mapping for kernel stack for proc 0
 	 */
-	thread0.td_kstack = pmap_steal_memory(KSTACK_PAGES * PAGE_SIZE);
+	p = ia64_physmem_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
+	thread0.td_kstack = (uintptr_t)p;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 
 	mutex_init();
@@ -952,6 +864,11 @@
 	/*
 	 * Initialize debuggers, and break into them if appropriate.
 	 */
+#ifdef DDB
+	ksym_start = bootinfo->bi_symtab;
+	ksym_end = bootinfo->bi_esymtab;
+#endif
+
 	kdb_init();
 
 #ifdef KDB
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/mp_machdep.c
--- a/head/sys/ia64/ia64/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/mp_machdep.c 223758 2011-07-04 12:04:52Z attilio $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/mp_machdep.c 238257 2012-07-08 18:00:22Z marcel $");
 
 #include "opt_kstack_pages.h"
 
@@ -309,9 +309,8 @@
 	} else
 		pc = pcpup;
 
-	pc->pc_acpi_id = acpi_id;
-	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
-
+	cpu_pcpu_setup(pc, acpi_id, sapic_id);
+ 
 	CPU_SET(pc->pc_cpuid, &all_cpus);
 }
 
@@ -466,6 +465,7 @@
 	 */
 	ia64_bind_intr();
 }
+SYSINIT(start_aps, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, cpu_mp_unleash, NULL);
 
 /*
  * send an IPI to a set of cpus.
@@ -522,5 +522,3 @@
 	ia64_mf_a();
 	CTR3(KTR_SMP, "ipi_send(%p, %d): cpuid=%d", cpu, xiv, PCPU_GET(cpuid));
 }
-
-SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, cpu_mp_unleash, NULL);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/nexus.c
--- a/head/sys/ia64/ia64/nexus.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/nexus.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/ia64/ia64/nexus.c 224184 2011-07-18 14:04:37Z jhb $
+ * $FreeBSD: head/sys/ia64/ia64/nexus.c 235041 2012-05-04 23:16:29Z marcel $
  */
 
 /*
@@ -65,9 +65,6 @@
 
 #include <dev/acpica/acpivar.h>
 
-#include <isa/isareg.h>
-#include <sys/rtprio.h>
-
 #include "clock_if.h"
 
 static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device");
@@ -191,12 +188,6 @@
 nexus_attach(device_t dev)
 {
 
-	/*
-	 * Mask the legacy PICs - we will use the I/O SAPIC for interrupt.
-	 */
-	outb(IO_ICU1+1, 0xff);
-	outb(IO_ICU2+1, 0xff);
-
 	if (acpi_identify() == 0)
 		BUS_ADD_CHILD(dev, 10, "acpi", 0);
 	clock_register(dev, 1000);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/physmem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/ia64/ia64/physmem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,258 @@
+/*-
+ * Copyright (c) 2012 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/physmem.c 238190 2012-07-07 05:17:43Z marcel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+
+static u_int phys_avail_segs;
+
+vm_paddr_t phys_avail[2 * VM_PHYSSEG_MAX + 2];
+
+vm_paddr_t paddr_max;
+
+long realmem;
+
+static u_int
+ia64_physmem_find(vm_paddr_t base, vm_paddr_t lim)
+{
+	u_int idx;
+
+	for (idx = 0; phys_avail[idx + 1] != 0; idx += 2) {
+		if (phys_avail[idx] >= lim ||
+		    phys_avail[idx + 1] > base)
+			break;
+	}
+	return (idx);
+}
+
+static int
+ia64_physmem_insert(u_int idx, vm_paddr_t base, vm_paddr_t lim)
+{
+	u_int ridx;
+
+	if (phys_avail_segs == VM_PHYSSEG_MAX)
+		return (ENOMEM);
+
+	ridx = phys_avail_segs * 2;
+	while (idx < ridx) {
+		phys_avail[ridx + 1] = phys_avail[ridx - 1];
+		phys_avail[ridx] = phys_avail[ridx - 2];
+		ridx -= 2;
+	}
+	phys_avail[idx] = base;
+	phys_avail[idx + 1] = lim;
+	phys_avail_segs++;
+	return (0);
+}
+
+static int
+ia64_physmem_remove(u_int idx)
+{
+
+	if (phys_avail_segs == 0)
+		return (ENOENT);
+	do {
+		phys_avail[idx] = phys_avail[idx + 2];
+		phys_avail[idx + 1] = phys_avail[idx + 3];
+		idx += 2;
+	} while (phys_avail[idx + 1] != 0);
+	phys_avail_segs--;
+	return (0);
+}
+
+int
+ia64_physmem_add(vm_paddr_t base, vm_size_t len)
+{
+	vm_paddr_t lim;
+	u_int idx;
+
+	realmem += len;
+
+	lim = base + len;
+	idx = ia64_physmem_find(base, lim);
+	if (phys_avail[idx] == lim) {
+		phys_avail[idx] = base;
+		return (0);
+	}
+	if (idx > 0 && phys_avail[idx - 1] == base) {
+		phys_avail[idx - 1] = lim;
+		return (0);
+	}
+	return (ia64_physmem_insert(idx, base, lim));
+}
+
+int
+ia64_physmem_delete(vm_paddr_t base, vm_size_t len)
+{
+	vm_paddr_t lim;
+	u_int idx;
+
+	lim = base + len;
+	idx = ia64_physmem_find(base, lim);
+	if (phys_avail[idx] >= lim || phys_avail[idx + 1] == 0)
+		return (ENOENT);
+	if (phys_avail[idx] < base && phys_avail[idx + 1] > lim) {
+		len = phys_avail[idx + 1] - lim;
+		phys_avail[idx + 1] = base;
+		base = lim;
+		lim = base + len;
+		return (ia64_physmem_insert(idx + 2, base, lim));
+	} else {
+		if (phys_avail[idx] == base)
+			phys_avail[idx] = lim;
+		if (phys_avail[idx + 1] == lim)
+			phys_avail[idx + 1] = base;
+		if (phys_avail[idx] >= phys_avail[idx + 1])
+			return (ia64_physmem_remove(idx));
+	}
+	return (0);
+}
+
+int
+ia64_physmem_fini(void)
+{
+	vm_paddr_t base, lim, size;
+	u_int idx;
+
+	idx = 0;
+	while (phys_avail[idx + 1] != 0) {
+		base = round_page(phys_avail[idx]);
+		lim = trunc_page(phys_avail[idx + 1]);
+		if (base < lim) {
+			phys_avail[idx] = base;
+			phys_avail[idx + 1] = lim;
+			size = lim - base;
+			physmem += atop(size);
+			paddr_max = lim;
+			idx += 2;
+		} else
+			ia64_physmem_remove(idx);
+	}
+
+	/*
+	 * Round realmem to a multple of 128MB. Hopefully that compensates
+	 * for any loss of DRAM that isn't accounted for in the memory map.
+	 * I'm thinking legacy BIOS or VGA here. In any case, it's ok if
+	 * we got it wrong, because we don't actually use realmem. It's
+	 * just for show...
+	 */
+	size = 1U << 27;
+	realmem = (realmem + size - 1) & ~(size - 1);
+	realmem = atop(realmem);
+	return (0);
+}
+
+int
+ia64_physmem_init(void)
+{
+
+	/* Nothing to do just yet. */
+	return (0);
+}
+
+int
+ia64_physmem_track(vm_paddr_t base, vm_size_t len)
+{
+
+	realmem += len;
+	return (0);
+}
+
+void *
+ia64_physmem_alloc(vm_size_t len, vm_size_t align)
+{
+	vm_paddr_t base, lim, pa;
+	void *ptr;
+	u_int idx;
+
+	if (phys_avail_segs == 0)
+		return (NULL);
+
+	len = round_page(len);
+
+	/*
+	 * Try and allocate with least effort.
+	 */
+	idx = phys_avail_segs * 2;
+	while (idx > 0) {
+		idx -= 2;
+		base = phys_avail[idx];
+		lim = phys_avail[idx + 1];
+
+		if (lim - base < len)
+			continue;
+
+		/* First try from the end. */
+		pa = lim - len;
+		if ((pa & (align - 1)) == 0) {
+			if (pa == base)
+				ia64_physmem_remove(idx);
+			else
+				phys_avail[idx + 1] = pa;
+			goto gotit;
+		}
+
+		/* Try from the start next. */
+		pa = base;
+		if ((pa & (align - 1)) == 0) {
+			if (pa + len == lim)
+				ia64_physmem_remove(idx);
+			else
+				phys_avail[idx] += len;
+			goto gotit;
+		}
+	}
+
+	/*
+	 * Find a good segment and split it up.
+	 */
+	idx = phys_avail_segs * 2;
+	while (idx > 0) {
+		idx -= 2;
+		base = phys_avail[idx];
+		lim = phys_avail[idx + 1];
+
+		pa = (base + align - 1) & ~(align - 1);
+		if (pa + len <= lim) {
+			ia64_physmem_delete(pa, len);
+			goto gotit;
+		}
+	}
+
+	/* Out of luck. */
+	return (NULL);
+
+ gotit:
+	ptr = (void *)IA64_PHYS_TO_RR7(pa);
+	bzero(ptr, len);
+	return (ptr);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/pmap.c
--- a/head/sys/ia64/ia64/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -46,7 +46,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/pmap.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/pmap.c 238190 2012-07-07 05:17:43Z marcel $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -243,36 +243,6 @@
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 		    vm_page_t m);
 
-vm_offset_t
-pmap_steal_memory(vm_size_t size)
-{
-	vm_size_t bank_size;
-	vm_offset_t pa, va;
-
-	size = round_page(size);
-
-	bank_size = phys_avail[1] - phys_avail[0];
-	while (size > bank_size) {
-		int i;
-		for (i = 0; phys_avail[i+2]; i+= 2) {
-			phys_avail[i] = phys_avail[i+2];
-			phys_avail[i+1] = phys_avail[i+3];
-		}
-		phys_avail[i] = 0;
-		phys_avail[i+1] = 0;
-		if (!phys_avail[0])
-			panic("pmap_steal_memory: out of memory");
-		bank_size = phys_avail[1] - phys_avail[0];
-	}
-
-	pa = phys_avail[0];
-	phys_avail[0] += size;
-
-	va = IA64_PHYS_TO_RR7(pa);
-	bzero((caddr_t) va, size);
-	return va;
-}
-
 static void
 pmap_initialize_vhpt(vm_offset_t vhpt)
 {
@@ -289,21 +259,23 @@
 }
 
 #ifdef SMP
-MALLOC_DECLARE(M_SMP);
-
 vm_offset_t
 pmap_alloc_vhpt(void)
 {
 	vm_offset_t vhpt;
+	vm_page_t m;
 	vm_size_t size;
 
 	size = 1UL << pmap_vhpt_log2size;
-	vhpt = (uintptr_t)contigmalloc(size, M_SMP, 0, 0UL, ~0UL, size, 0UL);
-	if (vhpt != 0) {
-		vhpt = IA64_PHYS_TO_RR7(ia64_tpa(vhpt));
+	m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED, atop(size), 0UL, ~0UL, size, 0UL,
+	    VM_MEMATTR_DEFAULT);
+	if (m != NULL) {
+		vhpt = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
 		pmap_initialize_vhpt(vhpt);
+		return (vhpt);
 	}
-	return (vhpt);
+	return (0);
 }
 #endif
 
@@ -316,7 +288,7 @@
 	struct ia64_pal_result res;
 	vm_offset_t base;
 	size_t size;
-	int i, j, count, ridbits;
+	int i, ridbits;
 
 	/*
 	 * Query the PAL Code to find the loop parameters for the
@@ -378,7 +350,7 @@
 
 	pmap_ridmax = (1 << ridbits);
 	pmap_ridmapsz = pmap_ridmax / 64;
-	pmap_ridmap = (uint64_t *)pmap_steal_memory(pmap_ridmax / 8);
+	pmap_ridmap = ia64_physmem_alloc(pmap_ridmax / 8, PAGE_SIZE);
 	pmap_ridmap[0] |= 0xff;
 	pmap_rididx = 0;
 	pmap_ridcount = 8;
@@ -387,14 +359,10 @@
 	/*
 	 * Allocate some memory for initial kernel 'page tables'.
 	 */
-	ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE);
+	ia64_kptdir = ia64_physmem_alloc(PAGE_SIZE, PAGE_SIZE);
 	nkpt = 0;
 	kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 
-	for (i = 0; phys_avail[i+2]; i+= 2)
-		;
-	count = i+2;
-
 	/*
 	 * Determine a valid (mappable) VHPT size.
 	 */
@@ -408,35 +376,18 @@
 	if (pmap_vhpt_log2size & 1)
 		pmap_vhpt_log2size--;
 
-	base = 0;
 	size = 1UL << pmap_vhpt_log2size;
-	for (i = 0; i < count; i += 2) {
-		base = (phys_avail[i] + size - 1) & ~(size - 1);
-		if (base + size <= phys_avail[i+1])
-			break;
-	}
-	if (!phys_avail[i])
+	base = (uintptr_t)ia64_physmem_alloc(size, size);
+	if (base == 0)
 		panic("Unable to allocate VHPT");
 
-	if (base != phys_avail[i]) {
-		/* Split this region. */
-		for (j = count; j > i; j -= 2) {
-			phys_avail[j] = phys_avail[j-2];
-			phys_avail[j+1] = phys_avail[j-2+1];
-		}
-		phys_avail[i+1] = base;
-		phys_avail[i+2] = base + size;
-	} else
-		phys_avail[i] = base + size;
-
-	base = IA64_PHYS_TO_RR7(base);
 	PCPU_SET(md.vhpt, base);
 	if (bootverbose)
 		printf("VHPT: address=%#lx, size=%#lx\n", base, size);
 
 	pmap_vhpt_nbuckets = size / sizeof(struct ia64_lpte);
-	pmap_vhpt_bucket = (void *)pmap_steal_memory(pmap_vhpt_nbuckets *
-	    sizeof(struct ia64_bucket));
+	pmap_vhpt_bucket = ia64_physmem_alloc(pmap_vhpt_nbuckets *
+	    sizeof(struct ia64_bucket), PAGE_SIZE);
 	for (i = 0; i < pmap_vhpt_nbuckets; i++) {
 		/* Stolen memory is zeroed. */
 		mtx_init(&pmap_vhpt_bucket[i].mutex, "VHPT bucket lock", NULL,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/_stdint.h
--- a/head/sys/ia64/include/_stdint.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/_stdint.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/_stdint.h 237517 2012-06-24 04:15:58Z andrew $
  */
 
 #ifndef	_MACHINE__STDINT_H_
@@ -149,12 +149,6 @@
 /* Limit of size_t. */
 #define	SIZE_MAX	UINT64_MAX
 
-#ifndef WCHAR_MIN /* Also possibly defined in <wchar.h> */
-/* Limits of wchar_t. */
-#define	WCHAR_MIN	INT32_MIN
-#define	WCHAR_MAX	INT32_MAX
-#endif
-
 /* Limits of wint_t. */
 #define	WINT_MIN	INT32_MIN
 #define	WINT_MAX	INT32_MAX
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/_types.h
--- a/head/sys/ia64/include/_types.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/_types.h	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  *
  *	From: @(#)ansi.h	8.2 (Berkeley) 1/4/94
  *	From: @(#)types.h	8.3 (Berkeley) 1/5/94
- * $FreeBSD: head/sys/ia64/include/_types.h 228469 2011-12-13 13:38:03Z ed $
+ * $FreeBSD: head/sys/ia64/include/_types.h 237517 2012-06-24 04:15:58Z andrew $
  */
 
 #ifndef _MACHINE__TYPES_H_
@@ -96,6 +96,10 @@
 typedef	__uint64_t	__vm_paddr_t;
 typedef	__uint64_t	__vm_pindex_t;
 typedef	__uint64_t	__vm_size_t;
+typedef	int		__wchar_t;
+
+#define	__WCHAR_MIN	__INT_MIN	/* min value for a wchar_t */
+#define	__WCHAR_MAX	__INT_MAX	/* max value for a wchar_t */
 
 /*
  * Unusual type definitions.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/elf.h
--- a/head/sys/ia64/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -95,6 +95,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/in_cksum.h
--- a/head/sys/ia64/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -39,6 +39,7 @@
 
 #define in_cksum(m, len)	in_cksum_skip(m, len, 0)
 
+#if defined(IPVERSION) && (IPVERSION == 4)
 /*
  * It it useful to have an Internet checksum routine which is inlineable
  * and optimized specifically for the task of computing IP header checksums
@@ -65,9 +66,12 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short	in_addword(u_short sum, u_short b);
 u_short	in_pseudo(u_int sum, u_int b, u_int c);
 u_short	in_cksum_skip(struct mbuf *m, int len, int skip);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/md_var.h
--- a/head/sys/ia64/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/md_var.h 238257 2012-07-08 18:00:22Z marcel $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -61,6 +61,7 @@
 #ifdef _KERNEL
 
 struct _special;
+struct pcpu;
 struct thread;
 struct trapframe;
 
@@ -73,14 +74,14 @@
 };
 
 extern uint64_t ia64_lapic_addr;
-
-extern long Maxmem;
+extern vm_paddr_t paddr_max;
 extern u_int busdma_swi_pending;
 
 void	*acpi_find_table(const char *sig);
 void	busdma_swi(void);
 int	copyout_regstack(struct thread *, uint64_t *, uint64_t *);
 void	cpu_mp_add(u_int, u_int, u_int);
+void	cpu_pcpu_setup(struct pcpu *, u_int, u_int);
 int	do_ast(struct trapframe *);
 void	ia32_trap(int, struct trapframe *);
 int	ia64_count_cpus(void);
@@ -93,6 +94,12 @@
 int	ia64_highfp_save_ipi(void);
 struct ia64_init_return ia64_init(void);
 u_int	ia64_itc_freq(void);
+int	ia64_physmem_add(vm_paddr_t, vm_size_t);
+void	*ia64_physmem_alloc(vm_size_t, vm_size_t);
+int	ia64_physmem_delete(vm_paddr_t, vm_size_t);
+int	ia64_physmem_fini(void);
+int	ia64_physmem_init(void);
+int	ia64_physmem_track(vm_paddr_t, vm_size_t);
 void	ia64_probe_sapics(void);
 void	ia64_sync_icache(vm_offset_t, vm_size_t);
 void	interrupt(struct trapframe *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/param.h
--- a/head/sys/ia64/include/param.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/param.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/ia64/include/param.h 224217 2011-07-19 13:00:30Z attilio $ */
+/* $FreeBSD: head/sys/ia64/include/param.h 238184 2012-07-07 00:25:17Z marcel $ */
 /* From: NetBSD: param.h,v 1.20 1997/09/19 13:52:53 leo Exp */
 
 /*-
@@ -110,9 +110,6 @@
 #define atop(x)			((unsigned long)(x) >> PAGE_SHIFT)
 #define ptoa(x)			((unsigned long)(x) << PAGE_SHIFT)
 
-#define	ia64_btop(x)		((unsigned long)(x) >> PAGE_SHIFT)
-#define	ia64_ptob(x)		((unsigned long)(x) << PAGE_SHIFT)
-
 #define pgtok(x)                ((x) * (PAGE_SIZE / 1024)) 
 
 #endif	/* !_IA64_INCLUDE_PARAM_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/pcb.h
--- a/head/sys/ia64/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *	$FreeBSD$
+ *	$FreeBSD: head/sys/ia64/include/pcb.h 234785 2012-04-29 11:04:31Z dim $
  */
 
 #ifndef _MACHINE_PCB_H_
@@ -65,10 +65,10 @@
 
 void makectx(struct trapframe *, struct pcb *);
 void restorectx(struct pcb *) __dead2;
-int swapctx(struct pcb *old, struct pcb *new);
+int swapctx(struct pcb *old, struct pcb *new) __returns_twice;
 
 void ia32_restorectx(struct pcb *);
-void ia32_savectx(struct pcb *);
+void ia32_savectx(struct pcb *) __returns_twice;
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/pmap.h
--- a/head/sys/ia64/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
  *	from: i386 pmap.h,v 1.54 1997/11/20 19:30:35 bde Exp
- * $FreeBSD: head/sys/ia64/include/pmap.h 223873 2011-07-08 16:30:54Z marcel $
+ * $FreeBSD: head/sys/ia64/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -118,6 +118,7 @@
 
 #define	pmap_page_get_memattr(m)	((m)->md.memattr)
 #define	pmap_page_is_mapped(m)	(!TAILQ_EMPTY(&(m)->md.pv_list))
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_mapbios(pa, sz)	pmap_mapdev(pa, sz)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev(va, sz)
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/ia64/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,41 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/ia64/include/vdso.h 237433 2012-06-22 07:06:40Z kib $
+ */
+
+#ifndef _IA64_VDSO_H
+#define	_IA64_VDSO_H
+
+#define	VDSO_TIMEHANDS_MD			\
+	uint32_t	th_res[8];
+
+#ifdef _KERNEL
+#ifdef COMPAT_FREEBSD32
+
+#define	VDSO_TIMEHANDS_MD32	VDSO_TIMEHANDS_MD
+
+#endif
+#endif
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/capabilities.conf
--- a/head/sys/kern/capabilities.conf	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/capabilities.conf	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
 ## - sys_exit(2), abort2(2) and close(2) are very important.
 ## - Sorted alphabetically, please keep it that way.
 ##
-## $FreeBSD: head/sys/kern/capabilities.conf 224987 2011-08-18 22:51:30Z jonathan $
+## $FreeBSD: head/sys/kern/capabilities.conf 236361 2012-05-31 19:32:37Z pjd $
 ##
 
 ##
@@ -445,13 +445,17 @@
 faccessat
 fstatat
 fchmodat
+fchownat
 futimesat
+linkat
 mkdirat
-rmdirat
 mkfifoat
 mknodat
 openat
+readlinkat
 renameat
+symlinkat
+unlinkat
 
 ##
 ## Allow entry into open(2). This system call will fail, since access to the
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/dtio_kdtrace.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/dtio_kdtrace.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2012 Advanced Computing Technologies LLC
+ * Written by George Neville-Neil gnn at freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/dtio_kdtrace.c 238366 2012-07-11 16:27:02Z gnn $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+#include "../sys/dtrace_bsd.h"
+
+
+static int	dtio_unload(void);
+static void	dtio_getargdesc(void *, dtrace_id_t, void *,
+		    dtrace_argdesc_t *);
+static void	dtio_provide(void *, dtrace_probedesc_t *);
+static void	dtio_destroy(void *, dtrace_id_t, void *);
+static void	dtio_enable(void *, dtrace_id_t, void *);
+static void	dtio_disable(void *, dtrace_id_t, void *);
+static void	dtio_load(void *);
+
+static dtrace_pattr_t dtio_attr = {
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+};
+
+static char    *genunix = "genunix";
+
+/*
+ * Name strings.
+ */
+static char	*dtio_start_str = "start";
+static char	*dtio_done_str = "done";
+static char	*dtio_wait_start_str = "wait-start";
+static char	*dtio_wait_done_str = "wait-done";
+
+static dtrace_pops_t dtio_pops = {
+	dtio_provide,
+	NULL,
+	dtio_enable,
+	dtio_disable,
+	NULL,
+	NULL,
+	dtio_getargdesc,
+	NULL,
+	NULL,
+	dtio_destroy
+};
+
+static dtrace_provider_id_t	dtio_id;
+
+extern uint32_t	dtio_start_id;
+extern uint32_t	dtio_done_id;
+extern uint32_t	dtio_wait_start_id;
+extern uint32_t	dtio_wait_done_id;
+
+static void
+dtio_getargdesc(void *arg, dtrace_id_t id, void *parg,
+    dtrace_argdesc_t *desc)
+{
+	const char *p = NULL;
+
+	switch (desc->dtargd_ndx) {
+	case 0:
+		p = "struct bio *";
+		break;
+	case 1:
+		p = "struct devstat *";
+		break;
+	default:
+		desc->dtargd_ndx = DTRACE_ARGNONE;
+	}
+
+	if (p != NULL)
+		strlcpy(desc->dtargd_native, p, sizeof(desc->dtargd_native));
+}
+
+static void
+dtio_provide(void *arg, dtrace_probedesc_t *desc)
+{
+	if (desc != NULL)
+		return;
+
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_start_str) == 0) {
+		dtio_start_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_start_str, 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, dtio_done_str) == 0) {
+		dtio_done_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_done_str, 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_wait_start_str) == 0) {
+		dtio_wait_start_id = dtrace_probe_create(dtio_id, genunix, 
+							 NULL, 
+							 dtio_wait_start_str, 
+							 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_wait_done_str) == 0) {
+		dtio_wait_done_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_wait_done_str, 0, NULL);
+	}
+
+}
+
+static void
+dtio_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+static void
+dtio_enable(void *arg, dtrace_id_t id, void *parg)
+{
+	if (id == dtio_start_id)
+		dtrace_io_start_probe =
+			(dtrace_io_start_probe_func_t)dtrace_probe;
+	else if (id == dtio_done_id)
+		dtrace_io_done_probe =
+			(dtrace_io_done_probe_func_t)dtrace_probe;
+	else if (id == dtio_wait_start_id)
+		dtrace_io_wait_start_probe =
+			(dtrace_io_wait_start_probe_func_t)dtrace_probe;
+	else if (id == dtio_wait_done_id)
+		dtrace_io_wait_done_probe =
+			(dtrace_io_wait_done_probe_func_t)dtrace_probe;
+	else
+		printf("dtrace io provider: unknown ID\n");
+
+}
+
+static void
+dtio_disable(void *arg, dtrace_id_t id, void *parg)
+{
+	if (id == dtio_start_id)
+		dtrace_io_start_probe = NULL;
+	else if (id == dtio_done_id)
+		dtrace_io_done_probe = NULL;
+	else if (id == dtio_wait_start_id)
+		dtrace_io_wait_start_probe = NULL;
+	else if (id == dtio_wait_done_id)
+		dtrace_io_wait_done_probe = NULL;
+	else 
+		printf("dtrace io provider: unknown ID\n");
+	
+}
+
+static void
+dtio_load(void *dummy)
+{
+	if (dtrace_register("io", &dtio_attr, DTRACE_PRIV_USER, NULL, 
+			    &dtio_pops, NULL, &dtio_id) != 0)
+		return;
+}
+
+
+static int
+dtio_unload()
+{
+	dtrace_io_start_probe = NULL;
+	dtrace_io_done_probe = NULL;
+	dtrace_io_wait_start_probe = NULL;
+	dtrace_io_wait_done_probe = NULL;
+
+	return (dtrace_unregister(dtio_id));
+}
+
+static int
+dtio_modevent(module_t mod __unused, int type, void *data __unused)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		break;
+
+	case MOD_UNLOAD:
+		break;
+
+	case MOD_SHUTDOWN:
+		break;
+
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
+
+SYSINIT(dtio_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    dtio_load, NULL);
+SYSUNINIT(dtio_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    dtio_unload, NULL);
+
+DEV_MODULE(dtio, dtio_modevent, NULL);
+MODULE_VERSION(dtio, 1);
+MODULE_DEPEND(dtio, dtrace, 1, 1, 1);
+MODULE_DEPEND(dtio, opensolaris, 1, 1, 1);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_aout.c
--- a/head/sys/kern/imgact_aout.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_aout.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_aout.c 223165 2011-06-16 22:00:59Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_aout.c 238687 2012-07-22 13:41:45Z kib $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -106,6 +106,7 @@
 #define	AOUT32_USRSTACK	0xbfc00000
 #define	AOUT32_PS_STRINGS \
     (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
+#define	AOUT32_MINUSER	FREEBSD32_MINUSER
 
 extern const char *freebsd32_syscallnames[];
 extern u_long ia32_maxssiz;
@@ -129,7 +130,7 @@
 	.sv_imgact_try	= NULL,
 	.sv_minsigstksz	= MINSIGSTKSZ,
 	.sv_pagesize	= IA32_PAGE_SIZE,
-	.sv_minuser	= 0,
+	.sv_minuser	= AOUT32_MINUSER,
 	.sv_maxuser	= AOUT32_USRSTACK,
 	.sv_usrstack	= AOUT32_USRSTACK,
 	.sv_psstrings	= AOUT32_PS_STRINGS,
@@ -174,9 +175,9 @@
 	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
 	 * NetBSD is in network byte order.. ugh.
 	 */
-	if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
-	    ((a_out->a_magic >> 16) & 0xff) != 0 &&
-	    ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
+	if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
+	    ((a_out->a_midmag >> 16) & 0xff) != 0 &&
+	    ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
                 return -1;
 
 	/*
@@ -184,7 +185,7 @@
 	 *	We do two cases: host byte order and network byte order
 	 *	(for NetBSD compatibility)
 	 */
-	switch ((int)(a_out->a_magic & 0xffff)) {
+	switch ((int)(a_out->a_midmag & 0xffff)) {
 	case ZMAGIC:
 		virtual_offset = 0;
 		if (a_out->a_text) {
@@ -203,7 +204,7 @@
 		break;
 	default:
 		/* NetBSD compatibility */
-		switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
+		switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			virtual_offset = PAGE_SIZE;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_elf.c
--- a/head/sys/kern/imgact_elf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_elf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_elf.c 232828 2012-03-11 19:38:49Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_elf.c 238617 2012-07-19 11:15:53Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -83,7 +83,7 @@
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
-    const char *interp, int32_t *osrel);
+    const char *interp, int interp_name_len, int32_t *osrel);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry, size_t pagesize);
 static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
@@ -254,7 +254,7 @@
 
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
-    int32_t *osrel)
+    int interp_name_len, int32_t *osrel)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	Elf_Brandinfo *bi;
@@ -300,7 +300,10 @@
 			if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
 				continue;
 			if (hdr->e_machine == bi->machine &&
-			    strcmp(interp, bi->interp_path) == 0)
+			    /* ELF image p_filesz includes terminating zero */
+			    strlen(bi->interp_path) + 1 == interp_name_len &&
+			    strncmp(interp, bi->interp_path, interp_name_len)
+			    == 0)
 				return (bi);
 		}
 	}
@@ -722,7 +725,7 @@
 	u_long seg_size, seg_addr;
 	u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0;
 	int32_t osrel = 0;
-	int error = 0, i, n;
+	int error = 0, i, n, interp_name_len = 0;
 	const char *interp = NULL, *newinterp = NULL;
 	Elf_Brandinfo *brand_info;
 	char *path;
@@ -763,9 +766,11 @@
 		case PT_INTERP:
 			/* Path to interpreter */
 			if (phdr[i].p_filesz > MAXPATHLEN ||
-			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE)
+			    phdr[i].p_offset >= PAGE_SIZE ||
+			    phdr[i].p_offset + phdr[i].p_filesz >= PAGE_SIZE)
 				return (ENOEXEC);
 			interp = imgp->image_header + phdr[i].p_offset;
+			interp_name_len = phdr[i].p_filesz;
 			break;
 		case PT_GNU_STACK:
 			if (__elfN(nxstack))
@@ -775,7 +780,8 @@
 		}
 	}
 
-	brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel);
+	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
+	    &osrel);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
@@ -1011,6 +1017,10 @@
 		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
 		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
 	}
+	if (imgp->sysent->sv_timekeep_base != 0) {
+		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
+		    imgp->sysent->sv_timekeep_base);
+	}
 	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
 	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    imgp->sysent->sv_stackprot);
@@ -1558,6 +1568,7 @@
 	int i;
 
 	if (pnote == NULL || pnote->p_offset >= PAGE_SIZE ||
+	    pnote->p_filesz > PAGE_SIZE ||
 	    pnote->p_offset + pnote->p_filesz >= PAGE_SIZE)
 		return (FALSE);
 
@@ -1565,15 +1576,17 @@
 	note_end = (const Elf_Note *)(imgp->image_header +
 	    pnote->p_offset + pnote->p_filesz);
 	for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
-		if (!aligned(note, Elf32_Addr))
+		if (!aligned(note, Elf32_Addr) || (const char *)note_end -
+		    (const char *)note < sizeof(Elf_Note))
 			return (FALSE);
 		if (note->n_namesz != checknote->hdr.n_namesz ||
 		    note->n_descsz != checknote->hdr.n_descsz ||
 		    note->n_type != checknote->hdr.n_type)
 			goto nextnote;
 		note_name = (const char *)(note + 1);
-		if (strncmp(checknote->vendor, note_name,
-		    checknote->hdr.n_namesz) != 0)
+		if (note_name + checknote->hdr.n_namesz >=
+		    (const char *)note_end || strncmp(checknote->vendor,
+		    note_name, checknote->hdr.n_namesz) != 0)
 			goto nextnote;
 
 		/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_gzip.c
--- a/head/sys/kern/imgact_gzip.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_gzip.c	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_gzip.c 231885 2012-02-17 23:47:16Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_gzip.c 237694 2012-06-28 07:33:43Z imp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -161,7 +161,7 @@
 	 * Set file/virtual offset based on a.out variant. We do two cases:
 	 * host byte order and network byte order (for NetBSD compatibility)
 	 */
-	switch ((int) (gz->a_out.a_magic & 0xffff)) {
+	switch ((int) (gz->a_out.a_midmag & 0xffff)) {
 	case ZMAGIC:
 		gz->virtual_offset = 0;
 		if (gz->a_out.a_text) {
@@ -177,7 +177,7 @@
 		break;
 	default:
 		/* NetBSD compatibility */
-		switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
+		switch ((int) (ntohl(gz->a_out.a_midmag) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			gz->virtual_offset = PAGE_SIZE;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/init_main.c
--- a/head/sys/kern/init_main.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/init_main.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,7 +42,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/init_main.c 230455 2012-01-22 11:01:36Z pjd $");
+__FBSDID("$FreeBSD: head/sys/kern/init_main.c 236404 2012-06-01 15:42:37Z jhb $");
 
 #include "opt_ddb.h"
 #include "opt_init_path.h"
@@ -158,6 +158,24 @@
 	newsysinit_end = newset + count;
 }
 
+#if defined (DDB) && defined(VERBOSE_SYSINIT)
+static const char *
+symbol_name(vm_offset_t va, db_strategy_t strategy)
+{
+	const char *name;
+	c_db_sym_t sym;
+	db_expr_t  offset;
+
+	if (va == 0)
+		return (NULL);
+	sym = db_search_symbol(va, strategy, &offset);
+	if (offset != 0)
+		return (NULL);
+	db_symbol_values(sym, &name, NULL);
+	return (name);
+}
+#endif
+
 /*
  * System startup; initialize the world, create process 0, mount root
  * filesystem, and fork to create init and pagedaemon.  Most of the
@@ -238,15 +256,16 @@
 		}
 		if (verbose) {
 #if defined(DDB)
-			const char *name;
-			c_db_sym_t sym;
-			db_expr_t  offset;
+			const char *func, *data;
 
-			sym = db_search_symbol((vm_offset_t)(*sipp)->func,
-			    DB_STGY_PROC, &offset);
-			db_symbol_values(sym, &name, NULL);
-			if (name != NULL)
-				printf("   %s(%p)... ", name, (*sipp)->udata);
+			func = symbol_name((vm_offset_t)(*sipp)->func,
+			    DB_STGY_PROC);
+			data = symbol_name((vm_offset_t)(*sipp)->udata,
+			    DB_STGY_ANY);
+			if (func != NULL && data != NULL)
+				printf("   %s(&%s)... ", func, data);
+			else if (func != NULL)
+				printf("   %s(%p)... ", func, (*sipp)->udata);
 			else
 #endif
 				printf("   %p(%p)... ", (*sipp)->func,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/init_sysent.c
--- a/head/sys/kern/init_sysent.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/init_sysent.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/init_sysent.c 227776 2011-11-21 01:26:10Z lstewart $
- * created from FreeBSD: head/sys/kern/syscalls.master 227691 2011-11-19 06:35:15Z ed 
+ * $FreeBSD: head/sys/kern/init_sysent.c 236363 2012-05-31 19:34:53Z pjd $
+ * created from FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #include "opt_compat.h"
@@ -525,19 +525,19 @@
 	{ AS(cpuset_setaffinity_args), (sy_call_t *)sys_cpuset_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 488 = cpuset_setaffinity */
 	{ AS(faccessat_args), (sy_call_t *)sys_faccessat, AUE_FACCESSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 489 = faccessat */
 	{ AS(fchmodat_args), (sy_call_t *)sys_fchmodat, AUE_FCHMODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 490 = fchmodat */
-	{ AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 491 = fchownat */
+	{ AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 491 = fchownat */
 	{ AS(fexecve_args), (sy_call_t *)sys_fexecve, AUE_FEXECVE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 492 = fexecve */
 	{ AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 493 = fstatat */
 	{ AS(futimesat_args), (sy_call_t *)sys_futimesat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 494 = futimesat */
-	{ AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 495 = linkat */
+	{ AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 495 = linkat */
 	{ AS(mkdirat_args), (sy_call_t *)sys_mkdirat, AUE_MKDIRAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 496 = mkdirat */
 	{ AS(mkfifoat_args), (sy_call_t *)sys_mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 497 = mkfifoat */
 	{ AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 498 = mknodat */
 	{ AS(openat_args), (sy_call_t *)sys_openat, AUE_OPENAT_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 499 = openat */
-	{ AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 500 = readlinkat */
+	{ AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 500 = readlinkat */
 	{ AS(renameat_args), (sy_call_t *)sys_renameat, AUE_RENAMEAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 501 = renameat */
-	{ AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 502 = symlinkat */
-	{ AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 503 = unlinkat */
+	{ AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 502 = symlinkat */
+	{ AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 503 = unlinkat */
 	{ AS(posix_openpt_args), (sy_call_t *)sys_posix_openpt, AUE_POSIX_OPENPT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 504 = posix_openpt */
 	{ AS(gssd_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },	/* 505 = gssd_syscall */
 	{ AS(jail_get_args), (sy_call_t *)sys_jail_get, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 506 = jail_get */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_acct.c
--- a/head/sys/kern/kern_acct.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_acct.c	Wed Jul 25 16:40:53 2012 +0300
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_acct.c 225617 2011-09-16 13:58:51Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_acct.c 234927 2012-05-02 14:25:39Z jhb $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -122,7 +122,7 @@
 static uint32_t	encode_long(long);
 static void	acctwatch(void);
 static void	acct_thread(void *);
-static int	acct_disable(struct thread *);
+static int	acct_disable(struct thread *, int);
 
 /*
  * Accounting vnode pointer, saved vnode pointer, and flags for each.
@@ -196,7 +196,7 @@
 sys_acct(struct thread *td, struct acct_args *uap)
 {
 	struct nameidata nd;
-	int error, flags, vfslocked;
+	int error, flags, vfslocked, replacing;
 
 	error = priv_check(td, PRIV_ACCT);
 	if (error)
@@ -246,6 +246,13 @@
 	sx_xlock(&acct_sx);
 
 	/*
+	 * Don't log spurious disable/enable messages if we are
+	 * switching from one accounting file to another due to log
+	 * rotation.
+	 */
+	replacing = (acct_vp != NULL && uap->path != NULL);
+
+	/*
 	 * If accounting was previously enabled, kill the old space-watcher,
 	 * close the file, and (if no new file was specified, leave).  Reset
 	 * the suspended state regardless of whether accounting remains
@@ -254,7 +261,7 @@
 	acct_suspended = 0;
 	if (acct_vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
-		error = acct_disable(td);
+		error = acct_disable(td, !replacing);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (uap->path == NULL) {
@@ -299,7 +306,8 @@
 	}
 	acct_configured = 1;
 	sx_xunlock(&acct_sx);
-	log(LOG_NOTICE, "Accounting enabled\n");
+	if (!replacing)
+		log(LOG_NOTICE, "Accounting enabled\n");
 	return (error);
 }
 
@@ -308,7 +316,7 @@
  * our reference to the credential, and clearing the vnode's flags.
  */
 static int
-acct_disable(struct thread *td)
+acct_disable(struct thread *td, int logging)
 {
 	int error;
 
@@ -319,7 +327,8 @@
 	acct_vp = NULL;
 	acct_cred = NULL;
 	acct_flags = 0;
-	log(LOG_NOTICE, "Accounting disabled\n");
+	if (logging)
+		log(LOG_NOTICE, "Accounting disabled\n");
 	return (error);
 }
 
@@ -574,7 +583,7 @@
 	 */
 	vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
 	if (acct_vp->v_type == VBAD) {
-		(void) acct_disable(NULL);
+		(void) acct_disable(NULL, 1);
 		VFS_UNLOCK_GIANT(vfslocked);
 		acct_state |= ACCT_EXITREQ;
 		return;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_clock.c
--- a/head/sys/kern/kern_clock.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_clock.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,11 +35,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 233628 2012-03-28 20:58:30Z fabient $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
 #include "opt_ntp.h"
 #include "opt_watchdog.h"
 
@@ -56,6 +57,7 @@
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
@@ -88,6 +90,9 @@
 /* Spin-lock protecting profiling statistics. */
 static struct mtx time_lock;
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *");
+
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
@@ -760,6 +765,7 @@
 		ru->ru_maxrss = rss;
 	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
 	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
+	SDT_PROBE2(sched, , , tick, td, td->td_proc);
 	thread_lock_flags(td, MTX_QUIET);
 	for ( ; cnt > 0; cnt--)
 		sched_clock(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_conf.c
--- a/head/sys/kern/kern_conf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_conf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_conf.c 231386 2012-02-10 14:55:47Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_conf.c 235899 2012-05-24 11:24:44Z mav $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -993,9 +993,10 @@
 	max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1;
 	parentpath_len = strlen(pdev->si_name);
 	if (max_parentpath_len < parentpath_len) {
-		printf("make_dev_physpath_alias: WARNING - Unable to alias %s "
-		    "to %s/%s - path too long\n",
-		    pdev->si_name, physpath, pdev->si_name);
+		if (bootverbose)
+			printf("WARNING: Unable to alias %s "
+			    "to %s/%s - path too long\n",
+			    pdev->si_name, physpath, pdev->si_name);
 		ret = ENAMETOOLONG;
 		goto out;
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_descrip.c
--- a/head/sys/kern/kern_descrip.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_descrip.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 234131 2012-04-11 14:08:09Z eadler $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 238667 2012-07-21 13:02:11Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -102,7 +102,7 @@
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
-		     "file desc to leader structures");
+    "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
 MALLOC_DECLARE(M_FADVISE);
@@ -113,21 +113,24 @@
 /* Flags for do_dup() */
 #define DUP_FIXED	0x1	/* Force fixed allocation */
 #define DUP_FCNTL	0x2	/* fcntl()-style errors */
-
-static int do_dup(struct thread *td, int flags, int old, int new,
-    register_t *retval);
-static int	fd_first_free(struct filedesc *, int, int);
-static int	fd_last_used(struct filedesc *, int, int);
-static void	fdgrowtable(struct filedesc *, int);
+#define	DUP_CLOEXEC	0x4	/* Atomically set FD_CLOEXEC. */
+
+static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
+		    struct thread *td, int holdleaders);
+static int	do_dup(struct thread *td, int flags, int old, int new,
+		    register_t *retval);
+static int	fd_first_free(struct filedesc *fdp, int low, int size);
+static int	fd_last_used(struct filedesc *fdp, int size);
+static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
-static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
-static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
-static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
 static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
 static int	fill_procdesc_info(struct procdesc *pdp,
-    struct kinfo_file *kif);
+		    struct kinfo_file *kif);
+static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
 static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
 
 /*
  * A process is initially started out with NDFILE descriptors stored within
@@ -181,14 +184,15 @@
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
-void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
-static struct mtx	fdesc_mtx;
+static struct mtx fdesc_mtx;
 
 /*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
@@ -214,19 +218,16 @@
 }
 
 /*
- * Find the highest non-zero bit in the given bitmap, starting at low and
- * not exceeding size - 1.
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
  */
 static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
-	if (low >= size)
-		return (-1);
-
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
@@ -234,17 +235,21 @@
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
-	for (minoff = NDSLOT(low); off >= minoff; --off)
+	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
-	return (low - 1);
+	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
-        KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
-            ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
@@ -256,8 +261,8 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(!fdisused(fdp, fd),
-	    ("fd already used"));
+
+	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
@@ -274,16 +279,15 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(fdisused(fdp, fd),
-	    ("fd is already unused"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("fd is still in use"));
+
+	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+	KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
-		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
+		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
@@ -363,7 +367,7 @@
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	struct flock fl;
-	struct oflock ofl;
+	struct __oflock ofl;
 	intptr_t arg;
 	int error;
 	int cmd;
@@ -427,23 +431,13 @@
 	return (error);
 }
 
-static inline struct file *
-fdtofp(int fd, struct filedesc *fdp)
-{
-	struct file *fp;
-
-	FILEDESC_LOCK_ASSERT(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
-		return (NULL);
-	return (fp);
-}
-
 static inline int
 fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
 {
 
-	*fpp = fdtofp(fd, fdp);
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	*fpp = fget_locked(fdp, fd);
 	if (*fpp == NULL)
 		return (EBADF);
 
@@ -472,6 +466,7 @@
 	int vfslocked;
 	u_int old, new;
 	uint64_t bsize;
+	off_t foffset;
 
 	vfslocked = 0;
 	error = 0;
@@ -485,6 +480,12 @@
 		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
 		break;
 
+	case F_DUPFD_CLOEXEC:
+		tmp = arg;
+		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
+		    td->td_retval);
+		break;
+
 	case F_DUP2FD:
 		tmp = arg;
 		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
@@ -492,7 +493,7 @@
 
 	case F_GETFD:
 		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -504,7 +505,7 @@
 
 	case F_SETFD:
 		FILEDESC_XLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -613,14 +614,15 @@
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
-			if (fp->f_offset < 0 ||
+			foffset = foffset_get(fp);
+			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
-			     fp->f_offset > OFF_MAX - flp->l_start)) {
+			     foffset > OFF_MAX - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
-			flp->l_start += fp->f_offset;
+			flp->l_start += foffset;
 		}
 
 		/*
@@ -675,10 +677,30 @@
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
-		/* Check for race with close */
+		if (error != 0 || flp->l_type == F_UNLCK ||
+		    flp->l_type == F_UNLCKSYS) {
+			fdrop(fp, td);
+			break;
+		}
+
+		/*
+		 * Check for a race with close.
+		 *
+		 * The vnode is now advisory locked (or unlocked, but this case
+		 * is not really important) as the caller requested.
+		 * We had to drop the filedesc lock, so we need to recheck if
+		 * the descriptor is still valid, because if it was closed
+		 * in the meantime we need to remove advisory lock from the
+		 * vnode - close on any descriptor leading to an advisory
+		 * locked vnode, removes that lock.
+		 * We will return 0 on purpose in that case, as the result of
+		 * successful advisory lock might have been externally visible
+		 * already. This is fine - effectively we pretend to the caller
+		 * that the closing thread was a bit slower and that the
+		 * advisory lock succeeded before the close.
+		 */
 		FILEDESC_SLOCK(fdp);
-		if ((unsigned) fd >= fdp->fd_nfiles ||
-		    fp != fdp->fd_ofiles[fd]) {
+		if (fget_locked(fdp, fd) != fp) {
 			FILEDESC_SUNLOCK(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
@@ -686,7 +708,7 @@
 			flp->l_type = F_UNLCK;
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
-					   F_UNLCK, flp, F_POSIX);
+			    F_UNLCK, flp, F_POSIX);
 			VFS_UNLOCK_GIANT(vfslocked);
 			vfslocked = 0;
 		} else
@@ -714,15 +736,16 @@
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
+			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
-			    fp->f_offset > OFF_MAX - flp->l_start) ||
+			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
-			     fp->f_offset < OFF_MIN - flp->l_start)) {
+			     foffset < OFF_MIN - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
-			flp->l_start += fp->f_offset;
+			flp->l_start += foffset;
 		}
 		/*
 		 * VOP_ADVLOCK() may block.
@@ -743,7 +766,7 @@
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -799,7 +822,7 @@
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
-	int error, holdleaders, maxfd;
+	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
@@ -820,7 +843,7 @@
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
 
 	FILEDESC_XLOCK(fdp);
-	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+	if (fget_locked(fdp, old) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
@@ -871,77 +894,29 @@
 		}
 	}
 
+	KASSERT(fp == fdp->fd_ofiles[old], ("old fd has been modified"));
+	KASSERT(old != new, ("new fd is same as old"));
+
+	delfp = fdp->fd_ofiles[new];
 	/*
-	 * If the old file changed out from under us then treat it as a
-	 * bad file descriptor.  Userland should do its own locking to
-	 * avoid this case.
-	 */
-	if (fdp->fd_ofiles[old] != fp) {
-		/* we've allocated a descriptor which we won't use */
-		if (fdp->fd_ofiles[new] == NULL)
-			fdunused(fdp, new);
-		FILEDESC_XUNLOCK(fdp);
-		fdrop(fp, td);
-		return (EBADF);
-	}
-	KASSERT(old != new,
-	    ("new fd is same as old"));
-
-	/*
-	 * Save info on the descriptor being overwritten.  We cannot close
-	 * it without introducing an ownership race for the slot, since we
-	 * need to drop the filedesc lock to call closef().
-	 *
-	 * XXX this duplicates parts of close().
-	 */
-	delfp = fdp->fd_ofiles[new];
-	holdleaders = 0;
-	if (delfp != NULL) {
-		if (td->td_proc->p_fdtol != NULL) {
-			/*
-			 * Ask fdfree() to sleep to ensure that all relevant
-			 * process leaders can be traversed in closef().
-			 */
-			fdp->fd_holdleaderscount++;
-			holdleaders = 1;
-		}
-	}
-
-	/*
-	 * Duplicate the source descriptor
+	 * Duplicate the source descriptor.
 	 */
 	fdp->fd_ofiles[new] = fp;
-	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+	if ((flags & DUP_CLOEXEC) != 0)
+		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] | UF_EXCLOSE;
+	else
+		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
 
-	/*
-	 * If we dup'd over a valid file, we now own the reference to it
-	 * and must dispose of it using closef() semantics (as if a
-	 * close() were performed on it).
-	 *
-	 * XXX this duplicates parts of close().
-	 */
 	if (delfp != NULL) {
-		knote_fdclose(td, new);
-		if (delfp->f_type == DTYPE_MQUEUE)
-			mq_fdclose(td, new, delfp);
-		FILEDESC_XUNLOCK(fdp);
-		(void) closef(delfp, td);
-		if (holdleaders) {
-			FILEDESC_XLOCK(fdp);
-			fdp->fd_holdleaderscount--;
-			if (fdp->fd_holdleaderscount == 0 &&
-			    fdp->fd_holdleaderswakeup != 0) {
-				fdp->fd_holdleaderswakeup = 0;
-				wakeup(&fdp->fd_holdleaderscount);
-			}
-			FILEDESC_XUNLOCK(fdp);
-		}
+		(void) closefp(fdp, new, delfp, td, 1);
+		/* closefp() drops the FILEDESC lock for us. */
 	} else {
 		FILEDESC_XUNLOCK(fdp);
 	}
+
 	return (0);
 }
 
@@ -1165,6 +1140,61 @@
 }
 
 /*
+ * Function drops the filedesc lock on return.
+ */
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+    int holdleaders)
+{
+	struct file *fp_object;
+	int error;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	if (holdleaders) {
+		if (td->td_proc->p_fdtol != NULL) {
+			/*
+			 * Ask fdfree() to sleep to ensure that all relevant
+			 * process leaders can be traversed in closef().
+			 */
+			fdp->fd_holdleaderscount++;
+		} else {
+			holdleaders = 0;
+		}
+	}
+
+	/*
+	 * We now hold the fp reference that used to be owned by the
+	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
+	 * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleteing a knote for the new fd.
+	 */
+	knote_fdclose(td, fd);
+
+	/*
+	 * When we're closing an fd with a capability, we need to notify
+	 * mqueue if the underlying object is of type mqueue.
+	 */
+	(void)cap_funwrap(fp, 0, &fp_object);
+	if (fp_object->f_type == DTYPE_MQUEUE)
+		mq_fdclose(td, fd, fp_object);
+	FILEDESC_XUNLOCK(fdp);
+
+	error = closef(fp, td);
+	if (holdleaders) {
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_holdleaderscount--;
+		if (fdp->fd_holdleaderscount == 0 &&
+		    fdp->fd_holdleaderswakeup != 0) {
+			fdp->fd_holdleaderswakeup = 0;
+			wakeup(&fdp->fd_holdleaderscount);
+		}
+		FILEDESC_XUNLOCK(fdp);
+	}
+	return (error);
+}
+
+/*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -1188,63 +1218,23 @@
 	int fd;
 {
 	struct filedesc *fdp;
-	struct file *fp, *fp_object;
-	int error;
-	int holdleaders;
-
-	error = 0;
-	holdleaders = 0;
+	struct file *fp;
+
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL) {
+	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdp->fd_ofiles[fd] = NULL;
 	fdp->fd_ofileflags[fd] = 0;
 	fdunused(fdp, fd);
-	if (td->td_proc->p_fdtol != NULL) {
-		/*
-		 * Ask fdfree() to sleep to ensure that all relevant
-		 * process leaders can be traversed in closef().
-		 */
-		fdp->fd_holdleaderscount++;
-		holdleaders = 1;
-	}
-
-	/*
-	 * We now hold the fp reference that used to be owned by the
-	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
-	 * knote_fdclose to prevent a race of the fd getting opened, a knote
-	 * added, and deleteing a knote for the new fd.
-	 */
-	knote_fdclose(td, fd);
-
-	/*
-	 * When we're closing an fd with a capability, we need to notify
-	 * mqueue if the underlying object is of type mqueue.
-	 */
-	(void)cap_funwrap(fp, 0, &fp_object);
-	if (fp_object->f_type == DTYPE_MQUEUE)
-		mq_fdclose(td, fd, fp_object);
-	FILEDESC_XUNLOCK(fdp);
-
-	error = closef(fp, td);
-	if (holdleaders) {
-		FILEDESC_XLOCK(fdp);
-		fdp->fd_holdleaderscount--;
-		if (fdp->fd_holdleaderscount == 0 &&
-		    fdp->fd_holdleaderswakeup != 0) {
-			fdp->fd_holdleaderswakeup = 0;
-			wakeup(&fdp->fd_holdleaderscount);
-		}
-		FILEDESC_XUNLOCK(fdp);
-	}
-	return (error);
+
+	/* closefp() drops the FILEDESC lock for us. */
+	return (closefp(fdp, fd, fp, td, 1));
 }
 
 /*
@@ -1407,6 +1397,7 @@
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		int vfslocked;
+
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
@@ -1417,7 +1408,7 @@
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
-		error = 0;
+			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
@@ -1428,9 +1419,7 @@
 }
 
 /*
- * Grow the file table to accomodate (at least) nfd descriptors.  This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
+ * Grow the file table to accomodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
@@ -1456,7 +1445,6 @@
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
-	FILEDESC_XUNLOCK(fdp);
 	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
@@ -1465,20 +1453,7 @@
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
-	FILEDESC_XLOCK(fdp);
-
-	/*
-	 * We now have new tables ready to go.  Since we dropped the
-	 * filedesc lock to call malloc(), watch out for a race.
-	 */
-	onfiles = fdp->fd_nfiles;
-	if (onfiles >= nnfiles) {
-		/* we lost the race, but that's OK */
-		free(ntable, M_FILEDESC);
-		if (nmap != NULL)
-			free(nmap, M_FILEDESC);
-		return;
-	}
+
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
 	otable = fdp->fd_ofiles;
@@ -1512,7 +1487,7 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
-	int fd = -1, maxfd;
+	int fd = -1, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
@@ -1527,36 +1502,38 @@
 	PROC_UNLOCK(p);
 
 	/*
-	 * Search the bitmap for a free descriptor.  If none is found, try
-	 * to grow the file table.  Keep at it until we either get a file
-	 * descriptor or run into process or system limits; fdgrowtable()
-	 * may drop the filedesc lock, so we're in a race.
+	 * Search the bitmap for a free descriptor starting at minfd.
+	 * If none is found, grow the file table.
 	 */
-	for (;;) {
-		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
-		if (fd >= maxfd)
-			return (EMFILE);
-		if (fd < fdp->fd_nfiles)
-			break;
+	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+	if (fd >= maxfd)
+		return (EMFILE);
+	if (fd >= fdp->fd_nfiles) {
+		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		PROC_LOCK(p);
-		error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
+		error = racct_set(p, RACCT_NOFILE, allocfd);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EMFILE);
 #endif
-		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
+		/*
+		 * fd is already equal to first free descriptor >= minfd, so
+		 * we only need to grow the table and we are done.
+		 */
+		fdgrowtable(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
+	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("free descriptor isn't"));
-	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+	KASSERT(fdp->fd_ofiles[fd] == NULL, ("file descriptor isn't free"));
+	KASSERT(fdp->fd_ofileflags[fd] == 0, ("file flags are set"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
@@ -1571,7 +1548,6 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
-	struct file **fpp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp);
@@ -1587,9 +1563,8 @@
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	last = min(fdp->fd_nfiles, lim);
-	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
-	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
-		if (*fpp == NULL && --n <= 0)
+	for (i = fdp->fd_freefile; i < last; i++) {
+		if (fdp->fd_ofiles[i] == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
@@ -1848,7 +1823,6 @@
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
-	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
@@ -1875,13 +1849,10 @@
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
-			for (i = 0, fpp = fdp->fd_ofiles;
-			     i <= fdp->fd_lastfile;
-			     i++, fpp++) {
-				if (*fpp == NULL ||
-				    (*fpp)->f_type != DTYPE_VNODE)
+			for (i = 0; i <= fdp->fd_lastfile; i++) {
+				fp = fdp->fd_ofiles[i];
+				if (fp == NULL || fp->f_type != DTYPE_VNODE)
 					continue;
-				fp = *fpp;
 				fhold(fp);
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
@@ -1891,15 +1862,11 @@
 				vp = fp->f_vnode;
 				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)td->td_proc->
-						   p_leader,
-						   F_UNLCK,
-						   &lf,
-						   F_POSIX);
+				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
+				    &lf, F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
-				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
@@ -1944,12 +1911,11 @@
 	if (i > 0)
 		return;
 
-	fpp = fdp->fd_ofiles;
-	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
-		if (*fpp) {
+	for (i = 0; i <= fdp->fd_lastfile; i++) {
+		fp = fdp->fd_ofiles[i];
+		if (fp != NULL) {
 			FILEDESC_XLOCK(fdp);
-			fp = *fpp;
-			*fpp = NULL;
+			fdp->fd_ofiles[i] = NULL;
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
@@ -2086,6 +2052,7 @@
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
+	struct file *fp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
@@ -2093,31 +2060,20 @@
 	if (fdp == NULL)
 		return;
 
-	FILEDESC_XLOCK(fdp);
-
 	/*
 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
 	 * may block and rip them out from under us.
 	 */
+	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
-		if (fdp->fd_ofiles[i] != NULL &&
-		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
+		fp = fdp->fd_ofiles[i];
+		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
-			struct file *fp;
-
-			knote_fdclose(td, i);
-			/*
-			 * NULL-out descriptor prior to close to avoid
-			 * a race while close blocks.
-			 */
-			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
-			if (fp->f_type == DTYPE_MQUEUE)
-				mq_fdclose(td, i, fp);
-			FILEDESC_XUNLOCK(fdp);
-			(void) closef(fp, td);
+			(void) closefp(fdp, i, fp, td, 0);
+			/* closefp() drops the FILEDESC lock. */
 			FILEDESC_XLOCK(fdp);
 		}
 	}
@@ -2198,7 +2154,7 @@
 	 * node, not the capability itself.
 	 */
 	(void)cap_funwrap(fp, 0, &fp_object);
-	if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
+	if (fp_object->f_type == DTYPE_VNODE && td != NULL) {
 		int vfslocked;
 
 		vp = fp_object->f_vnode;
@@ -2209,7 +2165,7 @@
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
-					   F_UNLCK, &lf, F_POSIX);
+			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
@@ -2233,8 +2189,8 @@
 				lf.l_type = F_UNLCK;
 				vp = fp_object->f_vnode;
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)fdtol->fdl_leader,
-						   F_UNLCK, &lf, F_POSIX);
+				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
@@ -2329,8 +2285,8 @@
 	struct file *fp;
 #ifdef CAPABILITIES
 	struct file *fp_fromcap;
+#endif
 	int error;
-#endif
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
@@ -2369,7 +2325,7 @@
 		else
 			error = cap_funwrap_mmap(fp, needrights, maxprotp,
 			    &fp_fromcap);
-		if (error) {
+		if (error != 0) {
 			fdrop(fp, td);
 			return (error);
 		}
@@ -2394,14 +2350,30 @@
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
-	 *
-	 * Only one flag, or 0, may be specified.
 	 */
-	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
-	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
+	error = 0;
+	switch (flags) {
+	case FREAD:
+	case FWRITE:
+		if ((fp->f_flag & flags) == 0)
+			error = EBADF;
+		break;
+	case FEXEC:
+	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
+		    ((fp->f_flag & FWRITE) != 0))
+			error = EBADF;
+		break;
+	case 0:
+		break;
+	default:
+		KASSERT(0, ("wrong flags"));
+	}
+
+	if (error != 0) {
 		fdrop(fp, td);
-		return (EBADF);
+		return (error);
 	}
+
 	*fpp = fp;
 	return (0);
 }
@@ -2498,6 +2470,13 @@
 	return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
 }
 
+int
+fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+{
+
+	return (_fgetvp(td, fd, FEXEC, rights, NULL, vpp));
+}
+
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
@@ -2647,10 +2626,13 @@
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
-dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp)
 {
-	struct file *wfp;
 	struct file *fp;
+	int error, indx;
+
+	KASSERT(openerror == ENODEV || openerror == ENXIO,
+	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
@@ -2658,12 +2640,17 @@
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
-	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
-	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
+	error = fdalloc(td, 0, &indx);
+	if (error != 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return (error);
+	}
+
 	/*
 	 * There are two cases of interest here.
 	 *
@@ -2671,61 +2658,36 @@
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
-	 *
-	 * Any other error code is just returned.
 	 */
-	switch (error) {
+	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
-		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = wfp;
+		fdp->fd_ofiles[indx] = fp;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
-		if (fp == NULL)
-			fdused(fdp, indx);
-		fhold(wfp);
-		FILEDESC_XUNLOCK(fdp);
-		if (fp != NULL)
-			/*
-			 * We now own the reference to fp that the ofiles[]
-			 * array used to own.  Release it.
-			 */
-			fdrop(fp, td);
-		return (0);
-
+		fhold(fp);
+		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+		fdp->fd_ofiles[indx] = fp;
 		fdp->fd_ofiles[dfd] = NULL;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		fdp->fd_ofileflags[dfd] = 0;
 		fdunused(fdp, dfd);
-		if (fp == NULL)
-			fdused(fdp, indx);
-		FILEDESC_XUNLOCK(fdp);
-
-		/*
-		 * We now own the reference to fp that the ofiles[] array
-		 * used to own.  Release it.
-		 */
-		if (fp != NULL)
-			fdrop(fp, td);
-		return (0);
-
-	default:
-		FILEDESC_XUNLOCK(fdp);
-		return (error);
+		break;
 	}
-	/* NOTREACHED */
+	FILEDESC_XUNLOCK(fdp);
+	*indxp = indx;
+	return (0);
 }
 
 /*
@@ -2884,7 +2846,7 @@
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
-			xf.xf_offset = fp->f_offset;
+			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
@@ -3089,7 +3051,7 @@
 			kif->kf_flags |= KF_FLAG_DIRECT;
 		if (fp->f_flag & FHASLOCK)
 			kif->kf_flags |= KF_FLAG_HASLOCK;
-		kif->kf_offset = fp->f_offset;
+		kif->kf_offset = foffset_get(fp);
 		if (vp != NULL) {
 			vref(vp);
 			switch (vp->v_type) {
@@ -3433,7 +3395,7 @@
 		}
 		refcnt = fp->f_count;
 		fflags = fp->f_flag;
-		offset = fp->f_offset;
+		offset = foffset_get(fp);
 
 		/*
 		 * Create sysctl entry.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_event.c
--- a/head/sys/kern/kern_event.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_event.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 233505 2012-03-26 09:34:17Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 238424 2012-07-13 13:24:33Z jhb $");
 
 #include "opt_ktrace.h"
 
@@ -513,6 +513,10 @@
 	list->kl_unlock(list->kl_lockarg);
 }
 
+/*
+ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
+ * interval timer support code.
+ */
 static int
 timertoticks(intptr_t data)
 {
@@ -526,7 +530,6 @@
 	return tticks;
 }
 
-/* XXX - move to kern_timeout.c? */
 static void
 filt_timerexpire(void *knx)
 {
@@ -536,9 +539,16 @@
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 
+	/*
+	 * timertoticks() uses tvtohz() which always adds 1 to allow
+	 * for the time until the next clock interrupt being strictly
+	 * less than 1 clock tick.  We don't want that here since we
+	 * want to appear to be in sync with the clock interrupt even
+	 * when we're delayed.
+	 */
 	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
 		calloutp = (struct callout *)kn->kn_hook;
-		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
+		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
 		    filt_timerexpire, kn);
 	}
 }
@@ -546,7 +556,6 @@
 /*
  * data contains amount of time to sleep, in milliseconds
  */
-/* XXX - move to kern_timeout.c? */
 static int
 filt_timerattach(struct knote *kn)
 {
@@ -570,7 +579,6 @@
 	return (0);
 }
 
-/* XXX - move to kern_timeout.c? */
 static void
 filt_timerdetach(struct knote *kn)
 {
@@ -583,7 +591,6 @@
 	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
 }
 
-/* XXX - move to kern_timeout.c? */
 static int
 filt_timer(struct knote *kn, long hint)
 {
@@ -692,7 +699,7 @@
 	if (error)
 		goto done2;
 
-	/* An extra reference on `nfp' has been held for us by falloc(). */
+	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_exec.c
--- a/head/sys/kern/kern_exec.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_exec.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_exec.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_exec.c 238220 2012-07-08 00:51:38Z mjg $");
 
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
@@ -443,8 +443,10 @@
 		/*
 		 * Some might argue that CAP_READ and/or CAP_MMAP should also
 		 * be required here; such arguments will be entertained.
+		 *
+		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
-		error = fgetvp_read(td, args->fd, CAP_FEXECVE, &binvp);
+		error = fgetvp_exec(td, args->fd, CAP_FEXECVE, &binvp);
 		if (error)
 			goto exec_fail;
 		vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
@@ -1511,64 +1513,3 @@
 	execsw = newexecsw;
 	return (0);
 }
-
-static vm_object_t shared_page_obj;
-static int shared_page_free;
-
-int
-shared_page_fill(int size, int align, const char *data)
-{
-	vm_page_t m;
-	struct sf_buf *s;
-	vm_offset_t sk;
-	int res;
-
-	VM_OBJECT_LOCK(shared_page_obj);
-	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY);
-	res = roundup(shared_page_free, align);
-	if (res + size >= IDX_TO_OFF(shared_page_obj->size))
-		res = -1;
-	else {
-		VM_OBJECT_UNLOCK(shared_page_obj);
-		s = sf_buf_alloc(m, SFB_DEFAULT);
-		sk = sf_buf_kva(s);
-		bcopy(data, (void *)(sk + res), size);
-		shared_page_free = res + size;
-		sf_buf_free(s);
-		VM_OBJECT_LOCK(shared_page_obj);
-	}
-	vm_page_wakeup(m);
-	VM_OBJECT_UNLOCK(shared_page_obj);
-	return (res);
-}
-
-static void
-shared_page_init(void *dummy __unused)
-{
-	vm_page_t m;
-
-	shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
-	    VM_PROT_DEFAULT, 0, NULL);
-	VM_OBJECT_LOCK(shared_page_obj);
-	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
-	    VM_ALLOC_ZERO);
-	m->valid = VM_PAGE_BITS_ALL;
-	VM_OBJECT_UNLOCK(shared_page_obj);
-}
-
-SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
-    NULL);
-
-void
-exec_sysvec_init(void *param)
-{
-	struct sysentvec *sv;
-
-	sv = (struct sysentvec *)param;
-
-	if ((sv->sv_flags & SV_SHP) == 0)
-		return;
-	sv->sv_shared_page_obj = shared_page_obj;
-	sv->sv_sigcode_base = sv->sv_shared_page_base +
-	    shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_fork.c
--- a/head/sys/kern/kern_fork.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_fork.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_fork.c 232240 2012-02-27 21:10:10Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_fork.c 237276 2012-06-19 22:21:59Z pjd $");
 
 #include "opt_kdtrace.h"
 #include "opt_ktrace.h"
@@ -475,7 +475,6 @@
 
 	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
 	td2->td_sigstk = td->td_sigstk;
-	td2->td_sigmask = td->td_sigmask;
 	td2->td_flags = TDF_INMEM;
 	td2->td_lend_user_pri = PRI_MAX;
 
@@ -922,8 +921,10 @@
 		 */
 		*procp = newproc;
 #ifdef PROCDESC
-		if (flags & RFPROCDESC)
+		if (flags & RFPROCDESC) {
 			procdesc_finit(newproc->p_procdesc, fp_procdesc);
+			fdrop(fp_procdesc, td);
+		}
 #endif
 		racct_proc_fork_done(newproc);
 		return (0);
@@ -939,14 +940,16 @@
 #ifdef MAC
 	mac_proc_destroy(newproc);
 #endif
+	racct_proc_exit(newproc);
 fail1:
-	racct_proc_exit(newproc);
 	if (vm2 != NULL)
 		vmspace_free(vm2);
 	uma_zfree(proc_zone, newproc);
 #ifdef PROCDESC
-	if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
+	if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL)) {
+		fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
 		fdrop(fp_procdesc, td);
+	}
 #endif
 	pause("fork", hz / 2);
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_jail.c
--- a/head/sys/kern/kern_jail.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_jail.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 232598 2012-03-06 11:05:50Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 235803 2012-05-22 19:43:20Z trasz $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -1811,6 +1811,16 @@
 		}
 	}
 
+#ifdef RACCT
+	if (!created) {
+		sx_sunlock(&allprison_lock);
+		prison_racct_modify(pr);
+		sx_slock(&allprison_lock);
+	}
+#endif
+
+	td->td_retval[0] = pr->pr_id;
+
 	/*
 	 * Now that it is all there, drop the temporary reference from existing
 	 * prisons.  Or add a reference to newly created persistent prisons
@@ -1832,12 +1842,6 @@
 			sx_sunlock(&allprison_lock);
 	}
 
-#ifdef RACCT
-	if (!created)
-		prison_racct_modify(pr);
-#endif
-
-	td->td_retval[0] = pr->pr_id;
 	goto done_errmsg;
 
  done_deref_locked:
@@ -4491,8 +4495,11 @@
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
-	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0)
+	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
+		sx_xunlock(&allprison_lock);
+		sx_sunlock(&allproc_lock);
 		return;
+	}
 
 	oldprr = pr->pr_prison_racct;
 	pr->pr_prison_racct = NULL;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_kthread.c
--- a/head/sys/kern/kern_kthread.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_kthread.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_kthread.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_kthread.c 236117 2012-05-26 20:03:47Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -271,7 +271,6 @@
 
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
-/* XXX check if we should zero. */
 	bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
@@ -295,7 +294,6 @@
 	/* this code almost the same as create_thread() in kern_thr.c */
 	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
-	newtd->td_sigmask = oldtd->td_sigmask; /* XXX dubious */
 	thread_link(newtd, p);
 	thread_lock(oldtd);
 	/* let the scheduler know about these things. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_malloc.c
--- a/head/sys/kern/kern_malloc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_malloc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_malloc.c 232356 2012-03-01 19:58:34Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_malloc.c 238502 2012-07-15 20:29:48Z mdf $");
 
 #include "opt_ddb.h"
 #include "opt_kdtrace.h"
@@ -744,7 +744,7 @@
 		vm_kmem_size = 2 * mem_size * PAGE_SIZE;
 
 #ifdef DEBUG_MEMGUARD
-	tmp = memguard_fudge(vm_kmem_size, vm_kmem_size_max);
+	tmp = memguard_fudge(vm_kmem_size, kernel_map);
 #else
 	tmp = vm_kmem_size;
 #endif
@@ -1000,6 +1000,8 @@
 		db_printf("%18s %12ju %12juK %12ju\n",
 		    mtp->ks_shortdesc, allocs - frees,
 		    (alloced - freed + 1023) / 1024, allocs);
+		if (db_pager_quit)
+			break;
 	}
 }
 
@@ -1029,6 +1031,8 @@
 		if (mtip->mti_zone != subzone)
 			continue;
 		db_printf("%s\n", mtp->ks_shortdesc);
+		if (db_pager_quit)
+			break;
 	}
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_proc.c
--- a/head/sys/kern/kern_proc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_proc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_proc.c 233389 2012-03-23 20:05:41Z trociny $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_proc.c 238527 2012-07-16 09:38:19Z pgj $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -309,6 +309,30 @@
 	return (p);
 }
 
+static struct proc *
+pfind_tid(pid_t tid)
+{
+	struct proc *p;
+	struct thread *td;
+
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_NEW) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		FOREACH_THREAD_IN_PROC(p, td) {
+			if (td->td_tid == tid)
+				goto found;
+		}
+		PROC_UNLOCK(p);
+	}
+found:
+	sx_sunlock(&allproc_lock);
+	return (p);
+}
+
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
@@ -339,7 +363,12 @@
 	struct proc *p;
 	int error;
 
-	p = pfind(pid);
+	if (pid <= PID_MAX)
+		p = pfind(pid);
+	else if ((flags & PGET_NOTID) == 0)
+		p = pfind_tid(pid);
+	else
+		p = NULL;
 	if (p == NULL)
 		return (ESRCH);
 	if ((flags & PGET_CANSEE) != 0) {
@@ -849,6 +878,9 @@
 	kp->ki_childtime = kp->ki_childstime;
 	timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 
+	FOREACH_THREAD_IN_PROC(p, td0)
+		kp->ki_cow += td0->td_cow;
+
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
@@ -961,6 +993,7 @@
 		kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
 		kp->ki_pctcpu = sched_pctcpu(td);
 		kp->ki_estcpu = td->td_estcpu;
+		kp->ki_cow = td->td_cow;
 	}
 
 	/* We can't get this anymore but ps etc never used it anyway. */
@@ -1103,6 +1136,7 @@
 	CP(*ki, *ki32, ki_estcpu);
 	CP(*ki, *ki32, ki_slptime);
 	CP(*ki, *ki32, ki_swtime);
+	CP(*ki, *ki32, ki_cow);
 	CP(*ki, *ki32, ki_runtime);
 	TV_CP(*ki, *ki32, ki_start);
 	TV_CP(*ki, *ki32, ki_childtime);
@@ -2155,6 +2189,10 @@
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+		if (entry->eflags & MAP_ENTRY_GROWS_UP)
+			kve->kve_flags |= KVME_FLAG_GROWS_UP;
+		if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
+			kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
 
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_racct.c
--- a/head/sys/kern/kern_racct.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_racct.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,11 +26,11 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/kern/kern_racct.c 234383 2012-04-17 14:31:02Z trasz $
+ * $FreeBSD: head/sys/kern/kern_racct.c 235787 2012-05-22 15:58:27Z trasz $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 234383 2012-04-17 14:31:02Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 235787 2012-05-22 15:58:27Z trasz $");
 
 #include "opt_kdtrace.h"
 
@@ -573,6 +573,9 @@
 	PROC_UNLOCK(child);
 	PROC_UNLOCK(parent);
 
+	if (error != 0)
+		racct_proc_exit(child);
+
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_rangelock.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/kern_rangelock.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/kern_rangelock.c 236317 2012-05-30 16:06:38Z kib $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+struct rl_q_entry {
+	TAILQ_ENTRY(rl_q_entry) rl_q_link;
+	off_t		rl_q_start, rl_q_end;
+	int		rl_q_flags;
+};
+
+static uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+static struct rl_q_entry *
+rlqentry_alloc(void)
+{
+
+	return (uma_zalloc(rl_entry_zone, M_WAITOK));
+}
+
+void
+rlqentry_free(struct rl_q_entry *rleq)
+{
+
+	uma_zfree(rl_entry_zone, rleq);
+}
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+	TAILQ_INIT(&lock->rl_waiters);
+	lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+/*
+ * Verifies the supplied rl_q_entries for compatibility.  Returns true
+ * if the rangelock queue entries are not compatible, false if they are.
+ *
+ * Two entries are compatible if their ranges do not overlap, or both
+ * entries are for read.
+ */
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+	if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+	    (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+		return (0);
+	if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+		return (1);
+	return (0);
+}
+
+/*
+ * Recalculate the lock->rl_currdep after an unlock.
+ */
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+	struct rl_q_entry *entry, *entry1, *whead;
+
+	if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+	    lock->rl_currdep != NULL)
+		lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+	for (entry = lock->rl_currdep; entry != NULL;
+	     entry = TAILQ_NEXT(entry, rl_q_link)) {
+		TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+			if (rangelock_incompatible(entry, entry1))
+				goto out;
+			if (entry1 == entry)
+				break;
+		}
+	}
+out:
+	lock->rl_currdep = entry;
+	TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+		if (whead == lock->rl_currdep)
+			break;
+		if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+			whead->rl_q_flags |= RL_LOCK_GRANTED;
+			wakeup(whead);
+		}
+	}
+}
+
+static void
+rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
+    struct mtx *ilk)
+{
+
+	MPASS(lock != NULL && entry != NULL && ilk != NULL);
+	mtx_assert(ilk, MA_OWNED);
+	KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
+
+	TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+	rangelock_calc_block(lock);
+	mtx_unlock(ilk);
+	if (curthread->td_rlqe == NULL)
+		curthread->td_rlqe = entry;
+	else
+		rlqentry_free(entry);
+}
+
+void
+rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+{
+
+	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+
+	mtx_lock(ilk);
+	rangelock_unlock_locked(lock, cookie, ilk);
+}
+
+/*
+ * Unlock the sub-range of granted lock.
+ */
+void *
+rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
+    off_t end, struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+
+	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+	entry = cookie;
+	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
+	    ("Unlocking non-granted lock"));
+	KASSERT(entry->rl_q_start == start, ("wrong start"));
+	KASSERT(entry->rl_q_end >= end, ("wrong end"));
+
+	mtx_lock(ilk);
+	if (entry->rl_q_end == end) {
+		rangelock_unlock_locked(lock, cookie, ilk);
+		return (NULL);
+	}
+	entry->rl_q_end = end;
+	rangelock_calc_block(lock);
+	mtx_unlock(ilk);
+	return (cookie);
+}
+
+/*
+ * Add the lock request to the queue of the pending requests for
+ * rangelock.  Sleep until the request can be granted.
+ */
+static void *
+rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
+    struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+	struct thread *td;
+
+	MPASS(lock != NULL && ilk != NULL);
+
+	td = curthread;
+	if (td->td_rlqe != NULL) {
+		entry = td->td_rlqe;
+		td->td_rlqe = NULL;
+	} else
+		entry = rlqentry_alloc();
+	MPASS(entry != NULL);
+	entry->rl_q_flags = mode;
+	entry->rl_q_start = start;
+	entry->rl_q_end = end;
+
+	mtx_lock(ilk);
+	/*
+	 * XXXKIB TODO. Check that a thread does not try to enqueue a
+	 * lock that is incompatible with another request from the same
+	 * thread.
+	 */
+
+	TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
+	if (lock->rl_currdep == NULL)
+		lock->rl_currdep = entry;
+	rangelock_calc_block(lock);
+	while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+		msleep(entry, ilk, 0, "range", 0);
+	mtx_unlock(ilk);
+	return (entry);
+}
+
+void *
+rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk));
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_sharedpage.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/kern_sharedpage.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2010, 2012 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/kern_sharedpage.c 237477 2012-06-23 10:15:23Z kib $");
+
+#include "opt_compat.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static struct sx shared_page_alloc_sx;
+static vm_object_t shared_page_obj;
+static int shared_page_free;
+char *shared_page_mapping;
+
+void
+shared_page_write(int base, int size, const void *data)
+{
+
+	bcopy(data, shared_page_mapping + base, size);
+}
+
+static int
+shared_page_alloc_locked(int size, int align)
+{
+	int res;
+
+	res = roundup(shared_page_free, align);
+	if (res + size >= IDX_TO_OFF(shared_page_obj->size))
+		res = -1;
+	else
+		shared_page_free = res + size;
+	return (res);
+}
+
+int
+shared_page_alloc(int size, int align)
+{
+	int res;
+
+	sx_xlock(&shared_page_alloc_sx);
+	res = shared_page_alloc_locked(size, align);
+	sx_xunlock(&shared_page_alloc_sx);
+	return (res);
+}
+
+int
+shared_page_fill(int size, int align, const void *data)
+{
+	int res;
+
+	sx_xlock(&shared_page_alloc_sx);
+	res = shared_page_alloc_locked(size, align);
+	if (res != -1)
+		shared_page_write(res, size, data);
+	sx_xunlock(&shared_page_alloc_sx);
+	return (res);
+}
+
+static void
+shared_page_init(void *dummy __unused)
+{
+	vm_page_t m;
+	vm_offset_t addr;
+
+	sx_init(&shared_page_alloc_sx, "shpsx");
+	shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
+	    VM_PROT_DEFAULT, 0, NULL);
+	VM_OBJECT_LOCK(shared_page_obj);
+	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
+	    VM_ALLOC_ZERO);
+	m->valid = VM_PAGE_BITS_ALL;
+	VM_OBJECT_UNLOCK(shared_page_obj);
+	addr = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
+	pmap_qenter(addr, &m, 1);
+	shared_page_mapping = (char *)addr;
+}
+
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
+    NULL);
+
+static void
+timehands_update(struct sysentvec *sv)
+{
+	struct vdso_timehands th;
+	struct vdso_timekeep *tk;
+	uint32_t enabled, idx;
+
+	enabled = tc_fill_vdso_timehands(&th);
+	tk = (struct vdso_timekeep *)(shared_page_mapping +
+	    sv->sv_timekeep_off);
+	idx = sv->sv_timekeep_curr;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+	if (++idx >= VDSO_TH_NUM)
+		idx = 0;
+	sv->sv_timekeep_curr = idx;
+	if (++sv->sv_timekeep_gen == 0)
+		sv->sv_timekeep_gen = 1;
+	th.th_gen = 0;
+	if (enabled)
+		tk->tk_th[idx] = th;
+	tk->tk_enabled = enabled;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+	tk->tk_current = idx;
+}
+
+#ifdef COMPAT_FREEBSD32
+static void
+timehands_update32(struct sysentvec *sv)
+{
+	struct vdso_timekeep32 *tk;
+	struct vdso_timehands32 th;
+	uint32_t enabled, idx;
+
+	enabled = tc_fill_vdso_timehands32(&th);
+	tk = (struct vdso_timekeep32 *)(shared_page_mapping +
+	    sv->sv_timekeep_off);
+	idx = sv->sv_timekeep_curr;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+	if (++idx >= VDSO_TH_NUM)
+		idx = 0;
+	sv->sv_timekeep_curr = idx;
+	if (++sv->sv_timekeep_gen == 0)
+		sv->sv_timekeep_gen = 1;
+	th.th_gen = 0;
+	if (enabled)
+		tk->tk_th[idx] = th;
+	tk->tk_enabled = enabled;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+	tk->tk_current = idx;
+}
+#endif
+
+/*
+ * This is hackish, but easiest way to avoid creating list structures
+ * that needs to be iterated over from the hardclock interrupt
+ * context.
+ */
+static struct sysentvec *host_sysentvec;
+#ifdef COMPAT_FREEBSD32
+static struct sysentvec *compat32_sysentvec;
+#endif
+
+void
+timekeep_push_vdso(void)
+{
+
+	if (host_sysentvec != NULL && host_sysentvec->sv_timekeep_base != 0)
+		timehands_update(host_sysentvec);
+#ifdef COMPAT_FREEBSD32
+	if (compat32_sysentvec != NULL &&
+	    compat32_sysentvec->sv_timekeep_base != 0)
+		timehands_update32(compat32_sysentvec);
+#endif
+}
+
+void
+exec_sysvec_init(void *param)
+{
+	struct sysentvec *sv;
+	int tk_base;
+	uint32_t tk_ver;
+
+	sv = (struct sysentvec *)param;
+
+	if ((sv->sv_flags & SV_SHP) == 0)
+		return;
+	sv->sv_shared_page_obj = shared_page_obj;
+	sv->sv_sigcode_base = sv->sv_shared_page_base +
+	    shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
+	if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+		return;
+	tk_ver = VDSO_TK_VER_CURR;
+#ifdef COMPAT_FREEBSD32
+	if ((sv->sv_flags & SV_ILP32) != 0) {
+		tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) +
+		    sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16);
+		KASSERT(tk_base != -1, ("tk_base -1 for 32bit"));
+		shared_page_write(tk_base + offsetof(struct vdso_timekeep32,
+		    tk_ver), sizeof(uint32_t), &tk_ver);
+		KASSERT(compat32_sysentvec == 0,
+		    ("Native compat32 already registered"));
+		compat32_sysentvec = sv;
+	} else {
+#endif
+		tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) +
+		    sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16);
+		KASSERT(tk_base != -1, ("tk_base -1 for native"));
+		shared_page_write(tk_base + offsetof(struct vdso_timekeep,
+		    tk_ver), sizeof(uint32_t), &tk_ver);
+		KASSERT(host_sysentvec == 0, ("Native already registered"));
+		host_sysentvec = sv;
+#ifdef COMPAT_FREEBSD32
+	}
+#endif
+	sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base;
+	sv->sv_timekeep_off = tk_base;
+	timekeep_push_vdso();
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_shutdown.c
--- a/head/sys/kern/kern_shutdown.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_shutdown.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_shutdown.c 230643 2012-01-28 14:00:21Z attilio $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_shutdown.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_ddb.h"
 #include "opt_kdb.h"
@@ -66,9 +66,7 @@
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 
 #include <ddb/ddb.h>
 
@@ -151,7 +149,7 @@
 
 /* Context information for dump-debuggers. */
 static struct pcb dumppcb;		/* Registers. */
-static lwpid_t dumptid;			/* Thread ID. */
+lwpid_t dumptid;			/* Thread ID. */
 
 static void poweroff_wait(void *, int);
 static void shutdown_halt(void *junk, int howto);
@@ -334,9 +332,7 @@
 
 		waittime = 0;
 
-#ifdef SW_WATCHDOG
 		wdog_kern_pat(WD_LASTVAL);
-#endif
 		sys_sync(curthread, NULL);
 
 		/*
@@ -362,9 +358,8 @@
 			if (nbusy < pbusy)
 				iter = 0;
 			pbusy = nbusy;
-#ifdef SW_WATCHDOG
+
 			wdog_kern_pat(WD_LASTVAL);
-#endif
 			sys_sync(curthread, NULL);
 
 #ifdef PREEMPTION
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_sig.c
--- a/head/sys/kern/kern_sig.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_sig.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_sig.c 234172 2012-04-12 10:48:43Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_sig.c 238336 2012-07-10 05:45:13Z davidxu $");
 
 #include "opt_compat.h"
 #include "opt_kdtrace.h"
@@ -2436,9 +2436,10 @@
 		}
 stopme:
 		thread_suspend_switch(td);
-		if (!(p->p_flag & P_TRACED)) {
+		if (p->p_xthread == td)
+			p->p_xthread = NULL;
+		if (!(p->p_flag & P_TRACED))
 			break;
-		}
 		if (td->td_dbgflags & TDB_SUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_synch.c
--- a/head/sys/kern/kern_synch.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_synch.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,8 +35,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_synch.c 234494 2012-04-20 15:32:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_synch.c 235459 2012-05-15 01:30:25Z rstone $");
 
+#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_sched.h"
 
@@ -51,6 +52,7 @@
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
@@ -105,6 +107,20 @@
 
 static void	loadav(void *arg);
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , preempt, preempt);
+
+/*
+ * These probes reference Solaris features that are not implemented in FreeBSD.
+ * Create the probes anyway for compatibility with existing D scripts; they'll
+ * just never fire.
+ */
+SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep);
+SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup);
+SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield);
+
 void
 sleepinit(void)
 {
@@ -462,6 +478,7 @@
 		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
 		    "lockname:\"%s\"", td->td_lockname);
 #endif
+	SDT_PROBE0(sched, , , preempt);
 #ifdef XEN
 	PT_UPDATES_FLUSH();
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_tc.c
--- a/head/sys/kern/kern_tc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_tc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -14,8 +14,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 232449 2012-03-03 08:19:18Z jmallett $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 238537 2012-07-16 20:17:19Z gnn $");
 
+#include "opt_compat.h"
 #include "opt_ntp.h"
 #include "opt_ffclock.h"
 
@@ -32,6 +33,7 @@
 #include <sys/timepps.h>
 #include <sys/timetc.h>
 #include <sys/timex.h>
+#include <sys/vdso.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -120,6 +122,8 @@
 static void tc_windup(void);
 static void cpu_tick_calibrate(int);
 
+void dtrace_getnanotime(struct timespec *tsp);
+
 static int
 sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
 {
@@ -958,6 +962,24 @@
 #endif /* FFCLOCK */
 
 /*
+ * This is a clone of getnanotime and used for walltimestamps.
+ * The dtrace_ prefix prevents fbt from creating probes for
+ * it so walltimestamp can be safely used in all fbt probes.
+ */
+void
+dtrace_getnanotime(struct timespec *tsp)
+{
+	struct timehands *th;
+	u_int gen;
+
+	do {
+		th = timehands;
+		gen = th->th_generation;
+		*tsp = th->th_nanotime;
+	} while (gen == 0 || gen != th->th_generation);
+}
+
+/*
  * System clock currently providing time to the system. Modifiable via sysctl
  * when the FFCLOCK option is defined.
  */
@@ -1360,6 +1382,7 @@
 #endif
 
 	timehands = th;
+	timekeep_push_vdso();
 }
 
 /* Report or change the active timecounter hardware. */
@@ -1386,6 +1409,7 @@
 		(void)newtc->tc_get_timecount(newtc);
 
 		timecounter = newtc;
+		timekeep_push_vdso();
 		return (0);
 	}
 	return (EINVAL);
@@ -1844,3 +1868,63 @@
 }
 
 cpu_tick_f	*cpu_ticks = tc_cpu_ticks;
+
+static int vdso_th_enable = 1;
+static int
+sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
+{
+	int old_vdso_th_enable, error;
+
+	old_vdso_th_enable = vdso_th_enable;
+	error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
+	if (error != 0)
+		return (error);
+	vdso_th_enable = old_vdso_th_enable;
+	timekeep_push_vdso();
+	return (0);
+}
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
+
+uint32_t
+tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+	struct timehands *th;
+	uint32_t enabled;
+
+	th = timehands;
+	vdso_th->th_algo = VDSO_TH_ALGO_1;
+	vdso_th->th_scale = th->th_scale;
+	vdso_th->th_offset_count = th->th_offset_count;
+	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
+	vdso_th->th_offset = th->th_offset;
+	vdso_th->th_boottime = boottimebin;
+	enabled = cpu_fill_vdso_timehands(vdso_th);
+	if (!vdso_th_enable)
+		enabled = 0;
+	return (enabled);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+	struct timehands *th;
+	uint32_t enabled;
+
+	th = timehands;
+	vdso_th32->th_algo = VDSO_TH_ALGO_1;
+	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
+	vdso_th32->th_offset_count = th->th_offset_count;
+	vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
+	vdso_th32->th_offset.sec = th->th_offset.sec;
+	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
+	vdso_th32->th_boottime.sec = boottimebin.sec;
+	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac;
+	enabled = cpu_fill_vdso_timehands32(vdso_th32);
+	if (!vdso_th_enable)
+		enabled = 0;
+	return (enabled);
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_thr.c
--- a/head/sys/kern/kern_thr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_thr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_thr.c 234381 2012-04-17 13:44:40Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_thr.c 236117 2012-05-26 20:03:47Z kib $");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
@@ -252,7 +252,6 @@
 
 	PROC_LOCK(td->td_proc);
 	td->td_proc->p_flag |= P_HADTHREADS;
-	newtd->td_sigmask = td->td_sigmask;
 	thread_link(newtd, p); 
 	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
 	thread_lock(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_thread.c
--- a/head/sys/kern/kern_thread.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_thread.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,10 +27,11 @@
  */
 
 #include "opt_witness.h"
+#include "opt_kdtrace.h"
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_thread.c 229429 2012-01-03 21:03:28Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_thread.c 236317 2012-05-30 16:06:38Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -38,7 +39,9 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rangelock.h>
 #include <sys/resourcevar.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
@@ -59,6 +62,10 @@
 #include <vm/uma.h>
 #include <sys/eventhandler.h>
 
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE(proc, , , lwp_exit, lwp-exit);
+
+
 /*
  * thread related storage.
  */
@@ -199,6 +206,7 @@
 
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
+	td->td_rlqe = NULL;
 	EVENTHANDLER_INVOKE(thread_init, td);
 	td->td_sched = (struct td_sched *)&td[1];
 	umtx_thread_init(td);
@@ -216,6 +224,7 @@
 
 	td = (struct thread *)mem;
 	EVENTHANDLER_INVOKE(thread_fini, td);
+	rlqentry_free(td->td_rlqe);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_timeout.c
--- a/head/sys/kern/kern_timeout.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_timeout.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_timeout.c 227293 2011-11-07 06:44:47Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_timeout.c 234981 2012-05-03 20:00:30Z kib $");
 
 #include "opt_kdtrace.h"
 
@@ -437,6 +437,181 @@
 	}
 }
 
+static void
+callout_cc_del(struct callout *c, struct callout_cpu *cc)
+{
+
+	if (cc->cc_next == c)
+		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
+		c->c_func = NULL;
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+	}
+}
+
+static struct callout *
+softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
+    int *lockcalls, int *gcalls)
+{
+	void (*c_func)(void *);
+	void *c_arg;
+	struct lock_class *class;
+	struct lock_object *c_lock;
+	int c_flags, sharedlock;
+#ifdef SMP
+	struct callout_cpu *new_cc;
+	void (*new_func)(void *);
+	void *new_arg;
+	int new_cpu, new_ticks;
+#endif
+#ifdef DIAGNOSTIC
+	struct bintime bt1, bt2;
+	struct timespec ts2;
+	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
+	static timeout_t *lastfunc;
+#endif
+
+	cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
+	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+	c_lock = c->c_lock;
+	c_func = c->c_func;
+	c_arg = c->c_arg;
+	c_flags = c->c_flags;
+	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+		c->c_flags = CALLOUT_LOCAL_ALLOC;
+	else
+		c->c_flags &= ~CALLOUT_PENDING;
+	cc->cc_curr = c;
+	cc->cc_cancel = 0;
+	CC_UNLOCK(cc);
+	if (c_lock != NULL) {
+		class->lc_lock(c_lock, sharedlock);
+		/*
+		 * The callout may have been cancelled
+		 * while we switched locks.
+		 */
+		if (cc->cc_cancel) {
+			class->lc_unlock(c_lock);
+			goto skip;
+		}
+		/* The callout cannot be stopped now. */
+		cc->cc_cancel = 1;
+
+		if (c_lock == &Giant.lock_object) {
+			(*gcalls)++;
+			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
+			    c, c_func, c_arg);
+		} else {
+			(*lockcalls)++;
+			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
+			    c, c_func, c_arg);
+		}
+	} else {
+		(*mpcalls)++;
+		CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p",
+		    c, c_func, c_arg);
+	}
+#ifdef DIAGNOSTIC
+	binuptime(&bt1);
+#endif
+	THREAD_NO_SLEEPING();
+	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
+	c_func(c_arg);
+	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
+	THREAD_SLEEPING_OK();
+#ifdef DIAGNOSTIC
+	binuptime(&bt2);
+	bintime_sub(&bt2, &bt1);
+	if (bt2.frac > maxdt) {
+		if (lastfunc != c_func || bt2.frac > maxdt * 2) {
+			bintime2timespec(&bt2, &ts2);
+			printf(
+		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
+			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
+		}
+		maxdt = bt2.frac;
+		lastfunc = c_func;
+	}
+#endif
+	CTR1(KTR_CALLOUT, "callout %p finished", c);
+	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+		class->lc_unlock(c_lock);
+skip:
+	CC_LOCK(cc);
+	/*
+	 * If the current callout is locally allocated (from
+	 * timeout(9)) then put it on the freelist.
+	 *
+	 * Note: we need to check the cached copy of c_flags because
+	 * if it was not local, then it's not safe to deref the
+	 * callout pointer.
+	 */
+	if (c_flags & CALLOUT_LOCAL_ALLOC) {
+		KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC,
+		    ("corrupted callout"));
+		c->c_func = NULL;
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+	}
+	cc->cc_curr = NULL;
+	if (cc->cc_waiting) {
+		/*
+		 * There is someone waiting for the
+		 * callout to complete.
+		 * If the callout was scheduled for
+		 * migration just cancel it.
+		 */
+		if (cc_cme_migrating(cc))
+			cc_cme_cleanup(cc);
+		cc->cc_waiting = 0;
+		CC_UNLOCK(cc);
+		wakeup(&cc->cc_waiting);
+		CC_LOCK(cc);
+	} else if (cc_cme_migrating(cc)) {
+#ifdef SMP
+		/*
+		 * If the callout was scheduled for
+		 * migration just perform it now.
+		 */
+		new_cpu = cc->cc_migration_cpu;
+		new_ticks = cc->cc_migration_ticks;
+		new_func = cc->cc_migration_func;
+		new_arg = cc->cc_migration_arg;
+		cc_cme_cleanup(cc);
+
+		/*
+		 * Handle deferred callout stops
+		 */
+		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+			CTR3(KTR_CALLOUT,
+			     "deferred cancelled %p func %p arg %p",
+			     c, new_func, new_arg);
+			callout_cc_del(c, cc);
+			goto nextc;
+		}
+
+		c->c_flags &= ~CALLOUT_DFRMIGRATION;
+
+		/*
+		 * It should be assert here that the
+		 * callout is not destroyed but that
+		 * is not easy.
+		 */
+		new_cc = callout_cpu_switch(c, cc, new_cpu);
+		callout_cc_add(c, new_cc, new_ticks, new_func, new_arg,
+		    new_cpu);
+		CC_UNLOCK(new_cc);
+		CC_LOCK(cc);
+#else
+		panic("migration should not happen");
+#endif
+	}
+#ifdef SMP
+nextc:
+#endif
+	return (cc->cc_next);
+}
+
 /*
  * The callout mechanism is based on the work of Adam M. Costello and 
  * George Varghese, published in a technical report entitled "Redesigning
@@ -465,12 +640,6 @@
 	int mpcalls;
 	int lockcalls;
 	int gcalls;
-#ifdef DIAGNOSTIC
-	struct bintime bt1, bt2;
-	struct timespec ts2;
-	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
-	static timeout_t *lastfunc;
-#endif
 
 #ifndef MAX_SOFTCLOCK_STEPS
 #define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */
@@ -492,7 +661,7 @@
 		cc->cc_softticks++;
 		bucket = &cc->cc_callwheel[curticks & callwheelmask];
 		c = TAILQ_FIRST(bucket);
-		while (c) {
+		while (c != NULL) {
 			depth++;
 			if (c->c_time != curticks) {
 				c = TAILQ_NEXT(c, c_links.tqe);
@@ -507,160 +676,10 @@
 					steps = 0;
 				}
 			} else {
-				void (*c_func)(void *);
-				void *c_arg;
-				struct lock_class *class;
-				struct lock_object *c_lock;
-				int c_flags, sharedlock;
-
-				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
 				TAILQ_REMOVE(bucket, c, c_links.tqe);
-				class = (c->c_lock != NULL) ?
-				    LOCK_CLASS(c->c_lock) : NULL;
-				sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ?
-				    0 : 1;
-				c_lock = c->c_lock;
-				c_func = c->c_func;
-				c_arg = c->c_arg;
-				c_flags = c->c_flags;
-				if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
-					c->c_flags = CALLOUT_LOCAL_ALLOC;
-				} else {
-					c->c_flags =
-					    (c->c_flags & ~CALLOUT_PENDING);
-				}
-				cc->cc_curr = c;
-				cc->cc_cancel = 0;
-				CC_UNLOCK(cc);
-				if (c_lock != NULL) {
-					class->lc_lock(c_lock, sharedlock);
-					/*
-					 * The callout may have been cancelled
-					 * while we switched locks.
-					 */
-					if (cc->cc_cancel) {
-						class->lc_unlock(c_lock);
-						goto skip;
-					}
-					/* The callout cannot be stopped now. */
-					cc->cc_cancel = 1;
-
-					if (c_lock == &Giant.lock_object) {
-						gcalls++;
-						CTR3(KTR_CALLOUT,
-						    "callout %p func %p arg %p",
-						    c, c_func, c_arg);
-					} else {
-						lockcalls++;
-						CTR3(KTR_CALLOUT, "callout lock"
-						    " %p func %p arg %p",
-						    c, c_func, c_arg);
-					}
-				} else {
-					mpcalls++;
-					CTR3(KTR_CALLOUT,
-					    "callout mpsafe %p func %p arg %p",
-					    c, c_func, c_arg);
-				}
-#ifdef DIAGNOSTIC
-				binuptime(&bt1);
-#endif
-				THREAD_NO_SLEEPING();
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_start, c, 0, 0, 0, 0);
-				c_func(c_arg);
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_end, c, 0, 0, 0, 0);
-				THREAD_SLEEPING_OK();
-#ifdef DIAGNOSTIC
-				binuptime(&bt2);
-				bintime_sub(&bt2, &bt1);
-				if (bt2.frac > maxdt) {
-					if (lastfunc != c_func ||
-					    bt2.frac > maxdt * 2) {
-						bintime2timespec(&bt2, &ts2);
-						printf(
-			"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
-						    c_func, c_arg,
-						    (intmax_t)ts2.tv_sec,
-						    ts2.tv_nsec);
-					}
-					maxdt = bt2.frac;
-					lastfunc = c_func;
-				}
-#endif
-				CTR1(KTR_CALLOUT, "callout %p finished", c);
-				if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
-					class->lc_unlock(c_lock);
-			skip:
-				CC_LOCK(cc);
-				/*
-				 * If the current callout is locally
-				 * allocated (from timeout(9))
-				 * then put it on the freelist.
-				 *
-				 * Note: we need to check the cached
-				 * copy of c_flags because if it was not
-				 * local, then it's not safe to deref the
-				 * callout pointer.
-				 */
-				if (c_flags & CALLOUT_LOCAL_ALLOC) {
-					KASSERT(c->c_flags ==
-					    CALLOUT_LOCAL_ALLOC,
-					    ("corrupted callout"));
-					c->c_func = NULL;
-					SLIST_INSERT_HEAD(&cc->cc_callfree, c,
-					    c_links.sle);
-				}
-				cc->cc_curr = NULL;
-				if (cc->cc_waiting) {
-
-					/*
-					 * There is someone waiting for the
-					 * callout to complete.
-					 * If the callout was scheduled for
-					 * migration just cancel it.
-					 */
-					if (cc_cme_migrating(cc))
-						cc_cme_cleanup(cc);
-					cc->cc_waiting = 0;
-					CC_UNLOCK(cc);
-					wakeup(&cc->cc_waiting);
-					CC_LOCK(cc);
-				} else if (cc_cme_migrating(cc)) {
-#ifdef SMP
-					struct callout_cpu *new_cc;
-					void (*new_func)(void *);
-					void *new_arg;
-					int new_cpu, new_ticks;
-
-					/*
-					 * If the callout was scheduled for
-					 * migration just perform it now.
-					 */
-					new_cpu = cc->cc_migration_cpu;
-					new_ticks = cc->cc_migration_ticks;
-					new_func = cc->cc_migration_func;
-					new_arg = cc->cc_migration_arg;
-					cc_cme_cleanup(cc);
-
-					/*
-					 * It should be assert here that the
-					 * callout is not destroyed but that
-					 * is not easy.
-					 */
-					new_cc = callout_cpu_switch(c, cc,
-					    new_cpu);
-					callout_cc_add(c, new_cc, new_ticks,
-					    new_func, new_arg, new_cpu);
-					CC_UNLOCK(new_cc);
-					CC_LOCK(cc);
-#else
-					panic("migration should not happen");
-#endif
-				}
+				c = softclock_call_cc(c, cc, &mpcalls,
+				    &lockcalls, &gcalls);
 				steps = 0;
-				c = cc->cc_next;
 			}
 		}
 	}
@@ -814,6 +833,7 @@
 			cc->cc_migration_ticks = to_ticks;
 			cc->cc_migration_func = ftn;
 			cc->cc_migration_arg = arg;
+			c->c_flags |= CALLOUT_DFRMIGRATION;
 			CTR5(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d to %u deferred",
 			    c, c->c_func, c->c_arg, to_ticks, cpu);
@@ -984,6 +1004,12 @@
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
+		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
+			c->c_flags &= ~CALLOUT_DFRMIGRATION;
+			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
+			    c, c->c_func, c->c_arg);
+			CC_UNLOCK(cc);
+			return (1);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
@@ -996,19 +1022,12 @@
 
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
-	if (cc->cc_next == c) {
-		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-	}
+	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+	    c, c->c_func, c->c_arg);
 	TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
 	    c_links.tqe);
+	callout_cc_del(c, cc);
 
-	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
-	    c, c->c_func, c->c_arg);
-
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
-		c->c_func = NULL;
-		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
-	}
 	CC_UNLOCK(cc);
 	return (1);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sched_4bsd.c
--- a/head/sys/kern/sched_4bsd.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sched_4bsd.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 235471 2012-05-15 10:58:17Z pluknet $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
@@ -50,6 +50,7 @@
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
@@ -244,12 +245,31 @@
 	   "allow threads to share a quantum");
 #endif
 
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+    "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+    "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+    "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+    "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+    "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+    "struct proc *");
+
 static __inline void
 sched_load_add(void)
 {
 
 	sched_tdcnt++;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 
 static __inline void
@@ -258,6 +278,7 @@
 
 	sched_tdcnt--;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 /*
  * Arrange to reschedule if necessary, taking the priorities and
@@ -795,10 +816,13 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
+	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
+		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
@@ -987,6 +1011,9 @@
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
+
+		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+
                 /* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
 #ifdef KDTRACE_HOOKS
@@ -1018,11 +1045,14 @@
 		 * needed to, or the thread_wait() or wait() will
 		 * need to reap it.
 		 */
+
+		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
-	}
+	} else
+		SDT_PROBE0(sched, , , remain_cpu);
 
 #ifdef SMP
 	if (td->td_flags & TDF_IDLETD)
@@ -1223,6 +1253,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 
 
 	/*
@@ -1315,6 +1347,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
@@ -1362,6 +1396,7 @@
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
+	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
@@ -1425,6 +1460,8 @@
 void
 sched_preempt(struct thread *td)
 {
+
+	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 	thread_lock(td);
 	if (td->td_critnest > 1)
 		td->td_owepreempt = 1;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sched_ule.c
--- a/head/sys/kern/sched_ule.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sched_ule.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 234066 2012-04-09 18:24:58Z mav $");
+__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 236141 2012-05-27 10:25:20Z raj $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
@@ -53,6 +53,7 @@
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
@@ -76,7 +77,7 @@
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
-#if defined(__powerpc__) && defined(E500)
+#if defined(__powerpc__) && defined(BOOKE_E500)
 #error "This architecture is not currently compatible with ULE"
 #endif
 
@@ -327,6 +328,24 @@
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+    "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+    "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+    "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+    "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *", 
+    "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *", 
+    "struct proc *");
+
 /*
  * Print the threads waiting on a run-queue.
  */
@@ -509,6 +528,7 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload++;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
@@ -528,6 +548,7 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload--;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
@@ -1625,10 +1646,13 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
 	    "prio:%d", td->td_priority, "new prio:%d", prio,
 	    KTR_ATTR_LINKED, sched_tdname(curthread));
+	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
+		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		    curthread);
 	} 
 	ts = td->td_sched;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
@@ -1879,6 +1903,7 @@
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
+		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 		sched_pctcpu_update(newtd->td_sched, 0);
@@ -1903,12 +1928,16 @@
 		tdq = TDQ_CPU(cpuid);
 		lock_profile_obtain_lock_success(
 		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+
+		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
-	} else
+	} else {
 		thread_unblock_switch(td, mtx);
+		SDT_PROBE0(sched, , , remain_cpu);
+	}
 	/*
 	 * Assert that all went well and return.
 	 */
@@ -2102,6 +2131,8 @@
 {
 	struct tdq *tdq;
 
+	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+
 	thread_lock(td);
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
@@ -2330,6 +2361,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Recalculate the priority before we select the target cpu or
@@ -2375,6 +2408,7 @@
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority);
+	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 	tdq = TDQ_CPU(td->td_sched->ts_cpu);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_bus.c
--- a/head/sys/kern/subr_bus.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_bus.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_bus.c 234152 2012-04-11 20:57:41Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_bus.c 235978 2012-05-25 07:32:26Z avg $");
 
 #include "opt_bus.h"
 
@@ -1909,6 +1909,8 @@
 
 	PDEBUG(("%s at %s with order %u as unit %d",
 	    name, DEVICENAME(dev), order, unit));
+	KASSERT(name != NULL || unit == -1,
+	    ("child device with wildcard name and specific unit number"));
 
 	child = make_device(dev, name, unit);
 	if (child == NULL)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_devstat.c
--- a/head/sys/kern/subr_devstat.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_devstat.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_devstat.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_devstat.c 238372 2012-07-11 18:50:50Z kib $");
+
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -44,6 +46,58 @@
 
 #include <machine/atomic.h>
 
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_io_start_probe_func_t dtrace_io_start_probe;
+dtrace_io_done_probe_func_t dtrace_io_done_probe;
+dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
+dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;
+
+uint32_t	dtio_start_id;
+uint32_t	dtio_done_id;
+uint32_t	dtio_wait_start_id;
+uint32_t	dtio_wait_done_id;
+
+#define DTRACE_DEVSTAT_START() \
+	if (dtrace_io_start_probe != NULL) \
+		(*dtrace_io_start_probe)(dtio_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_START() \
+	if (dtrace_io_start_probe != NULL) \
+		(*dtrace_io_start_probe)(dtio_start_id, bp, ds);
+
+#define DTRACE_DEVSTAT_DONE() \
+	if (dtrace_io_done_probe != NULL) \
+		(*dtrace_io_done_probe)(dtio_done_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_DONE() \
+	if (dtrace_io_done_probe != NULL) \
+		(*dtrace_io_done_probe)(dtio_done_id, bp, ds);
+
+#define DTRACE_DEVSTAT_WAIT_START() \
+	if (dtrace_io_wait_start_probe != NULL) \
+		(*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_WAIT_DONE() \
+	if (dtrace_io_wait_done_probe != NULL) \
+		(*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);
+
+#else /* ! KDTRACE_HOOKS */
+
+#define DTRACE_DEVSTAT_START()
+
+#define DTRACE_DEVSTAT_BIO_START()
+
+#define DTRACE_DEVSTAT_DONE()
+
+#define DTRACE_DEVSTAT_BIO_DONE()
+
+#define DTRACE_DEVSTAT_WAIT_START()
+
+#define DTRACE_DEVSTAT_WAIT_DONE()
+#endif /* KDTRACE_HOOKS */
+
 static int devstat_num_devs;
 static long devstat_generation = 1;
 static int devstat_version = DEVSTAT_VERSION;
@@ -227,6 +281,7 @@
 	}
 	ds->start_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
+	DTRACE_DEVSTAT_START();
 }
 
 void
@@ -241,6 +296,7 @@
 
 	binuptime(&bp->bio_t0);
 	devstat_start_transaction(ds, &bp->bio_t0);
+	DTRACE_DEVSTAT_BIO_START();
 }
 
 /*
@@ -312,6 +368,7 @@
 
 	ds->end_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
+	DTRACE_DEVSTAT_DONE();
 }
 
 void
@@ -334,6 +391,7 @@
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
 				DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
+	DTRACE_DEVSTAT_BIO_DONE();
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_dummy_vdso_tc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/subr_dummy_vdso_tc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,49 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/subr_dummy_vdso_tc.c 237433 2012-06-22 07:06:40Z kib $");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/vdso.h>
+
+uint32_t
+cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+
+	return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+
+	return (0);
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_firmware.c
--- a/head/sys/kern/subr_firmware.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_firmware.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_firmware.c 234201 2012-04-13 04:22:42Z adrian $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_firmware.c 237546 2012-06-25 05:41:16Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -198,7 +198,7 @@
 		free(str, M_TEMP);
 		return NULL;
 	}
-	bzero(frp, sizeof(frp));	/* start from a clean record */
+	bzero(frp, sizeof(*frp));	/* start from a clean record */
 	frp->fw.name = str;
 	frp->fw.data = data;
 	frp->fw.datasize = datasize;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_rman.c
--- a/head/sys/kern/subr_rman.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_rman.c	Wed Jul 25 16:40:53 2012 +0300
@@ -58,7 +58,7 @@
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_rman.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_rman.c 236359 2012-05-31 17:27:05Z imp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -161,6 +161,7 @@
 rman_manage_region(struct rman *rm, u_long start, u_long end)
 {
 	struct resource_i *r, *s, *t;
+	int rv = 0;
 
 	DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
 	    rm->rm_descr, start, end));
@@ -188,13 +189,17 @@
 		TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
 	} else {
 		/* Check for any overlap with the current region. */
-		if (r->r_start <= s->r_end && r->r_end >= s->r_start)
-			return EBUSY;
+		if (r->r_start <= s->r_end && r->r_end >= s->r_start) {
+			rv = EBUSY;
+			goto out;
+		}
 
 		/* Check for any overlap with the next region. */
 		t = TAILQ_NEXT(s, r_link);
-		if (t && r->r_start <= t->r_end && r->r_end >= t->r_start)
-			return EBUSY;
+		if (t && r->r_start <= t->r_end && r->r_end >= t->r_start) {
+			rv = EBUSY;
+			goto out;
+		}
 
 		/*
 		 * See if this region can be merged with the next region.  If
@@ -225,9 +230,9 @@
 			TAILQ_INSERT_BEFORE(s, r, r_link);
 		}
 	}
-
+out:
 	mtx_unlock(rm->rm_mtx);
-	return 0;
+	return rv;
 }
 
 int
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_sleepqueue.c
--- a/head/sys/kern/subr_sleepqueue.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_sleepqueue.c	Wed Jul 25 16:40:53 2012 +0300
@@ -60,10 +60,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_sleepqueue.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_sleepqueue.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_sleepqueue_profiling.h"
 #include "opt_ddb.h"
+#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
@@ -75,6 +76,7 @@
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
@@ -166,6 +168,9 @@
 static void	sleepq_switch(void *wchan, int pri);
 static void	sleepq_timeout(void *arg);
 
+SDT_PROBE_DECLARE(sched, , , sleep);
+SDT_PROBE_DECLARE(sched, , , wakeup);
+
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
  * SYSINIT.
@@ -534,6 +539,7 @@
 	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td, pri);
 	thread_lock_set(td, &sc->sc_lock);
+	SDT_PROBE0(sched, , , sleep);
 	TD_SET_SLEEPING(td);
 	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
@@ -715,6 +721,8 @@
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 
+	SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+
 	/* Remove the thread from the queue. */
 	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_smp.c
--- a/head/sys/kern/subr_smp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_smp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_smp.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_smp.c 236906 2012-06-11 18:47:26Z iwasaki $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -55,6 +55,7 @@
 #ifdef SMP
 volatile cpuset_t stopped_cpus;
 volatile cpuset_t started_cpus;
+volatile cpuset_t suspended_cpus;
 cpuset_t hlt_cpus_mask;
 cpuset_t logical_cpus_mask;
 
@@ -207,9 +208,10 @@
 #endif
 	static volatile u_int stopping_cpu = NOCPU;
 	int i;
+	volatile cpuset_t *cpus;
 
 	KASSERT(
-#if defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
 	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
 #else
 	    type == IPI_STOP || type == IPI_STOP_HARD,
@@ -231,8 +233,15 @@
 	/* send the stop IPI to all CPUs in map */
 	ipi_selected(map, type);
 
+#if defined(__amd64__) || defined(__i386__)
+	if (type == IPI_SUSPEND)
+		cpus = &suspended_cpus;
+	else
+#endif
+		cpus = &stopped_cpus;
+
 	i = 0;
-	while (!CPU_SUBSET(&stopped_cpus, &map)) {
+	while (!CPU_SUBSET(cpus, &map)) {
 		/* spin */
 		cpu_spinwait();
 		i++;
@@ -260,7 +269,7 @@
 	return (generic_stop_cpus(map, IPI_STOP_HARD));
 }
 
-#if defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
 int
 suspend_cpus(cpuset_t map)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_syscall.c
--- a/head/sys/kern/subr_syscall.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_syscall.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,7 +42,7 @@
 #include "opt_ktrace.h"
 #include "opt_kdtrace.h"
 
-__FBSDID("$FreeBSD: head/sys/kern/subr_syscall.c 234172 2012-04-12 10:48:43Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_syscall.c 236309 2012-05-30 13:44:42Z kib $");
 
 #include <sys/capability.h>
 #include <sys/ktr.h>
@@ -182,6 +182,12 @@
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	     syscallname(p, sa->code), td->td_locks));
+	KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
+	    ("System call %s returning with pagefaults disabled",
+	     syscallname(p, sa->code)));
+	KASSERT((td->td_pflags & TDP_NOSLEEPING) == 0,
+	    ("System call %s returning with sleep disabled",
+	     syscallname(p, sa->code)));
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_trap.c
--- a/head/sys/kern/subr_trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,9 +42,8 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_trap.c 234494 2012-04-20 15:32:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_trap.c 236859 2012-06-10 20:24:01Z pjd $");
 
-#include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_kdtrace.h"
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_turnstile.c
--- a/head/sys/kern/subr_turnstile.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_turnstile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -57,9 +57,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_turnstile.c 234303 2012-04-14 23:59:58Z davide $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_turnstile.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_ddb.h"
+#include "opt_kdtrace.h"
 #include "opt_turnstile_profiling.h"
 #include "opt_sched.h"
 
@@ -73,6 +74,7 @@
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 
@@ -167,6 +169,11 @@
 static int	turnstile_init(void *mem, int size, int flags);
 static void	turnstile_fini(void *mem, int size);
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , sleep, sleep);
+SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "struct thread *", 
+    "struct proc *");
+
 /*
  * Walks the chain of turnstiles and their owners to propagate the priority
  * of the thread being blocked to all the threads holding locks that have to
@@ -740,6 +747,8 @@
 		CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
 		    td->td_tid, lock, lock->lo_name);
 
+	SDT_PROBE0(sched, , , sleep);
+
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	mi_switch(SW_VOL | SWT_TURNSTILE, NULL);
 
@@ -916,6 +925,7 @@
 	while (!TAILQ_EMPTY(&pending_threads)) {
 		td = TAILQ_FIRST(&pending_threads);
 		TAILQ_REMOVE(&pending_threads, td, td_lockq);
+		SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
 		thread_lock(td);
 		THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_witness.c
--- a/head/sys/kern/subr_witness.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_witness.c	Wed Jul 25 16:40:53 2012 +0300
@@ -85,7 +85,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_witness.c 233937 2012-04-06 06:53:58Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_witness.c 237623 2012-06-27 03:45:25Z alc $");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
@@ -564,7 +564,7 @@
 	 */
 	{ "bpf global lock", &lock_class_mtx_sleep },
 	{ "bpf interface lock", &lock_class_rw },
-	{ "bpf cdev lock", &lock_class_rw },
+	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
@@ -593,19 +593,22 @@
 	/*
 	 * CDEV
 	 */
-	{ "system map", &lock_class_mtx_sleep },
-	{ "vm page queue mutex", &lock_class_mtx_sleep },
+	{ "vm map (system)", &lock_class_mtx_sleep },
+	{ "vm page queue", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
-	 * 
 	 */
+	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_mtx_sleep },
-	{ "page lock", &lock_class_mtx_sleep },
-	{ "vm page queue mutex", &lock_class_mtx_sleep },
+	{ "vm page", &lock_class_mtx_sleep },
+	{ "vm page queue", &lock_class_mtx_sleep },
+	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
+	{ "pmap pv list", &lock_class_rw },
+	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_capability.c
--- a/head/sys/kern/sys_capability.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_capability.c	Wed Jul 25 16:40:53 2012 +0300
@@ -51,12 +51,12 @@
  * anonymous, rather than named, POSIX shared memory objects.
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/sys_capability.c 236858 2012-06-10 20:22:10Z pjd $");
+
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_capability.c 232860 2012-03-12 11:56:57Z pho $");
-
 #include <sys/param.h>
 #include <sys/capability.h>
 #include <sys/file.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_generic.c
--- a/head/sys/kern/sys_generic.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_generic.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 232494 2012-03-04 14:55:37Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 237195 2012-06-17 13:03:50Z davide $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -1255,7 +1255,7 @@
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
 	struct timeval atv, rtv, ttv;
-	int error = 0, timo;
+	int error, timo;
 	u_int nfds;
 	size_t ni;
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_procdesc.c
--- a/head/sys/kern/sys_procdesc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_procdesc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -59,7 +59,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_procdesc.c 225617 2011-09-16 13:58:51Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_procdesc.c 237277 2012-06-19 22:23:59Z pjd $");
 
 #include "opt_procdesc.h"
 
@@ -338,7 +338,7 @@
 
 /*
  * procdesc_close() - last close on a process descriptor.  If the process is
- * still running, terminate with SIGKILL (unless PD_DAEMON is set) and let
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
  * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
  */
 static int
@@ -386,7 +386,7 @@
 		 */
 		p->p_sigparent = SIGCHLD;
 		proc_reparent(p, initproc);
-		if ((pd->pd_flags & PD_DAEMON) == 0)
+		if ((pd->pd_flags & PDF_DAEMON) == 0)
 			kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_process.c
--- a/head/sys/kern/sys_process.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_process.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_process.c 232048 2012-02-23 11:50:23Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_process.c 238287 2012-07-09 09:24:46Z davidxu $");
 
 #include "opt_compat.h"
 
@@ -635,7 +635,7 @@
 	struct iovec iov;
 	struct uio uio;
 	struct proc *curp, *p, *pp;
-	struct thread *td2 = NULL;
+	struct thread *td2 = NULL, *td3;
 	struct ptrace_io_desc *piod = NULL;
 	struct ptrace_lwpinfo *pl;
 	int error, write, tmp, num;
@@ -953,10 +953,8 @@
 			td2->td_xsig = data;
 
 			if (req == PT_DETACH) {
-				struct thread *td3;
-				FOREACH_THREAD_IN_PROC(p, td3) {
+				FOREACH_THREAD_IN_PROC(p, td3)
 					td3->td_dbgflags &= ~TDB_SUSPEND; 
-				}
 			}
 			/*
 			 * unsuspend all threads, to not let a thread run,
@@ -967,6 +965,8 @@
 			p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
+			if (req == PT_ATTACH)
+				kern_psignal(p, data);
 		} else {
 			if (data)
 				kern_psignal(p, data);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/syscalls.c
--- a/head/sys/kern/syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/syscalls.c 227776 2011-11-21 01:26:10Z lstewart $
- * created from FreeBSD: head/sys/kern/syscalls.master 227691 2011-11-19 06:35:15Z ed 
+ * $FreeBSD: head/sys/kern/syscalls.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 const char *syscallnames[] = {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/syscalls.master
--- a/head/sys/kern/syscalls.master	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/syscalls.master	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
- $FreeBSD: head/sys/kern/syscalls.master 227776 2011-11-21 01:26:10Z lstewart $
+ $FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed $
 ;	from: @(#)syscalls.master	8.2 (Berkeley) 1/13/94
 ;
 ; System call name/number master file.
@@ -916,9 +916,9 @@
 512	AUE_SHMCTL	NOSTD	{ int shmctl(int shmid, int cmd, \
 				    struct shmid_ds *buf); }
 513	AUE_LPATHCONF	STD	{ int lpathconf(char *path, int name); }
-514	AUE_CAP_NEW	STD	{ int cap_new(int fd, u_int64_t rights); }
+514	AUE_CAP_NEW	STD	{ int cap_new(int fd, uint64_t rights); }
 515	AUE_CAP_GETRIGHTS	STD	{ int cap_getrights(int fd, \
-				    u_int64_t *rightsp); }
+				    uint64_t *rightsp); }
 516	AUE_CAP_ENTER	STD	{ int cap_enter(void); }
 517	AUE_CAP_GETMODE	STD	{ int cap_getmode(u_int *modep); }
 518	AUE_PDFORK	STD	{ int pdfork(int *fdp, int flags); }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/systrace_args.c
--- a/head/sys/kern/systrace_args.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/systrace_args.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,7 +2,7 @@
  * System call argument to DTrace register array converstion.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/systrace_args.c 227776 2011-11-21 01:26:10Z lstewart $
+ * $FreeBSD: head/sys/kern/systrace_args.c 236027 2012-05-25 21:52:57Z ed $
  * This file is part of the DTrace syscall provider.
  */
 
@@ -3121,7 +3121,7 @@
 	case 514: {
 		struct cap_new_args *p = params;
 		iarg[0] = p->fd; /* int */
-		uarg[1] = p->rights; /* u_int64_t */
+		uarg[1] = p->rights; /* uint64_t */
 		*n_args = 2;
 		break;
 	}
@@ -3129,7 +3129,7 @@
 	case 515: {
 		struct cap_getrights_args *p = params;
 		iarg[0] = p->fd; /* int */
-		uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */
+		uarg[1] = (intptr_t) p->rightsp; /* uint64_t * */
 		*n_args = 2;
 		break;
 	}
@@ -8434,7 +8434,7 @@
 			p = "int";
 			break;
 		case 1:
-			p = "u_int64_t";
+			p = "uint64_t";
 			break;
 		default:
 			break;
@@ -8447,7 +8447,7 @@
 			p = "int";
 			break;
 		case 1:
-			p = "u_int64_t *";
+			p = "uint64_t *";
 			break;
 		default:
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/tty.c
--- a/head/sys/kern/tty.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/tty.c	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/tty.c 232197 2012-02-26 20:56:49Z phk $");
+__FBSDID("$FreeBSD: head/sys/kern/tty.c 237219 2012-06-18 07:34:38Z pho $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -219,9 +219,15 @@
 static int
 ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	struct tty *tp = dev->si_drv1;
+	struct tty *tp;
 	int error = 0;
 
+	while ((tp = dev->si_drv1) == NULL) {
+		error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+		if (error != EWOULDBLOCK)
+			return (error);
+	}
+
 	tty_lock(tp);
 	if (tty_gone(tp)) {
 		/* Device is already gone. */
@@ -738,9 +744,14 @@
 static int
 ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	struct tty *tp = dev->si_drv1;
+	struct tty *tp;
 	int error = 0;
 
+	while ((tp = dev->si_drv1) == NULL) {
+		error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+		if (error != EWOULDBLOCK)
+			return (error);
+	}
 	tty_lock(tp);
 	if (tty_gone(tp))
 		error = ENODEV;
@@ -1203,6 +1214,7 @@
 	dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
 	    uid, gid, mode, "%s%s", prefix, name);
 	dev->si_drv1 = tp;
+	wakeup(&dev->si_drv1);
 	tp->t_dev = dev;
 
 	/* Slave call-in devices. */
@@ -1211,12 +1223,14 @@
 		    uid, gid, mode, "%s%s.init", prefix, name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 		dev->si_drv2 = &tp->t_termios_init_in;
 
 		dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
 		    uid, gid, mode, "%s%s.lock", prefix, name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 		dev->si_drv2 = &tp->t_termios_lock_in;
 	}
 
@@ -1226,6 +1240,7 @@
 		    UID_UUCP, GID_DIALER, 0660, "cua%s", name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 
 		/* Slave call-out devices. */
 		if (tp->t_flags & TF_INITLOCK) {
@@ -1234,6 +1249,7 @@
 			    UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
 			dev_depends(tp->t_dev, dev);
 			dev->si_drv1 = tp;
+			wakeup(&dev->si_drv1);
 			dev->si_drv2 = &tp->t_termios_init_out;
 
 			dev = make_dev_cred(&ttyil_cdevsw,
@@ -1241,6 +1257,7 @@
 			    UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
 			dev_depends(tp->t_dev, dev);
 			dev->si_drv1 = tp;
+			wakeup(&dev->si_drv1);
 			dev->si_drv2 = &tp->t_termios_lock_out;
 		}
 	}
@@ -1817,9 +1834,6 @@
 {
 	struct tty *tp;
 	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_cap;
-#endif
 	struct cdev *dev;
 	struct cdevsw *cdp;
 	struct filedesc *fdp;
@@ -1838,10 +1852,9 @@
 	}
 
 #ifdef CAPABILITIES
-	fp_cap = fp;
-	error = cap_funwrap(fp_cap, CAP_TTYHOOK, &fp);
+	error = cap_funwrap(fp, CAP_TTYHOOK, &fp);
 	if (error)
-		return (error);
+		goto done1;
 #endif
 
 	/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_mqueue.c
--- a/head/sys/kern/uipc_mqueue.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_mqueue.c	Wed Jul 25 16:40:53 2012 +0300
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_mqueue.c 229272 2012-01-02 12:12:10Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_mqueue.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include "opt_compat.h"
 
@@ -703,7 +703,7 @@
 {
 	struct vnode *vp = (struct vnode *)context;
 
-	vrecycle(vp, curthread);
+	vrecycle(vp);
 	vdrop(vp);
 }
 
@@ -1065,7 +1065,7 @@
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
-		vrecycle(ap->a_vp, ap->a_td);
+		vrecycle(ap->a_vp);
 	return (0);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_socket.c
--- a/head/sys/kern/uipc_socket.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_socket.c	Wed Jul 25 16:40:53 2012 +0300
@@ -101,7 +101,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 233850 2012-04-03 18:38:00Z np $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 238085 2012-07-03 19:08:02Z trociny $");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
@@ -635,7 +635,7 @@
 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 	if (so->so_options & SO_ACCEPTCONN) {
 		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
-		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
+		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
 	}
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_syscalls.c
--- a/head/sys/kern/uipc_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 233004 2012-03-15 14:13:38Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 236891 2012-06-11 16:08:03Z pjd $");
 
 #include "opt_capsicum.h"
 #include "opt_inet.h"
@@ -134,8 +134,7 @@
 	int error;
 #endif
 
-	fp = NULL;
-	if ((fdp == NULL) || ((fp = fget_unlocked(fdp, fd)) == NULL))
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		return (EBADF);
 #ifdef CAPABILITIES
 	/*
@@ -179,7 +178,6 @@
 		int	protocol;
 	} */ *uap;
 {
-	struct filedesc *fdp;
 	struct socket *so;
 	struct file *fp;
 	int fd, error;
@@ -191,7 +189,6 @@
 	if (error)
 		return (error);
 #endif
-	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd, 0);
 	if (error)
 		return (error);
@@ -199,7 +196,7 @@
 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error) {
-		fdclose(fdp, fp, fd, td);
+		fdclose(td->td_proc->p_fd, fp, fd, td);
 	} else {
 		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
 		td->td_retval[0] = fd;
@@ -1962,6 +1959,7 @@
 	 * and takes care of the overall progress.
 	 */
 	for (off = uap->offset, rem = uap->nbytes; ; ) {
+		struct mbuf *mtail = NULL;
 		int loopbytes = 0;
 		int space = 0;
 		int done = 0;
@@ -2181,10 +2179,13 @@
 			m0->m_len = xfsize;
 
 			/* Append to mbuf chain. */
-			if (m != NULL)
-				m_cat(m, m0);
+			if (mtail != NULL)
+				mtail->m_next = m0;
+			else if (m != NULL)
+				m_last(m)->m_next = m0;
 			else
 				m = m0;
+			mtail = m0;
 
 			/* Keep track of bits processed. */
 			loopbytes += xfsize;
@@ -2309,25 +2310,23 @@
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
-	struct filedesc *fdp;
 	struct file *nfp = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 
-	fdp = td->td_proc->p_fd;
 	AUDIT_ARG_FD(uap->sd);
 	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
 	if (error)
 		goto done2;
 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
-		goto done2;
+		goto done;
 	}
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error)
-		goto done2;
+		goto done;
 	/*
 	 * At this point we know we do have a assoc to pull
 	 * we proceed to get the fd setup. This may block
@@ -2374,7 +2373,7 @@
 	 * out from under us.
 	 */
 	if (error)
-		fdclose(fdp, nfp, fd, td);
+		fdclose(td->td_proc->p_fd, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_usrreq.c
--- a/head/sys/kern/uipc_usrreq.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_usrreq.c	Wed Jul 25 16:40:53 2012 +0300
@@ -57,7 +57,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 232317 2012-02-29 21:38:31Z trociny $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 237036 2012-06-13 22:12:10Z pjd $");
 
 #include "opt_ddb.h"
 
@@ -1872,7 +1872,7 @@
 			FILEDESC_SLOCK(fdescp);
 			for (i = 0; i < oldfds; i++) {
 				fd = *fdp++;
-				if ((unsigned)fd >= fdescp->fd_nfiles ||
+				if (fd < 0 || fd >= fdescp->fd_nfiles ||
 				    fdescp->fd_ofiles[fd] == NULL) {
 					FILEDESC_SUNLOCK(fdescp);
 					error = EBADF;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_bio.c
--- a/head/sys/kern/vfs_bio.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_bio.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 232351 2012-03-01 18:45:25Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 236487 2012-06-02 19:39:12Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -2640,8 +2640,8 @@
 	if (bp != NULL) {
 		int lockflags;
 		/*
-		 * Buffer is in-core.  If the buffer is not busy, it must
-		 * be on a queue.
+		 * Buffer is in-core.  If the buffer is not busy nor managed,
+		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
@@ -2671,9 +2671,13 @@
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
-		BO_LOCK(bo);
-		bremfree(bp);
-		BO_UNLOCK(bo);
+		if (bp->b_flags & B_MANAGED)
+			MPASS(bp->b_qindex == QUEUE_NONE);
+		else {
+			BO_LOCK(bo);
+			bremfree(bp);
+			BO_UNLOCK(bo);
+		}
 
 		/*
 		 * check for size inconsistancies for non-VMIO case.
@@ -3991,7 +3995,9 @@
 	}
 
 	db_printf("buf at %p\n", bp);
-	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
+	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
+	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
+	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_default.c
--- a/head/sys/kern/vfs_default.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_default.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_default.c 234386 2012-04-17 16:28:22Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_default.c 236825 2012-06-09 22:26:53Z mckusick $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -343,8 +343,8 @@
 		if (error)
 			goto out;
 
-		if ((dp->d_type != DT_WHT) &&
-		    !strcmp(dp->d_name, dirname)) {
+		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+		    strcmp(dp->d_name, dirname) == 0) {
 			found = 1;
 			goto out;
 		}
@@ -646,8 +646,17 @@
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
-			continue;
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+			if (ap->a_waitfor != MNT_WAIT)
+				continue;
+			if (BUF_LOCK(bp,
+			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
+			    BO_MTX(bo)) != 0) {
+				BO_LOCK(bo);
+				goto loop1;
+			}
+			BO_LOCK(bo);
+		}
 		BO_UNLOCK(bo);
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_subr.c
--- a/head/sys/kern/vfs_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 234483 2012-04-20 07:00:28Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
@@ -73,9 +73,7 @@
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 
 #include <machine/stdarg.h>
 
@@ -1027,6 +1025,7 @@
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
+	rangelock_init(&vp->v_rl);
 
 	*vpp = vp;
 	return (0);
@@ -1327,8 +1326,7 @@
  * sync activity.
  */
 int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
-    off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
@@ -1869,10 +1867,10 @@
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
-#ifdef SW_WATCHDOG
+
 			if (first_printf == 0)
 				wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		}
 		if (!LIST_EMPTY(gslp)) {
 			mtx_unlock(&sync_mtx);
@@ -2469,6 +2467,7 @@
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
+	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	mtx_destroy(BO_MTX(bo));
@@ -2660,7 +2659,7 @@
  * Recycle an unused vnode to the front of the free list.
  */
 int
-vrecycle(struct vnode *vp, struct thread *td)
+vrecycle(struct vnode *vp)
 {
 	int recycled;
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_syscalls.c
--- a/head/sys/kern/vfs_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_syscalls.c 234489 2012-04-20 10:08:30Z jh $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_syscalls.c 238029 2012-07-02 21:01:03Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -1093,8 +1093,7 @@
 	struct file *fp;
 	struct vnode *vp;
 	int cmode;
-	struct file *nfp;
-	int type, indx = -1, error, error_open;
+	int type, indx = -1, error;
 	struct flock lf;
 	struct nameidata nd;
 	int vfslocked;
@@ -1111,19 +1110,22 @@
 	if (flags & O_EXEC) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
-	} else if ((flags & O_ACCMODE) == O_ACCMODE)
+	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
-	else
+	} else {
 		flags = FFLAGS(flags);
+	}
 
 	/*
-	 * allocate the file descriptor, but don't install a descriptor yet
+	 * Allocate the file descriptor, but don't install a descriptor yet.
 	 */
-	error = falloc_noinstall(td, &nfp);
+	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);
-	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
-	fp = nfp;
+	/*
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
+	 */
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
@@ -1141,36 +1143,24 @@
 			goto success;
 
 		/*
-		 * handle special fdopen() case.  bleh.  dupfdopen() is
-		 * responsible for dropping the old contents of ofiles[indx]
-		 * if it succeeds.
+		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
-		if ((nd.ni_strictrelative == 0) &&
+		if (nd.ni_strictrelative == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
-		    (td->td_dupfd >= 0)) {
-			/* XXX from fdopen */
-			error_open = error;
-			if ((error = finstall(td, fp, &indx, flags)) != 0)
-				goto bad_unlocked;
-			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
-			    flags, error_open)) == 0)
+		    td->td_dupfd >= 0) {
+			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+			    &indx);
+			if (error == 0)
 				goto success;
 		}
-		/*
-		 * Clean up the descriptor, but only if another thread hadn't
-		 * replaced or closed it.
-		 */
-		if (indx != -1)
-			fdclose(fdp, fp, indx, td);
-		fdrop(fp, td);
 
 		if (error == ERESTART)
 			error = EINTR;
-		return (error);
+		goto bad_unlocked;
 	}
 	td->td_dupfd = 0;
 	vfslocked = NDHASGIANT(&nd);
@@ -1206,7 +1196,7 @@
 		if ((flags & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0)
+		    type)) != 0)
 			goto bad;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
@@ -1247,10 +1237,8 @@
 bad:
 	VFS_UNLOCK_GIANT(vfslocked);
 bad_unlocked:
-	if (indx != -1)
-		fdclose(fdp, fp, indx, td);
+	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	fdrop(fp, td);
-	td->td_retval[0] = -1;
 	return (error);
 }
 
@@ -1993,7 +1981,7 @@
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
-	off_t offset, size;
+	off_t foffset, offset, size;
 	int error, noneg;
 	int vfslocked;
 
@@ -2005,18 +1993,19 @@
 		return (ESPIPE);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	noneg = (vp->v_type != VCHR);
 	offset = uap->offset;
 	switch (uap->whence) {
 	case L_INCR:
 		if (noneg &&
-		    (fp->f_offset < 0 ||
-		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
+		    (foffset < 0 ||
+		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
-		offset += fp->f_offset;
+		offset += foffset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
@@ -2056,12 +2045,12 @@
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
-	fp->f_offset = offset;
 	VFS_KNOTE_UNLOCKED(vp, 0);
-	*(off_t *)(td->td_retval) = fp->f_offset;
+	*(off_t *)(td->td_retval) = offset;
 drop:
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
@@ -3994,6 +3983,7 @@
 	caddr_t dirbuf;
 	int error, eofflag, readcnt, vfslocked;
 	long loff;
+	off_t foffset;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
@@ -4006,10 +3996,12 @@
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
@@ -4022,12 +4014,13 @@
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
-	loff = auio.uio_offset = fp->f_offset;
+	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, FOF_NOUPDATE);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -4036,7 +4029,7 @@
 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 			    NULL, NULL);
-			fp->f_offset = auio.uio_offset;
+			foffset = auio.uio_offset;
 		} else
 #	endif
 	{
@@ -4048,7 +4041,7 @@
 		kiov.iov_base = dirbuf;
 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
 			    NULL, NULL);
-		fp->f_offset = kuio.uio_offset;
+		foffset = kuio.uio_offset;
 		if (error == 0) {
 			readcnt = uap->count - kuio.uio_resid;
 			edp = (struct dirent *)&dirbuf[readcnt];
@@ -4086,6 +4079,7 @@
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -4097,13 +4091,14 @@
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
-		fp->f_offset = 0;
+		foffset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
+	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
 	if (error == 0)
@@ -4136,7 +4131,8 @@
 	long base;
 	int error;
 
-	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
+	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+	    NULL, UIO_USERSPACE);
 	if (error)
 		return (error);
 	if (uap->basep != NULL)
@@ -4146,7 +4142,7 @@
 
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
-    long *basep)
+    long *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
@@ -4155,6 +4151,7 @@
 	int vfslocked;
 	long loff;
 	int error, eofflag;
+	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
@@ -4168,6 +4165,7 @@
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
@@ -4180,18 +4178,18 @@
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
-	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
-	loff = auio.uio_offset = fp->f_offset;
+	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
-	fp->f_offset = auio.uio_offset;
+	foffset = auio.uio_offset;
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
@@ -4205,7 +4203,7 @@
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
-		fp->f_offset = 0;
+		foffset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
@@ -4213,8 +4211,11 @@
 	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	*basep = loff;
+	if (residp != NULL)
+		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
+	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
 }
@@ -4334,12 +4335,10 @@
 	struct file *fp;
 #ifdef CAPABILITIES
 	struct file *fp_fromcap;
+	int error;
 #endif
-	int error;
-
-	error = 0;
-	fp = NULL;
-	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
+
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		return (EBADF);
 #ifdef CAPABILITIES
 	/*
@@ -4481,24 +4480,19 @@
 		int flags;
 	} */ *uap;
 {
-	struct proc *p = td->td_proc;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
-	struct vattr vat;
-	struct vattr *vap = &vat;
 	struct flock lf;
 	struct file *fp;
-	register struct filedesc *fdp = p->p_fd;
 	int fmode, error, type;
-	accmode_t accmode;
-	struct file *nfp;
 	int vfslocked;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error)
 		return (error);
+	indx = -1;
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
@@ -4514,109 +4508,42 @@
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
-	if (error)
-		goto out;
+	if (error) {
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (error);
+	}
+
+	error = falloc_noinstall(td, &fp);
+	if (error) {
+		vput(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (error);
+	}
 	/*
-	 * from now on we have to make sure not
-	 * to forget about the vnode
-	 * any error that causes an abort must vput(vp)
-	 * just set error = err and 'goto bad;'.
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
 	 */
 
-	/*
-	 * from vn_open
-	 */
-	if (vp->v_type == VLNK) {
-		error = EMLINK;
+#ifdef INVARIANTS
+	td->td_dupfd = -1;
+#endif
+	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+	if (error) {
+		KASSERT(fp->f_ops == &badfileops,
+		    ("VOP_OPEN in fhopen() set f_ops"));
+		KASSERT(td->td_dupfd < 0,
+		    ("fhopen() encountered fdopen()"));
+
+		vput(vp);
 		goto bad;
 	}
-	if (vp->v_type == VSOCK) {
-		error = EOPNOTSUPP;
-		goto bad;
-	}
-	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
-		error = ENOTDIR;
-		goto bad;
-	}
-	accmode = 0;
-	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR) {
-			error = EISDIR;
-			goto bad;
-		}
-		error = vn_writechk(vp);
-		if (error)
-			goto bad;
-		accmode |= VWRITE;
-	}
-	if (fmode & FREAD)
-		accmode |= VREAD;
-	if ((fmode & O_APPEND) && (fmode & FWRITE))
-		accmode |= VAPPEND;
-#ifdef MAC
-	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
-	if (error)
-		goto bad;
+#ifdef INVARIANTS
+	td->td_dupfd = 0;
 #endif
-	if (accmode) {
-		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
-		if (error)
-			goto bad;
-	}
-	if (fmode & O_TRUNC) {
-		vfs_ref(mp);
-		VOP_UNLOCK(vp, 0);				/* XXX */
-		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
-			vrele(vp);
-			vfs_rel(mp);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
-		vfs_rel(mp);
-#ifdef MAC
-		/*
-		 * We don't yet have fp->f_cred, so use td->td_ucred, which
-		 * should be right.
-		 */
-		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
-		if (error == 0) {
-#endif
-			VATTR_NULL(vap);
-			vap->va_size = 0;
-			error = VOP_SETATTR(vp, vap, td->td_ucred);
-#ifdef MAC
-		}
-#endif
-		vn_finished_write(mp);
-		if (error)
-			goto bad;
-	}
-	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
-	if (error)
-		goto bad;
-
-	if (fmode & FWRITE) {
-		vp->v_writecount++;
-		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
-		    __func__, vp, vp->v_writecount);
-	}
-
-	/*
-	 * end of vn_open code
-	 */
-
-	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
-		if (fmode & FWRITE) {
-			vp->v_writecount--;
-			CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
-			    __func__, vp, vp->v_writecount);
-		}
-		goto bad;
-	}
-	/* An extra reference on `nfp' has been held for us by falloc(). */
-	fp = nfp;
-	nfp->f_vnode = vp;
-	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
+	fp->f_vnode = vp;
+	fp->f_seqcount = 1;
+	finit(fp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
+	VOP_UNLOCK(vp, 0);
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
@@ -4628,36 +4555,22 @@
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
-		VOP_UNLOCK(vp, 0);
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0) {
-			/*
-			 * The lock request failed.  Normally close the
-			 * descriptor but handle the case where someone might
-			 * have dup()d or close()d it when we weren't looking.
-			 */
-			fdclose(fdp, fp, indx, td);
-
-			/*
-			 * release our private reference
-			 */
-			fdrop(fp, td);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		    type)) != 0)
+			goto bad;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
-
-	VOP_UNLOCK(vp, 0);
+	if (fmode & O_TRUNC) {
+		error = fo_truncate(fp, 0, td->td_ucred, td);
+		if (error)
+			goto bad;
+	}
+
+	error = finstall(td, fp, &indx, fmode);
+bad:
+	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
-	return (0);
-
-bad:
-	vput(vp);
-out:
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -4679,7 +4592,22 @@
 	} */ *uap;
 {
 	struct stat sb;
-	fhandle_t fh;
+	struct fhandle fh;
+	int error;
+
+	error = copyin(uap->u_fhp, &fh, sizeof(fh));
+	if (error != 0)
+		return (error);
+	error = kern_fhstat(td, fh, &sb);
+	if (error != 0)
+		return (error);
+	error = copyout(&sb, uap->sb, sizeof(sb));
+	return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
@@ -4688,9 +4616,6 @@
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error)
 		return (error);
-	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
-	if (error)
-		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
@@ -4700,12 +4625,9 @@
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
-	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
+	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
-		return (error);
-	error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
@@ -4960,6 +4882,8 @@
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
+			new->fa_prevstart = 0;
+			new->fa_prevend = 0;
 			fp->f_advice = new;
 			new = fa;
 		}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_vnops.c
--- a/head/sys/kern/vfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_vnops.c 232701 2012-03-08 20:27:20Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_vnops.c 238029 2012-07-02 21:01:03Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -56,6 +56,7 @@
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
@@ -65,10 +66,15 @@
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
 #include <vm/vm_object.h>
+#include <vm/vm_page.h>
 
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
+static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
@@ -77,8 +83,8 @@
 static fo_close_t	vn_closefile;
 
 struct 	fileops vnops = {
-	.fo_read = vn_read,
-	.fo_write = vn_write,
+	.fo_read = vn_io_fault,
+	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
@@ -102,7 +108,8 @@
 }
 
 /*
- * Common code for vnode open operations.
+ * Common code for vnode open operations via a name lookup.
+ * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  * 
  * Note that this does NOT free nameidata for the successful case,
@@ -118,7 +125,6 @@
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
-	accmode_t accmode;
 	int vfslocked, mpsafe;
 
 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
@@ -199,24 +205,44 @@
 		vfslocked = NDHASGIANT(ndp);
 		vp = ndp->ni_vp;
 	}
-	if (vp->v_type == VLNK) {
-		error = EMLINK;
+	error = vn_open_vnode(vp, fmode, cred, td, fp);
+	if (error)
 		goto bad;
-	}
-	if (vp->v_type == VSOCK) {
-		error = EOPNOTSUPP;
-		goto bad;
-	}
-	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
-		error = ENOTDIR;
-		goto bad;
-	}
+	*flagp = fmode;
+	if (!mpsafe)
+		VFS_UNLOCK_GIANT(vfslocked);
+	return (0);
+bad:
+	NDFREE(ndp, NDF_ONLY_PNBUF);
+	vput(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	*flagp = fmode;
+	ndp->ni_vp = NULL;
+	return (error);
+}
+
+/*
+ * Common code for vnode open operations once a vnode is located.
+ * Check permissions, and call the VOP_OPEN routine.
+ */
+int
+vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
+    struct thread *td, struct file *fp)
+{
+	accmode_t accmode;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	if (vp->v_type == VLNK)
+		return (EMLINK);
+	if (vp->v_type == VSOCK)
+		return (EOPNOTSUPP);
+	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
+		return (ENOTDIR);
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR) {
-			error = EISDIR;
-			goto bad;
-		}
+		if (vp->v_type == VDIR)
+			return (EISDIR);
 		accmode |= VWRITE;
 	}
 	if (fmode & FREAD)
@@ -228,40 +254,30 @@
 #ifdef MAC
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error)
-		goto bad;
+		return (error);
 #endif
 	if ((fmode & O_CREAT) == 0) {
 		if (accmode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
-				goto bad;
+				return (error);
 		}
 		if (accmode) {
 		        error = VOP_ACCESS(vp, accmode, cred, td);
 			if (error)
-				goto bad;
+				return (error);
 		}
 	}
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
-		goto bad;
+		return (error);
 
 	if (fmode & FWRITE) {
 		vp->v_writecount++;
 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
-	*flagp = fmode;
-	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
-	if (!mpsafe)
-		VFS_UNLOCK_GIANT(vfslocked);
+	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (0);
-bad:
-	NDFREE(ndp, NDF_ONLY_PNBUF);
-	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	*flagp = fmode;
-	ndp->ni_vp = NULL;
-	return (error);
 }
 
 /*
@@ -367,47 +383,19 @@
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
-vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
-    aresid, td)
-	enum uio_rw rw;
-	struct vnode *vp;
-	void *base;
-	int len;
-	off_t offset;
-	enum uio_seg segflg;
-	int ioflg;
-	struct ucred *active_cred;
-	struct ucred *file_cred;
-	ssize_t *aresid;
-	struct thread *td;
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
+	void *rl_cookie;
 	int error, lock_flags;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
-	if ((ioflg & IO_NODELOCKED) == 0) {
-		mp = NULL;
-		if (rw == UIO_WRITE) { 
-			if (vp->v_type != VCHR &&
-			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
-			    != 0)
-				return (error);
-			if (MNT_SHARED_WRITES(mp) ||
-			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
-				lock_flags = LK_SHARED;
-			} else {
-				lock_flags = LK_EXCLUSIVE;
-			}
-			vn_lock(vp, lock_flags | LK_RETRY);
-		} else
-			vn_lock(vp, LK_SHARED | LK_RETRY);
-
-	}
-	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
@@ -418,6 +406,33 @@
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
+
+	if ((ioflg & IO_NODELOCKED) == 0) {
+		if (rw == UIO_READ) {
+			rl_cookie = vn_rangelock_rlock(vp, offset,
+			    offset + len);
+		} else {
+			rl_cookie = vn_rangelock_wlock(vp, offset,
+			    offset + len);
+		}
+		mp = NULL;
+		if (rw == UIO_WRITE) { 
+			if (vp->v_type != VCHR &&
+			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+			    != 0)
+				goto out;
+			if (MNT_SHARED_WRITES(mp) ||
+			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
+				lock_flags = LK_SHARED;
+			else
+				lock_flags = LK_EXCLUSIVE;
+		} else
+			lock_flags = LK_SHARED;
+		vn_lock(vp, lock_flags | LK_RETRY);
+	} else
+		rl_cookie = NULL;
+
+	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
@@ -429,7 +444,7 @@
 	}
 #endif
 	if (error == 0) {
-		if (file_cred)
+		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
@@ -444,10 +459,13 @@
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
-		if (rw == UIO_WRITE && vp->v_type != VCHR)
+		VOP_UNLOCK(vp, 0);
+		if (mp != NULL)
 			vn_finished_write(mp);
-		VOP_UNLOCK(vp, 0);
 	}
+ out:
+	if (rl_cookie != NULL)
+		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
@@ -509,6 +527,110 @@
 	return (error);
 }
 
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+	struct mtx *mtxp;
+	off_t res;
+
+	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+	/*
+	 * Caller only wants the current f_offset value.  Assume that
+	 * the long and shorter integer types reads are atomic.
+	 */
+	if ((flags & FOF_NOLOCK) != 0)
+		return (fp->f_offset);
+#endif
+
+	/*
+	 * According to McKusick the vn lock was protecting f_offset here.
+	 * It is now protected by the FOFFSET_LOCKED flag.
+	 */
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if ((flags & FOF_NOLOCK) == 0) {
+		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+			    "vofflock", 0);
+		}
+		fp->f_vnread_flags |= FOFFSET_LOCKED;
+	}
+	res = fp->f_offset;
+	mtx_unlock(mtxp);
+	return (res);
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
+	struct mtx *mtxp;
+
+	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+	if ((flags & FOF_NOLOCK) != 0) {
+		if ((flags & FOF_NOUPDATE) == 0)
+			fp->f_offset = val;
+		if ((flags & FOF_NEXTOFF) != 0)
+			fp->f_nextoff = val;
+		return;
+	}
+#endif
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if ((flags & FOF_NOUPDATE) == 0)
+		fp->f_offset = val;
+	if ((flags & FOF_NEXTOFF) != 0)
+		fp->f_nextoff = val;
+	if ((flags & FOF_NOLOCK) == 0) {
+		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
+		    ("Lost FOFFSET_LOCKED"));
+		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+			wakeup(&fp->f_vnread_flags);
+		fp->f_vnread_flags = 0;
+	}
+	mtx_unlock(mtxp);
+}
+
+void
+foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+	if ((flags & FOF_OFFSET) == 0)
+		uio->uio_offset = foffset_lock(fp, flags);
+}
+
+void
+foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+	if ((flags & FOF_OFFSET) == 0)
+		foffset_unlock(fp, uio->uio_offset, flags);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+	struct mtx *mtxp;
+	int ret;
+
+	ret = POSIX_FADV_NORMAL;
+	if (fp->f_advice == NULL)
+		return (ret);
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if (uio->uio_offset >= fp->f_advice->fa_start &&
+	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+		ret = fp->f_advice->fa_advice;
+	mtx_unlock(mtxp);
+	return (ret);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -521,44 +643,22 @@
 	struct thread *td;
 {
 	struct vnode *vp;
+	struct mtx *mtxp;
 	int error, ioflag;
-	struct mtx *mtxp;
 	int advice, vfslocked;
-	off_t offset;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
-	mtxp = NULL;
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	advice = POSIX_FADV_NORMAL;
+	advice = get_advice(fp, uio);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	/*
-	 * According to McKusick the vn lock was protecting f_offset here.
-	 * It is now protected by the FOFFSET_LOCKED flag.
-	 */
-	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if ((flags & FOF_OFFSET) == 0) {
-			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
-				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-				    "vnread offlock", 0);
-			}
-			fp->f_vnread_flags |= FOFFSET_LOCKED;
-			uio->uio_offset = fp->f_offset;
-		}
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
@@ -578,20 +678,42 @@
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0) {
-		fp->f_offset = uio->uio_offset;
-		mtx_lock(mtxp);
-		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-			wakeup(&fp->f_vnread_flags);
-		fp->f_vnread_flags = 0;
-		mtx_unlock(mtxp);
-	}
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset)
-		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
-		    POSIX_FADV_DONTNEED);
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
+		 * case of using POSIX_FADV_NOREUSE with sequential
+		 * access, track the previous implicit DONTNEED
+		 * request and grow this request to include the
+		 * current read(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continously grow to
+		 * cover all of the previously read regions of the
+		 * file.  This allows filesystem blocks that are
+		 * accessed by multiple calls to read(2) to be flushed
+		 * once the last read(2) finishes.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -609,12 +731,14 @@
 {
 	struct vnode *vp;
 	struct mount *mp;
+	struct mtx *mtxp;
 	int error, ioflag, lock_flags;
-	struct mtx *mtxp;
 	int advice, vfslocked;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type == VREG)
@@ -633,6 +757,8 @@
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
+
+	advice = get_advice(fp, uio);
  
 	if ((MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
@@ -643,74 +769,360 @@
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
-	advice = POSIX_FADV_NORMAL;
-	if (fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
-	case POSIX_FADV_NOREUSE:
-		/*
-		 * Request the underlying FS to discard the buffers
-		 * and pages after the I/O is complete.
-		 */
-		ioflag |= IO_DIRECT;
-		break;
 	}
+	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
+	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE write(2).  To optimize the
+		 * common case of using POSIX_FADV_NOREUSE with
+		 * sequential access, track the previous implicit
+		 * DONTNEED request and grow this request to include
+		 * the current write(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously written regions of the
+		 * file.
+		 *
+		 * Note that the blocks just written are almost
+		 * certainly still dirty, so this only works when
+		 * VOP_ADVISE() calls from subsequent writes push out
+		 * the data written by this write(2) once the backing
+		 * buffers are clean.  However, as compared to forcing
+		 * IO_DIRECT, this gives much saner behavior.  Write
+		 * clustering is still allowed, and clean pages are
+		 * merely moved to the cache page queue rather than
+		 * outright thrown away.  This means a subsequent
+		 * read(2) can still avoid hitting the disk if the
+		 * pages have not been reclaimed.
+		 *
+		 * This does make POSIX_FADV_NOREUSE largely useless
+		 * with non-sequential access.  However, sequential
+		 * access is the more common use case and the flag is
+		 * merely advisory.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
+	
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static unsigned long vn_io_faults_cnt;
+SYSCTL_LONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that the thread A reads from the vnode vp1 into userspace
+ * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
+ * currently not resident, then system ends up with the call chain
+ *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ() or VOP_WRITE().
+ * Instead, it first tries to do the whole range i/o with pagefaults
+ * disabled. If all pages in the i/o buffer are resident and mapped,
+ * VOP will succeed (ignoring the genuine filesystem errors).
+ * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
+ * i/o in chunks, with all pages in the chunk prefaulted and held
+ * using vm_fault_quick_hold_pages().
+ *
+ * Filesystems using this deadlock avoidance scheme should use the
+ * array of the held pages from uio, saved in the curthread->td_ma,
+ * instead of doing uiomove().  A helper function
+ * vn_io_fault_uiomove() converts uiomove request into
+ * uiomove_fromphys() over td_ma array.
+ *
+ * Since vnode locks do not cover the whole i/o anymore, rangelocks
+ * make the current i/o request atomic with respect to other i/os and
+ * truncations.
+ */
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	vm_page_t ma[io_hold_cnt + 2];
+	struct uio *uio_clone, short_uio;
+	struct iovec short_iovec[1];
+	fo_rdwr_t *doio;
+	struct vnode *vp;
+	void *rl_cookie;
+	struct mount *mp;
+	vm_page_t *prev_td_ma;
+	int cnt, error, save, saveheld, prev_td_ma_cnt;
+	vm_offset_t addr, end;
+	vm_prot_t prot;
+	size_t len, resid;
+	ssize_t adv;
+
+	if (uio->uio_rw == UIO_READ)
+		doio = vn_read;
+	else
+		doio = vn_write;
+	vp = fp->f_vnode;
+	foffset_lock_uio(fp, uio, flags);
+
+	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
+	    ((mp = vp->v_mount) != NULL &&
+	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
+	    !vn_io_fault_enable) {
+		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+		goto out_last;
+	}
+
+	/*
+	 * The UFS follows IO_UNIT directive and replays back both
+	 * uio_offset and uio_resid if an error is encountered during the
+	 * operation.  But, since the iovec may be already advanced,
+	 * uio is still in an inconsistent state.
+	 *
+	 * Cache a copy of the original uio, which is advanced to the redo
+	 * point using UIO_NOCOPY below.
+	 */
+	uio_clone = cloneuio(uio);
+	resid = uio->uio_resid;
+
+	short_uio.uio_segflg = UIO_USERSPACE;
+	short_uio.uio_rw = uio->uio_rw;
+	short_uio.uio_td = uio->uio_td;
+
+	if (uio->uio_rw == UIO_READ) {
+		prot = VM_PROT_WRITE;
+		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+		    uio->uio_offset + uio->uio_resid);
+	} else {
+		prot = VM_PROT_READ;
+		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
+			/* For appenders, punt and lock the whole range. */
+			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+		else
+			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+			    uio->uio_offset + uio->uio_resid);
+	}
+
+	save = vm_fault_disable_pagefaults();
+	error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+	if (error != EFAULT)
+		goto out;
+
+	atomic_add_long(&vn_io_faults_cnt, 1);
+	uio_clone->uio_segflg = UIO_NOCOPY;
+	uiomove(NULL, resid - uio->uio_resid, uio_clone);
+	uio_clone->uio_segflg = uio->uio_segflg;
+
+	saveheld = curthread_pflags_set(TDP_UIOHELD);
+	prev_td_ma = td->td_ma;
+	prev_td_ma_cnt = td->td_ma_cnt;
+
+	while (uio_clone->uio_resid != 0) {
+		len = uio_clone->uio_iov->iov_len;
+		if (len == 0) {
+			KASSERT(uio_clone->uio_iovcnt >= 1,
+			    ("iovcnt underflow"));
+			uio_clone->uio_iov++;
+			uio_clone->uio_iovcnt--;
+			continue;
+		}
+
+		addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
+		end = round_page(addr + len);
+		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+		/*
+		 * A perfectly misaligned address and length could cause
+		 * both the start and the end of the chunk to use partial
+		 * page.  +2 accounts for such a situation.
+		 */
+		if (cnt > io_hold_cnt + 2) {
+			len = io_hold_cnt * PAGE_SIZE;
+			KASSERT(howmany(round_page(addr + len) -
+			    trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
+			    ("cnt overflow"));
+		}
+		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+		    addr, len, prot, ma, io_hold_cnt + 2);
+		if (cnt == -1) {
+			error = EFAULT;
+			break;
+		}
+		short_uio.uio_iov = &short_iovec[0];
+		short_iovec[0].iov_base = (void *)addr;
+		short_uio.uio_iovcnt = 1;
+		short_uio.uio_resid = short_iovec[0].iov_len = len;
+		short_uio.uio_offset = uio_clone->uio_offset;
+		td->td_ma = ma;
+		td->td_ma_cnt = cnt;
+
+		error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+		    td);
+		vm_page_unhold_pages(ma, cnt);
+		adv = len - short_uio.uio_resid;
+
+		uio_clone->uio_iov->iov_base =
+		    (char *)uio_clone->uio_iov->iov_base + adv;
+		uio_clone->uio_iov->iov_len -= adv;
+		uio_clone->uio_resid -= adv;
+		uio_clone->uio_offset += adv;
+
+		uio->uio_resid -= adv;
+		uio->uio_offset += adv;
+
+		if (error != 0 || adv == 0)
+			break;
+	}
+	td->td_ma = prev_td_ma;
+	td->td_ma_cnt = prev_td_ma_cnt;
+	curthread_pflags_restore(saveheld);
+out:
+	vm_fault_enable_pagefaults(save);
+	vn_rangelock_unlock(vp, rl_cookie);
+	free(uio_clone, M_IOV);
+out_last:
+	foffset_unlock_uio(fp, uio, flags);
+	return (error);
+}
+
+/*
+ * Helper function to perform the requested uiomove operation using
+ * the held pages for io->uio_iov[0].iov_base buffer instead of
+ * copyin/copyout.  Access to the pages with uiomove_fromphys()
+ * instead of iov_base prevents page faults that could occur due to
+ * pmap_collect() invalidating the mapping created by
+ * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
+ * object cleanup revoking the write access from page mappings.
+ *
+ * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
+ * instead of plain uiomove().
+ */
+int
+vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
+{
+	struct uio transp_uio;
+	struct iovec transp_iov[1];
+	struct thread *td;
+	size_t adv;
+	int error, pgadv;
+
+	td = curthread;
+	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+	    uio->uio_segflg != UIO_USERSPACE)
+		return (uiomove(data, xfersize, uio));
+
+	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+	transp_iov[0].iov_base = data;
+	transp_uio.uio_iov = &transp_iov[0];
+	transp_uio.uio_iovcnt = 1;
+	if (xfersize > uio->uio_resid)
+		xfersize = uio->uio_resid;
+	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
+	transp_uio.uio_offset = 0;
+	transp_uio.uio_segflg = UIO_SYSSPACE;
+	/*
+	 * Since transp_iov points to data, and td_ma page array
+	 * corresponds to original uio->uio_iov, we need to invert the
+	 * direction of the i/o operation as passed to
+	 * uiomove_fromphys().
+	 */
+	switch (uio->uio_rw) {
+	case UIO_WRITE:
+		transp_uio.uio_rw = UIO_READ;
+		break;
+	case UIO_READ:
+		transp_uio.uio_rw = UIO_WRITE;
+		break;
+	}
+	transp_uio.uio_td = uio->uio_td;
+	error = uiomove_fromphys(td->td_ma,
+	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
+	    xfersize, &transp_uio);
+	adv = xfersize - transp_uio.uio_resid;
+	pgadv =
+	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
+	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
+	td->td_ma += pgadv;
+	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+	    pgadv));
+	td->td_ma_cnt -= pgadv;
+	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
+	uio->uio_iov->iov_len -= adv;
+	uio->uio_resid -= adv;
+	uio->uio_offset += adv;
+	return (error);
+}
+
 /*
  * File table truncate routine.
  */
 static int
-vn_truncate(fp, length, active_cred, td)
-	struct file *fp;
-	off_t length;
-	struct ucred *active_cred;
-	struct thread *td;
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct vattr vattr;
 	struct mount *mp;
 	struct vnode *vp;
+	void *rl_cookie;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
+
+	/*
+	 * Lock the whole range for truncation.  Otherwise split i/o
+	 * might happen partly before and partly after the truncation.
+	 */
+	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
+	if (error)
+		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
@@ -730,7 +1142,9 @@
 out:
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
+out1:
 	VFS_UNLOCK_GIANT(vfslocked);
+	vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
@@ -1466,3 +1880,56 @@
 	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_UNLOCK(object);
 }
+
+int
+vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
+{
+	struct vattr va;
+	daddr_t bn, bnp;
+	uint64_t bsize;
+	off_t noff;
+	int error;
+
+	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
+	    ("Wrong command %lu", cmd));
+
+	if (vn_lock(vp, LK_SHARED) != 0)
+		return (EBADF);
+	if (vp->v_type != VREG) {
+		error = ENOTTY;
+		goto unlock;
+	}
+	error = VOP_GETATTR(vp, &va, cred);
+	if (error != 0)
+		goto unlock;
+	noff = *off;
+	if (noff >= va.va_size) {
+		error = ENXIO;
+		goto unlock;
+	}
+	bsize = vp->v_mount->mnt_stat.f_iosize;
+	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
+		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
+		if (error == EOPNOTSUPP) {
+			error = ENOTTY;
+			goto unlock;
+		}
+		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
+		    (bnp != -1 && cmd == FIOSEEKDATA)) {
+			noff = bn * bsize;
+			if (noff < *off)
+				noff = *off;
+			goto unlock;
+		}
+	}
+	if (noff > va.va_size)
+		noff = va.va_size;
+	/* noff == va.va_size. There is an implicit hole at the end of file. */
+	if (cmd == FIOSEEKDATA)
+		error = ENXIO;
+unlock:
+	VOP_UNLOCK(vp, 0);
+	if (error == 0)
+		*off = noff;
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/icmp_var.h
--- a/head/sys/netinet/icmp_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/icmp_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)icmp_var.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/icmp_var.h 237230 2012-06-18 17:11:24Z tuexen $
  */
 
 #ifndef _NETINET_ICMP_VAR_H_
@@ -102,7 +102,8 @@
 #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
 #define BANDLIM_RST_OPENPORT 4   /* No connection, listener */
 #define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_MAX 5
+#define BANDLIM_SCTP_OOTB 6
+#define BANDLIM_MAX 6
 #endif
 
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/if_ether.c
--- a/head/sys/netinet/if_ether.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/if_ether.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/if_ether.c 230442 2012-01-22 02:13:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/if_ether.c 237263 2012-06-19 07:34:13Z np $");
 
 #include "opt_inet.h"
 
@@ -180,6 +180,17 @@
 		    callout_active(&lle->la_timer)) {
 			callout_stop(&lle->la_timer);
 			LLE_REMREF(lle);
+
+			if (lle->la_flags != LLE_DELETED) {
+				int evt;
+
+				if (lle->la_flags & LLE_VALID)
+					evt = LLENTRY_EXPIRED;
+				else
+					evt = LLENTRY_TIMEDOUT;
+				EVENTHANDLER_INVOKE(lle_event, lle, evt);
+			}
+
 			pkts_dropped = llentry_free(lle);
 			ARPSTAT_ADD(dropped, pkts_dropped);
 			ARPSTAT_INC(timeouts);
@@ -726,7 +737,7 @@
 		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
 		la->la_flags |= LLE_VALID;
 
-		EVENTHANDLER_INVOKE(arp_update_event, la);
+		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/if_ether.h
--- a/head/sys/netinet/if_ether.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/if_ether.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.h	8.3 (Berkeley) 5/2/95
- * $FreeBSD: head/sys/netinet/if_ether.h 229810 2012-01-08 13:34:00Z glebius $
+ * $FreeBSD: head/sys/netinet/if_ether.h 237263 2012-06-19 07:34:13Z np $
  */
 
 #ifndef _NETINET_IF_ETHER_H_
@@ -122,8 +122,14 @@
 void	arp_ifscrub(struct ifnet *, uint32_t);
 
 #include <sys/eventhandler.h>
-typedef void (*llevent_arp_update_fn)(void *, struct llentry *);
-EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn);
+enum {
+	LLENTRY_RESOLVED,
+	LLENTRY_TIMEDOUT,
+	LLENTRY_DELETED,
+	LLENTRY_EXPIRED,
+};
+typedef void (*lle_event_fn)(void *, struct llentry *, int);
+EVENTHANDLER_DECLARE(lle_event, lle_event_fn);
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/igmp.c
--- a/head/sys/netinet/igmp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/igmp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -48,7 +48,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 229621 2012-01-05 19:00:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 238084 2012-07-03 19:04:18Z trociny $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -2285,13 +2285,11 @@
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
-	if (ifp != NULL) {
-		/*
-		 * Sanity check that netinet's notion of ifp is the
-		 * same as net's.
-		 */
-		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
-	}
+	/*
+	 * Sanity check that netinet's notion of ifp is the
+	 * same as net's.
+	 */
+	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in.c
--- a/head/sys/netinet/in.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/in.c 234087 2012-04-10 06:52:39Z glebius $");
+__FBSDID("$FreeBSD: head/sys/netinet/in.c 237263 2012-06-19 07:34:13Z np $");
 
 #include "opt_mpath.h"
 
@@ -1469,7 +1469,7 @@
 		if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
 			LLE_WLOCK(lle);
 			lle->la_flags = LLE_DELETED;
-			EVENTHANDLER_INVOKE(arp_update_event, lle);
+			EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 			LLE_WUNLOCK(lle);
 #ifdef DIAGNOSTIC
 			log(LOG_INFO, "ifaddr cache = %p  is deleted\n", lle);	
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in.h
--- a/head/sys/netinet/in.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in.h	8.3 (Berkeley) 1/3/94
- * $FreeBSD: head/sys/netinet/in.h 226402 2011-10-15 18:41:25Z glebius $
+ * $FreeBSD: head/sys/netinet/in.h 236959 2012-06-12 14:02:38Z tuexen $
  */
 
 #ifndef _NETINET_IN_H_
@@ -241,6 +241,7 @@
 #define	IPPROTO_PIM		103		/* Protocol Independent Mcast */
 #define	IPPROTO_CARP		112		/* CARP */
 #define	IPPROTO_PGM		113		/* PGM */
+#define	IPPROTO_MPLS		137		/* MPLS-in-IP */
 #define	IPPROTO_PFSYNC		240		/* PFSYNC */
 /* 255: Reserved */
 /* BSD Private, local use, namespace incursion, no longer used */
@@ -461,6 +462,7 @@
 #define	IP_RECVTTL		65   /* bool; receive IP TTL w/dgram */
 #define	IP_MINTTL		66   /* minimum TTL for packet or drop */
 #define	IP_DONTFRAG		67   /* don't fragment packet */
+#define	IP_RECVTOS		68   /* bool; receive IP TOS w/dgram */
 
 /* IPv4 Source Filter Multicast API [RFC3678] */
 #define	IP_ADD_SOURCE_MEMBERSHIP	70   /* join a source-specific group */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_pcb.c
--- a/head/sys/netinet/in_pcb.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_pcb.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/in_pcb.c 230442 2012-01-22 02:13:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/in_pcb.c 236959 2012-06-12 14:02:38Z tuexen $");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
@@ -2295,6 +2295,10 @@
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
+	if (inp_flags & INP_RECVTOS) {
+		db_printf("%sINP_RECVTOS", comma ? ", " : "");
+		comma = 1;
+	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_pcb.h
--- a/head/sys/netinet/in_pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD: head/sys/netinet/in_pcb.h 233096 2012-03-17 21:51:39Z rmh $
+ * $FreeBSD: head/sys/netinet/in_pcb.h 236959 2012-06-12 14:02:38Z tuexen $
  */
 
 #ifndef _NETINET_IN_PCB_H_
@@ -509,6 +509,7 @@
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
+#define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
@@ -528,7 +529,7 @@
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
-				 INP_RECVIF|INP_RECVTTL|\
+				 INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_var.h
--- a/head/sys/netinet/in_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in_var.h	8.2 (Berkeley) 1/9/95
- * $FreeBSD: head/sys/netinet/in_var.h 229815 2012-01-08 17:20:29Z glebius $
+ * $FreeBSD: head/sys/netinet/in_var.h 238572 2012-07-18 08:41:00Z glebius $
  */
 
 #ifndef _NETINET_IN_VAR_H_
@@ -161,14 +161,16 @@
 #define IFP_TO_IA(ifp, ia)						\
 	/* struct ifnet *ifp; */					\
 	/* struct in_ifaddr *ia; */					\
-{									\
+do {									\
+	IN_IFADDR_RLOCK();						\
 	for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead);			\
 	    (ia) != NULL && (ia)->ia_ifp != (ifp);			\
 	    (ia) = TAILQ_NEXT((ia), ia_link))				\
 		continue;						\
 	if ((ia) != NULL)						\
 		ifa_ref(&(ia)->ia_ifa);					\
-}
+	IN_IFADDR_RUNLOCK();						\
+} while (0)
 #endif
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip.h
--- a/head/sys/netinet/ip.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip.h	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ip.h	8.2 (Berkeley) 6/1/94
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/ip.h 235036 2012-05-04 21:00:32Z delphij $
  */
 
 #ifndef _NETINET_IP_H_
@@ -92,6 +92,31 @@
 #define	IPTOS_PREC_ROUTINE		0x00
 
 /*
+ * Definitions for DiffServ Codepoints as per RFC2474
+ */
+#define	IPTOS_DSCP_CS0		0x00
+#define	IPTOS_DSCP_CS1		0x20
+#define	IPTOS_DSCP_AF11		0x28
+#define	IPTOS_DSCP_AF12		0x30
+#define	IPTOS_DSCP_AF13		0x38
+#define	IPTOS_DSCP_CS2		0x40
+#define	IPTOS_DSCP_AF21		0x48
+#define	IPTOS_DSCP_AF22		0x50
+#define	IPTOS_DSCP_AF23		0x58
+#define	IPTOS_DSCP_CS3		0x60
+#define	IPTOS_DSCP_AF31		0x68
+#define	IPTOS_DSCP_AF32		0x70
+#define	IPTOS_DSCP_AF33		0x78
+#define	IPTOS_DSCP_CS4		0x80
+#define	IPTOS_DSCP_AF41		0x88
+#define	IPTOS_DSCP_AF42		0x90
+#define	IPTOS_DSCP_AF43		0x98
+#define	IPTOS_DSCP_CS5		0xa0
+#define	IPTOS_DSCP_EF		0xb8
+#define	IPTOS_DSCP_CS6		0xc0
+#define	IPTOS_DSCP_CS7		0xe0
+
+/*
  * ECN (Explicit Congestion Notification) codepoints in RFC3168 mapped to the
  * lower 2 bits of the TOS field.
  */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_carp.c
--- a/head/sys/netinet/ip_carp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_carp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_carp.c 234130 2012-04-11 12:26:30Z glebius $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_carp.c 236310 2012-05-30 13:51:00Z glebius $");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
@@ -696,7 +696,7 @@
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
-	bcopy(&sc, (caddr_t)(mtag + 1), sizeof(struct carp_softc *));
+	bcopy(&sc, mtag + 1, sizeof(sc));
 	m_tag_prepend(m, mtag);
 
 	return (0);
@@ -1061,13 +1061,12 @@
 			IF_ADDR_RUNLOCK(ifp);
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
-			    sizeof(struct ifnet *), M_NOWAIT);
+			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
-			bcopy(&ifp, (caddr_t)(mtag + 1),
-			    sizeof(struct ifnet *));
+			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
@@ -1391,7 +1390,7 @@
 	if (mtag == NULL)
 		return (0);
 
-	bcopy(mtag + 1, &sc, sizeof(struct carp_softc *));
+	bcopy(mtag + 1, &sc, sizeof(sc));
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	switch (ifp->if_type) {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_fw.h
--- a/head/sys/netinet/ip_fw.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_fw.h	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/netinet/ip_fw.h 233478 2012-03-25 20:37:59Z melifaro $
+ * $FreeBSD: head/sys/netinet/ip_fw.h 234946 2012-05-03 08:56:43Z melifaro $
  */
 
 #ifndef _IPFW2_H
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_icmp.c
--- a/head/sys/netinet/ip_icmp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_icmp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_icmp.c 229749 2012-01-07 00:11:36Z eadler $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_icmp.c 237230 2012-06-18 17:11:24Z tuexen $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -965,7 +965,8 @@
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
-		{ "icmp6 unreach response" }
+		{ "icmp6 unreach response" },
+		{ "sctp ootb response" }
 	};
 
 	/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_input.c
--- a/head/sys/netinet/ip_input.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_input.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 229621 2012-01-05 19:00:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 238092 2012-07-04 07:37:53Z glebius $");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
@@ -1495,8 +1495,7 @@
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
+	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
@@ -1684,6 +1683,12 @@
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
+	if (inp->inp_flags & INP_RECVTOS) {
+		*mp = sbcreatecontrol((caddr_t) &ip->ip_tos,
+		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
+		if (*mp)
+			mp = &(*mp)->m_next;
+	}
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_mroute.c
--- a/head/sys/netinet/ip_mroute.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_mroute.c	Wed Jul 25 16:40:53 2012 +0300
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_mroute.c 232517 2012-03-04 18:59:38Z zec $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_mroute.c 238016 2012-07-02 19:44:18Z glebius $");
 
 #include "opt_inet.h"
 #include "opt_mrouting.h"
@@ -924,7 +924,6 @@
     vifp->v_pkt_out   = 0;
     vifp->v_bytes_in  = 0;
     vifp->v_bytes_out = 0;
-    bzero(&vifp->v_route, sizeof(vifp->v_route));
 
     /* Adjust numvifs up if the vifi is higher than numvifs */
     if (V_numvifs <= vifcp->vifc_vifi)
@@ -1702,7 +1701,7 @@
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
-	error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
+	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
 	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
 	    (ptrdiff_t)(vifp - V_viftable), error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_mroute.h
--- a/head/sys/netinet/ip_mroute.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_mroute.h	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ip_mroute.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/ip_mroute.h 238016 2012-07-02 19:44:18Z glebius $
  */
 
 #ifndef _NETINET_IP_MROUTE_H_
@@ -262,7 +262,6 @@
     u_long		v_pkt_out;	/* # pkts out on interface           */
     u_long		v_bytes_in;	/* # bytes in on interface	     */
     u_long		v_bytes_out;	/* # bytes out on interface	     */
-    struct route	v_route;	/* cached route */
 };
 
 #ifdef _KERNEL
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_output.c
--- a/head/sys/netinet/ip_output.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_output.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 227207 2011-11-06 10:47:20Z trociny $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 238573 2012-07-18 08:58:30Z glebius $");
 
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
@@ -105,6 +105,10 @@
  * ip_len and ip_off are in host format.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
+ * If route ro is present and has ro_rt initialized, route lookup would be
+ * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
+ * then result of route lookup is stored in ro->ro_rt.
+ *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
@@ -119,9 +123,8 @@
 	int mtu;
 	int n;	/* scratchpad */
 	int error = 0;
-	int nortfree = 0;
 	struct sockaddr_in *dst;
-	struct in_ifaddr *ia = NULL;
+	struct in_ifaddr *ia;
 	int isbroadcast, sw_csum;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
@@ -146,24 +149,23 @@
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
+	}
 
 #ifdef FLOWTABLE
-		{
-			struct flentry *fle;
+	if (ro->ro_rt == NULL) {
+		struct flentry *fle;
 			
-			/*
-			 * The flow table returns route entries valid for up to 30
-			 * seconds; we rely on the remainder of ip_output() taking no
-			 * longer than that long for the stability of ro_rt.  The
-			 * flow ID assignment must have happened before this point.
-			 */
-			if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
-				flow_to_route(fle, ro);
-				nortfree = 1;
-			}
-		}
+		/*
+		 * The flow table returns route entries valid for up to 30
+		 * seconds; we rely on the remainder of ip_output() taking no
+		 * longer than that long for the stability of ro_rt. The
+		 * flow ID assignment must have happened before this point.
+		 */
+		fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
+		if (fle != NULL)
+			flow_to_route(fle, ro);
+	}
 #endif
-	}
 
 	if (opt) {
 		int len = 0;
@@ -196,6 +198,7 @@
 
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 again:
+	ia = NULL;
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
@@ -209,10 +212,9 @@
 		    !RT_LINK_IS_UP(rte->rt_ifp) ||
 			  dst->sin_family != AF_INET ||
 			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
-		if (!nortfree)
-			RTFREE(rte);
-		rte = ro->ro_rt = (struct rtentry *)NULL;
-		ro->ro_lle = (struct llentry *)NULL;
+		RO_RTFREE(ro);
+		ro->ro_lle = NULL;
+		rte = NULL;
 	}
 #ifdef IPFIREWALL_FORWARD
 	if (rte == NULL && fwd_tag == NULL) {
@@ -532,8 +534,11 @@
 #endif
 			error = netisr_queue(NETISR_IP, m);
 			goto done;
-		} else
+		} else {
+			if (ia != NULL)
+				ifa_free(&ia->ia_ifa);
 			goto again;	/* Redo the routing table lookup. */
+		}
 	}
 
 #ifdef IPFIREWALL_FORWARD
@@ -563,6 +568,8 @@
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m_tag_delete(m, fwd_tag);
+		if (ia != NULL)
+			ifa_free(&ia->ia_ifa);
 		goto again;
 	}
 #endif /* IPFIREWALL_FORWARD */
@@ -672,9 +679,8 @@
 		IPSTAT_INC(ips_fragmented);
 
 done:
-	if (ro == &iproute && ro->ro_rt && !nortfree) {
-		RTFREE(ro->ro_rt);
-	}
+	if (ro == &iproute)
+		RO_RTFREE(ro);
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
@@ -984,6 +990,7 @@
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
+		case IP_RECVTOS:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
@@ -1047,6 +1054,9 @@
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
+			case IP_RECVTOS:
+				OPTSET(INP_RECVTOS);
+				break;
 			}
 			break;
 #undef OPTSET
@@ -1156,6 +1166,7 @@
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
+		case IP_RECVTOS:
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
@@ -1214,6 +1225,9 @@
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
+			case IP_RECVTOS:
+				optval = OPTBIT(INP_RECVTOS);
+				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_dummynet.c
--- a/head/sys/netinet/ipfw/ip_dummynet.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_dummynet.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 222560 2011-06-01 12:33:05Z ae $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 238063 2012-07-03 08:42:48Z issyl0 $");
 
 /*
  * Configuration and internal object management for dummynet.
@@ -97,7 +97,7 @@
 	struct dn_alg *d;
 
 	SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
-		if (d->type == type || (name && !strcmp(d->name, name)))
+		if (d->type == type || (name && !strcasecmp(d->name, name)))
 			return d;
 	}
 	return NULL; /* not found */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_log.c
--- a/head/sys/netinet/ipfw/ip_fw_log.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_log.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 227085 2011-11-04 16:24:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 238277 2012-07-09 07:16:19Z hrs $");
 
 /*
  * Logging support for ipfw
@@ -44,8 +44,11 @@
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
+#include <net/if_clone.h>
 #include <net/vnet.h>
 #include <net/if_types.h>	/* for IFT_ETHER */
 #include <net/bpf.h>		/* for BPF */
@@ -90,6 +93,15 @@
 }
 #else /* !WITHOUT_BPF */
 static struct ifnet *log_if;	/* hook to attach to bpf */
+static struct rwlock log_if_lock;
+#define	LOGIF_LOCK_INIT(x)	rw_init(&log_if_lock, "ipfw log_if lock")
+#define	LOGIF_LOCK_DESTROY(x)	rw_destroy(&log_if_lock)
+#define	LOGIF_RLOCK(x)		rw_rlock(&log_if_lock)
+#define	LOGIF_RUNLOCK(x)	rw_runlock(&log_if_lock)
+#define	LOGIF_WLOCK(x)		rw_wlock(&log_if_lock)
+#define	LOGIF_WUNLOCK(x)	rw_wunlock(&log_if_lock)
+
+#define	IPFWNAME	"ipfw"
 
 /* we use this dummy function for all ifnet callbacks */
 static int
@@ -116,37 +128,105 @@
 static const u_char ipfwbroadcastaddr[6] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
+static int
+ipfw_log_clone_match(struct if_clone *ifc, const char *name)
+{
+
+	return (strncmp(name, IPFWNAME, sizeof(IPFWNAME) - 1) == 0);
+}
+
+static int
+ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len,
+    caddr_t params)
+{
+	int error;
+	int unit;
+	struct ifnet *ifp;
+
+	error = ifc_name2unit(name, &unit);
+	if (error)
+		return (error);
+
+	error = ifc_alloc_unit(ifc, &unit);
+	if (error)
+		return (error);
+
+	ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL) {
+		ifc_free_unit(ifc, unit);
+		return (ENOSPC);
+	}
+	ifp->if_dname = IPFWNAME;
+	ifp->if_dunit = unit;
+	snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", IPFWNAME, unit);
+	strlcpy(name, ifp->if_xname, len);
+	ifp->if_mtu = 65536;
+	ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+	ifp->if_init = (void *)log_dummy;
+	ifp->if_ioctl = log_dummy;
+	ifp->if_start = ipfw_log_start;
+	ifp->if_output = ipfw_log_output;
+	ifp->if_addrlen = 6;
+	ifp->if_hdrlen = 14;
+	ifp->if_broadcastaddr = ipfwbroadcastaddr;
+	ifp->if_baudrate = IF_Mbps(10);
+
+	LOGIF_WLOCK();
+	if (log_if == NULL)
+		log_if = ifp;
+	else {
+		LOGIF_WUNLOCK();
+		if_free(ifp);
+		ifc_free_unit(ifc, unit);
+		return (EEXIST);
+	}
+	LOGIF_WUNLOCK();
+	if_attach(ifp);
+	bpfattach(ifp, DLT_EN10MB, 14);
+
+	return (0);
+}
+
+static int
+ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
+{
+	int unit;
+
+	if (ifp == NULL)
+		return (0);
+
+	LOGIF_WLOCK();
+	if (log_if != NULL && ifp == log_if)
+		log_if = NULL;
+	else {
+		LOGIF_WUNLOCK();
+		return (EINVAL);
+	}
+	LOGIF_WUNLOCK();
+
+	unit = ifp->if_dunit;
+	bpfdetach(ifp);
+	if_detach(ifp);
+	if_free(ifp);
+	ifc_free_unit(ifc, unit);
+
+	return (0);
+}
+
+static struct if_clone ipfw_log_cloner = IFC_CLONE_INITIALIZER(
+    IPFWNAME, NULL, IF_MAXUNIT,
+    NULL, ipfw_log_clone_match, ipfw_log_clone_create, ipfw_log_clone_destroy);
+
 void
 ipfw_log_bpf(int onoff)
 {
-	struct ifnet *ifp;
 
 	if (onoff) {
-		if (log_if)
-			return;
-		ifp = if_alloc(IFT_ETHER);
-		if (ifp == NULL)
-			return;
-		if_initname(ifp, "ipfw", 0);
-		ifp->if_mtu = 65536;
-		ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
-		ifp->if_init = (void *)log_dummy;
-		ifp->if_ioctl = log_dummy;
-		ifp->if_start = ipfw_log_start;
-		ifp->if_output = ipfw_log_output;
-		ifp->if_addrlen = 6;
-		ifp->if_hdrlen = 14;
-		if_attach(ifp);
-		ifp->if_broadcastaddr = ipfwbroadcastaddr;
-		ifp->if_baudrate = IF_Mbps(10);
-		bpfattach(ifp, DLT_EN10MB, 14);
-		log_if = ifp;
+		LOGIF_LOCK_INIT();
+		if_clone_attach(&ipfw_log_cloner);
 	} else {
-		if (log_if) {
-			ether_ifdetach(log_if);
-			if_free(log_if);
-		}
-		log_if = NULL;
+		if_clone_detach(&ipfw_log_cloner);
+		LOGIF_LOCK_DESTROY();
 	}
 }
 #endif /* !WITHOUT_BPF */
@@ -166,9 +246,11 @@
 
 	if (V_fw_verbose == 0) {
 #ifndef WITHOUT_BPF
-
-		if (log_if == NULL || log_if->if_bpf == NULL)
+		LOGIF_RLOCK();
+		if (log_if == NULL || log_if->if_bpf == NULL) {
+			LOGIF_RUNLOCK();
 			return;
+		}
 
 		if (args->eh) /* layer2, use orig hdr */
 			BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
@@ -177,6 +259,7 @@
 			 * more info in the header.
 			 */
 			BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+		LOGIF_RUNLOCK();
 #endif /* !WITHOUT_BPF */
 		return;
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_private.h
--- a/head/sys/netinet/ipfw/ip_fw_private.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_private.h	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 233478 2012-03-25 20:37:59Z melifaro $
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 234946 2012-05-03 08:56:43Z melifaro $
  */
 
 #ifndef _IPFW2_PRIVATE_H
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_table.c
--- a/head/sys/netinet/ipfw/ip_fw_table.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_table.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 233478 2012-03-25 20:37:59Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 238265 2012-07-08 21:13:04Z melifaro $");
 
 /*
  * Lookup table support for ipfw
@@ -153,6 +153,9 @@
 	case IPFW_TABLE_CIDR:
 		if (plen == sizeof(in_addr_t)) {
 #ifdef INET
+			/* IPv4 case */
+			if (mlen > 32)
+				return (EINVAL);
 			ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
 			ent->value = value;
 			/* Set 'total' structure length */
@@ -341,9 +344,12 @@
 		struct xaddr_iface ifname, ifmask;
 		memset(&ifname, 0, sizeof(ifname));
 
+		/* Include last \0 into comparison */
+		mlen++;
+
 		/* Set 'total' structure length */
-		KEY_LEN(ifname) = mlen;
-		KEY_LEN(ifmask) = mlen;
+		KEY_LEN(ifname) = KEY_LEN_IFACE + mlen;
+		KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen;
 		/* Assume direct match */
 		/* FIXME: Add interface pattern matching */
 #if 0
@@ -565,7 +571,8 @@
 		break;
 
 	case IPFW_TABLE_INTERFACE:
-		KEY_LEN(iface) = strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE);
+		KEY_LEN(iface) = KEY_LEN_IFACE +
+		    strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1;
 		/* Assume direct match */
 		/* FIXME: Add interface pattern matching */
 		xent = (struct table_xentry *)(rnh->rnh_lookup(&iface, NULL, rnh));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/libalias/alias_sctp.h
--- a/head/sys/netinet/libalias/alias_sctp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/libalias/alias_sctp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -45,7 +45,7 @@
  *
  */
 
-/* $FreeBSD: head/sys/netinet/libalias/alias_sctp.h 222809 2011-06-07 06:57:22Z ae $ */
+/* $FreeBSD: head/sys/netinet/libalias/alias_sctp.h 235644 2012-05-19 05:14:24Z marcel $ */
 
 #ifndef _ALIAS_SCTP_H_
 #define _ALIAS_SCTP_H_
@@ -92,7 +92,6 @@
 #ifndef _KERNEL
 #include <stdlib.h>
 #include <stdio.h>
-#include <curses.h>
 #endif //#ifdef _KERNEL
 
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/libalias/libalias.3
--- a/head/sys/netinet/libalias/libalias.3	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/libalias/libalias.3	Wed Jul 25 16:40:53 2012 +0300
@@ -23,9 +23,9 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: head/sys/netinet/libalias/libalias.3 223773 2011-07-04 23:00:26Z gjb $
+.\" $FreeBSD: head/sys/netinet/libalias/libalias.3 237015 2012-06-13 18:57:27Z joel $
 .\"
-.Dd July 04, 2011
+.Dd July 4, 2011
 .Dt LIBALIAS 3
 .Os
 .Sh NAME
@@ -201,11 +201,10 @@
 If this mode bit is set, traffic on the local network which does not
 originate from unregistered address spaces will be ignored.
 Standard Class A, B and C unregistered addresses are:
-.Bd -literal -offset indent
+.Pp
 10.0.0.0     ->  10.255.255.255   (Class A subnet)
 172.16.0.0   ->  172.31.255.255   (Class B subnets)
 192.168.0.0  ->  192.168.255.255  (Class C subnets)
-.Ed
 .Pp
 This option is useful in the case that the packet aliasing host has both
 registered and unregistered subnets on different interfaces.
@@ -499,14 +498,13 @@
 New traffic generated by any of the local machines, designated in the
 several function calls, will be aliased to the same address.
 Consider the following example:
-.Bd -literal -offset indent
+.Pp
 LibAliasRedirectAddr(la, inet_aton("192.168.0.2"),
                         inet_aton("141.221.254.101"));
 LibAliasRedirectAddr(la, inet_aton("192.168.0.3"),
                         inet_aton("141.221.254.101"));
 LibAliasRedirectAddr(la, inet_aton("192.168.0.4"),
                         inet_aton("141.221.254.101"));
-.Ed
 .Pp
 Any outgoing connections such as
 .Xr telnet 1
@@ -919,7 +917,7 @@
 .An Paolo Pisati Aq piso at FreeBSD.org
 made the library modular, moving support for all
 protocols (except for IP, TCP and UDP) to external modules.
-.Sh ACKNOWLEDGMENTS
+.Sh ACKNOWLEDGEMENTS
 Listed below, in approximate chronological order, are individuals who
 have provided valuable comments and/or debugging assistance.
 .Bd -ragged -offset indent
@@ -1277,10 +1275,10 @@
 .Ed
 .Bl -inset
 .It Va name
-is the name of the module
+is the name of the module.
 .It Va handle
 is a pointer to the module obtained through
-.Xr dlopen 3
+.Xr dlopen 3 .
 .El
 Whenever a module is loaded in userland, an entry is added to
 .Va dll_chain ,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp.h
--- a/head/sys/netinet/sctp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,14 +29,14 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
-/* $KAME: sctp.h,v 1.18 2005/03/06 16:04:16 itojun Exp $	 */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp.h 233660 2012-03-29 13:36:53Z rrs $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp.h 235990 2012-05-25 11:14:08Z tuexen $");
 
 #ifndef _NETINET_SCTP_H_
 #define _NETINET_SCTP_H_
 
+
 #include <sys/types.h>
 
 
@@ -265,8 +265,6 @@
 #define SCTP_PEELOFF                    0x0000800a
 /* the real worker for sctp_getaddrlen() */
 #define SCTP_GET_ADDR_LEN               0x0000800b
-/* temporary workaround for Apple listen() issue, no args used */
-#define SCTP_LISTEN_FIX			0x0000800c
 /* Debug things that need to be purged */
 #define SCTP_SET_INITIAL_DBG_SEQ	0x00009f00
 
@@ -511,35 +509,38 @@
 /*
  * PCB Features (in sctp_features bitmask)
  */
-#define SCTP_PCB_FLAGS_DO_NOT_PMTUD     0x00000001
-#define SCTP_PCB_FLAGS_EXT_RCVINFO      0x00000002	/* deprecated */
-#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT  0x00000004
-#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE  0x00000008
-#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS	0x00000010
-#define SCTP_PCB_FLAGS_DO_ASCONF	0x00000020
-#define SCTP_PCB_FLAGS_AUTO_ASCONF	0x00000040
-#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE 0x00000080
+#define SCTP_PCB_FLAGS_DO_NOT_PMTUD      0x00000001
+#define SCTP_PCB_FLAGS_EXT_RCVINFO       0x00000002	/* deprecated */
+#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT   0x00000004
+#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE   0x00000008
+#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS  0x00000010
+#define SCTP_PCB_FLAGS_DO_ASCONF         0x00000020
+#define SCTP_PCB_FLAGS_AUTO_ASCONF       0x00000040
+#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE  0x00000080
 /* socket options */
-#define SCTP_PCB_FLAGS_NODELAY		0x00000100
-#define SCTP_PCB_FLAGS_AUTOCLOSE	0x00000200
-#define SCTP_PCB_FLAGS_RECVDATAIOEVNT	0x00000400	/* deprecated */
-#define SCTP_PCB_FLAGS_RECVASSOCEVNT	0x00000800
-#define SCTP_PCB_FLAGS_RECVPADDREVNT	0x00001000
-#define SCTP_PCB_FLAGS_RECVPEERERR	0x00002000
-#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT	0x00004000
-#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT	0x00008000
-#define SCTP_PCB_FLAGS_ADAPTATIONEVNT	0x00010000
-#define SCTP_PCB_FLAGS_PDAPIEVNT	0x00020000
-#define SCTP_PCB_FLAGS_AUTHEVNT		0x00040000
-#define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x00080000
-#define SCTP_PCB_FLAGS_NO_FRAGMENT	0x00100000
-#define SCTP_PCB_FLAGS_EXPLICIT_EOR     0x00400000
-#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4	0x00800000
-#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS	0x01000000
-#define SCTP_PCB_FLAGS_PORTREUSE        0x02000000
-#define SCTP_PCB_FLAGS_DRYEVNT          0x04000000
-#define SCTP_PCB_FLAGS_RECVRCVINFO      0x08000000
-#define SCTP_PCB_FLAGS_RECVNXTINFO      0x10000000
+#define SCTP_PCB_FLAGS_NODELAY           0x00000100
+#define SCTP_PCB_FLAGS_AUTOCLOSE         0x00000200
+#define SCTP_PCB_FLAGS_RECVDATAIOEVNT    0x00000400	/* deprecated */
+#define SCTP_PCB_FLAGS_RECVASSOCEVNT     0x00000800
+#define SCTP_PCB_FLAGS_RECVPADDREVNT     0x00001000
+#define SCTP_PCB_FLAGS_RECVPEERERR       0x00002000
+#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT  0x00004000	/* deprecated */
+#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT  0x00008000
+#define SCTP_PCB_FLAGS_ADAPTATIONEVNT    0x00010000
+#define SCTP_PCB_FLAGS_PDAPIEVNT         0x00020000
+#define SCTP_PCB_FLAGS_AUTHEVNT          0x00040000
+#define SCTP_PCB_FLAGS_STREAM_RESETEVNT  0x00080000
+#define SCTP_PCB_FLAGS_NO_FRAGMENT       0x00100000
+#define SCTP_PCB_FLAGS_EXPLICIT_EOR      0x00400000
+#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4   0x00800000
+#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS  0x01000000
+#define SCTP_PCB_FLAGS_PORTREUSE         0x02000000
+#define SCTP_PCB_FLAGS_DRYEVNT           0x04000000
+#define SCTP_PCB_FLAGS_RECVRCVINFO       0x08000000
+#define SCTP_PCB_FLAGS_RECVNXTINFO       0x10000000
+#define SCTP_PCB_FLAGS_ASSOC_RESETEVNT   0x20000000
+#define SCTP_PCB_FLAGS_STREAM_CHANGEEVNT 0x40000000
+#define SCTP_PCB_FLAGS_RECVNSENDFAILEVNT 0x80000000
 
 /*-
  * mobility_features parameters (by micchie).Note
@@ -547,14 +548,16 @@
  * sctp_mobility_features flags.. not the sctp_features
  * flags.
  */
-#define SCTP_MOBILITY_BASE		0x00000001
-#define SCTP_MOBILITY_FASTHANDOFF	0x00000002
-#define SCTP_MOBILITY_PRIM_DELETED	0x00000004
+#define SCTP_MOBILITY_BASE               0x00000001
+#define SCTP_MOBILITY_FASTHANDOFF        0x00000002
+#define SCTP_MOBILITY_PRIM_DELETED       0x00000004
 
 
 #define SCTP_SMALLEST_PMTU 512	/* smallest pmtu allowed when disabling PMTU
 				 * discovery */
 
+#undef SCTP_PACKED
+
 #include <netinet/sctp_uio.h>
 
 /* This dictates the size of the packet
@@ -606,7 +609,4 @@
 #define SCTP_LOG_AT_SEND_2_OUTQ				0x08000000
 #define SCTP_LOG_TRY_ADVANCE				0x10000000
 
-
-#undef SCTP_PACKED
-
 #endif				/* !_NETINET_SCTP_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_asconf.c
--- a/head/sys/netinet/sctp_asconf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_asconf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,9 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_asconf.c,v 1.24 2005/03/06 16:04:16 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.c 238501 2012-07-15 20:16:17Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.c 228907 2011-12-27 10:16:24Z tuexen $");
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
@@ -49,63 +48,10 @@
  * SCTP_DEBUG_ASCONF1: protocol info, general info and errors
  * SCTP_DEBUG_ASCONF2: detailed info
  */
-#ifdef SCTP_DEBUG
-#endif				/* SCTP_DEBUG */
 
 
-static void
-sctp_asconf_get_source_ip(struct mbuf *m, struct sockaddr *sa)
-{
-	struct ip *iph;
-
-#ifdef INET
-	struct sockaddr_in *sin;
-
-#endif
-#ifdef INET6
-	struct sockaddr_in6 *sin6;
-
-#endif
-
-	iph = mtod(m, struct ip *);
-	switch (iph->ip_v) {
-#ifdef INET
-	case IPVERSION:
-		{
-			/* IPv4 source */
-			sin = (struct sockaddr_in *)sa;
-			bzero(sin, sizeof(*sin));
-			sin->sin_family = AF_INET;
-			sin->sin_len = sizeof(struct sockaddr_in);
-			sin->sin_port = 0;
-			sin->sin_addr.s_addr = iph->ip_src.s_addr;
-			break;
-		}
-#endif
-#ifdef INET6
-	case (IPV6_VERSION >> 4):
-		{
-			/* IPv6 source */
-			struct ip6_hdr *ip6;
-
-			sin6 = (struct sockaddr_in6 *)sa;
-			bzero(sin6, sizeof(*sin6));
-			sin6->sin6_family = AF_INET6;
-			sin6->sin6_len = sizeof(struct sockaddr_in6);
-			sin6->sin6_port = 0;
-			ip6 = mtod(m, struct ip6_hdr *);
-			sin6->sin6_addr = ip6->ip6_src;
-			break;
-		}
-#endif				/* INET6 */
-	default:
-		break;
-	}
-	return;
-}
-
 /*
- * draft-ietf-tsvwg-addip-sctp
+ * RFC 5061
  *
  * An ASCONF parameter queue exists per asoc which holds the pending address
  * operations.  Lists are updated upon receipt of ASCONF-ACK.
@@ -197,12 +143,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_add_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int send_hb, int response_required)
 {
 	struct sctp_nets *net;
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -282,11 +228,10 @@
 
 	/* if 0.0.0.0/::0, add the source address instead */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		sa = (struct sockaddr *)&sa_source;
-		sctp_asconf_get_source_ip(m, sa);
+		sa = src;
 		SCTPDBG(SCTP_DEBUG_ASCONF1,
 		    "process_asconf_add_ip: using source addr ");
-		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
 	}
 	/* add the address */
 	if (bad_address) {
@@ -346,11 +291,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_delete_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+sctp_process_asconf_delete_ip(struct sockaddr *src,
+    struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int response_required)
 {
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -368,9 +314,6 @@
 
 #endif
 
-	/* get the source IP address for src and 0.0.0.0/::0 delete checks */
-	sctp_asconf_get_source_ip(m, (struct sockaddr *)&sa_source);
-
 	aparam_length = ntohs(aph->ph.param_length);
 	ph = (struct sctp_paramhdr *)(aph + 1);
 	param_type = ntohs(ph->param_type);
@@ -427,7 +370,7 @@
 	}
 
 	/* make sure the source address is not being deleted */
-	if (sctp_cmpaddr(sa, (struct sockaddr *)&sa_source)) {
+	if (sctp_cmpaddr(sa, src)) {
 		/* trying to delete the source address! */
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n");
 		m_reply = sctp_asconf_error_response(aph->correlation_id,
@@ -437,8 +380,7 @@
 	}
 	/* if deleting 0.0.0.0/::0, delete all addresses except src addr */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		result = sctp_asconf_del_remote_addrs_except(stcb,
-		    (struct sockaddr *)&sa_source);
+		result = sctp_asconf_del_remote_addrs_except(stcb, src);
 
 		if (result) {
 			/* src address did not exist? */
@@ -478,12 +420,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_set_primary(struct mbuf *m,
+sctp_process_asconf_set_primary(struct sockaddr *src,
     struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int response_required)
 {
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -553,11 +495,10 @@
 
 	/* if 0.0.0.0/::0, use the source address instead */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		sa = (struct sockaddr *)&sa_source;
-		sctp_asconf_get_source_ip(m, sa);
+		sa = src;
 		SCTPDBG(SCTP_DEBUG_ASCONF1,
 		    "process_asconf_set_primary: using source addr ");
-		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
 	}
 	/* set the primary address */
 	if (sctp_set_primary_addr(stcb, sa, NULL) == 0) {
@@ -629,6 +570,7 @@
  */
 void
 sctp_handle_asconf(struct mbuf *m, unsigned int offset,
+    struct sockaddr *src,
     struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb,
     int first)
 {
@@ -765,13 +707,13 @@
 		switch (param_type) {
 		case SCTP_ADD_IP_ADDRESS:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_add_ip(m, aph, stcb,
+			m_result = sctp_process_asconf_add_ip(src, aph, stcb,
 			    (cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error);
 			cnt++;
 			break;
 		case SCTP_DEL_IP_ADDRESS:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_delete_ip(m, aph, stcb,
+			m_result = sctp_process_asconf_delete_ip(src, aph, stcb,
 			    error);
 			break;
 		case SCTP_ERROR_CAUSE_IND:
@@ -779,7 +721,7 @@
 			break;
 		case SCTP_SET_PRIM_ADDR:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_set_primary(m, aph,
+			m_result = sctp_process_asconf_set_primary(src, aph,
 			    stcb, error);
 			break;
 		case SCTP_NAT_VTAGS:
@@ -859,70 +801,16 @@
 		 * this could happen if the source address was just newly
 		 * added
 		 */
-		struct ip *iph;
-		struct sctphdr *sh;
-		struct sockaddr_storage from_store;
-		struct sockaddr *from = (struct sockaddr *)&from_store;
-
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n");
-		/* pullup already done, IP options already stripped */
-		iph = mtod(m, struct ip *);
-		switch (iph->ip_v) {
-#ifdef INET
-		case IPVERSION:
-			{
-				struct sockaddr_in *from4;
-
-				sh = (struct sctphdr *)((caddr_t)iph + sizeof(*iph));
-				from4 = (struct sockaddr_in *)&from_store;
-				bzero(from4, sizeof(*from4));
-				from4->sin_family = AF_INET;
-				from4->sin_len = sizeof(struct sockaddr_in);
-				from4->sin_addr.s_addr = iph->ip_src.s_addr;
-				from4->sin_port = sh->src_port;
-				break;
-			}
+		SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: ");
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
+		/* look up the from address */
+		stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, src);
+#ifdef SCTP_DEBUG
+		if (stcb->asoc.last_control_chunk_from == NULL) {
+			SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n");
+		}
 #endif
-#ifdef INET6
-		case IPV6_VERSION >> 4:
-			{
-				struct ip6_hdr *ip6;
-				struct sockaddr_in6 *from6;
-
-				ip6 = mtod(m, struct ip6_hdr *);
-				sh = (struct sctphdr *)((caddr_t)ip6 + sizeof(*ip6));
-				from6 = (struct sockaddr_in6 *)&from_store;
-				bzero(from6, sizeof(*from6));
-				from6->sin6_family = AF_INET6;
-				from6->sin6_len = sizeof(struct sockaddr_in6);
-				from6->sin6_addr = ip6->ip6_src;
-				from6->sin6_port = sh->src_port;
-				/*
-				 * Get the scopes in properly to the sin6
-				 * addr's
-				 */
-				/* we probably don't need these operations */
-				(void)sa6_recoverscope(from6);
-				sa6_embedscope(from6,
-				    MODULE_GLOBAL(ip6_use_defzone));
-
-				break;
-			}
-#endif
-		default:
-			/* unknown address type */
-			from = NULL;
-		}
-		if (from != NULL) {
-			SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: ");
-			SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, from);
-			/* look up the from address */
-			stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, from);
-#ifdef SCTP_DEBUG
-			if (stcb->asoc.last_control_chunk_from == NULL)
-				SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n");
-#endif
-		}
 	}
 }
 
@@ -1789,8 +1677,7 @@
 	 */
 	if (serial_num == (asoc->asconf_seq_out + 1)) {
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n");
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_CAUSE_ILLEGAL_ASCONF_ACK, NULL, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, SCTP_SO_NOT_LOCKED);
 		*abort_no_unlock = 1;
 		return;
 	}
@@ -2860,13 +2747,14 @@
 	struct sctp_paramhdr tmp_param, *ph;
 	uint16_t plen, ptype;
 	struct sctp_ifa *sctp_ifa;
-	struct sctp_ipv6addr_param addr_store;
 
 #ifdef INET6
+	struct sctp_ipv6addr_param addr6_store;
 	struct sockaddr_in6 sin6;
 
 #endif
 #ifdef INET
+	struct sctp_ipv4addr_param addr4_store;
 	struct sockaddr_in sin;
 
 #endif
@@ -2915,7 +2803,7 @@
 				a6p = (struct sctp_ipv6addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv6addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr6_store);
 				if (plen != sizeof(struct sctp_ipv6addr_param) ||
 				    a6p == NULL) {
 					return;
@@ -2934,7 +2822,7 @@
 				/* get the entire IPv4 address param */
 				a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv4addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr4_store);
 				if (plen != sizeof(struct sctp_ipv4addr_param) ||
 				    a4p == NULL) {
 					return;
@@ -3012,16 +2900,17 @@
 {
 	struct sctp_paramhdr tmp_param, *ph;
 	uint16_t plen, ptype;
-	struct sctp_ipv6addr_param addr_store;
 
 #ifdef INET
 	struct sockaddr_in *sin;
 	struct sctp_ipv4addr_param *a4p;
+	struct sctp_ipv6addr_param addr4_store;
 
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	struct sctp_ipv6addr_param *a6p;
+	struct sctp_ipv6addr_param addr6_store;
 	struct sockaddr_in6 sin6_tmp;
 
 #endif
@@ -3067,7 +2956,7 @@
 				a6p = (struct sctp_ipv6addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv6addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr6_store);
 				if (a6p == NULL) {
 					return (0);
 				}
@@ -3097,7 +2986,7 @@
 				a4p = (struct sctp_ipv4addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv4addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr4_store);
 				if (a4p == NULL) {
 					return (0);
 				}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_asconf.h
--- a/head/sys/netinet/sctp_asconf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_asconf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_asconf.h,v 1.8 2005/03/06 16:04:16 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.h 237715 2012-06-28 16:01:08Z tuexen $");
 
 #ifndef _NETINET_SCTP_ASCONF_H_
 #define _NETINET_SCTP_ASCONF_H_
@@ -48,8 +46,8 @@
 extern struct mbuf *sctp_compose_asconf(struct sctp_tcb *, int *, int);
 
 extern void
-sctp_handle_asconf(struct mbuf *, unsigned int, struct sctp_asconf_chunk *,
-    struct sctp_tcb *, int i);
+sctp_handle_asconf(struct mbuf *, unsigned int, struct sockaddr *,
+    struct sctp_asconf_chunk *, struct sctp_tcb *, int);
 
 extern void
 sctp_handle_asconf_ack(struct mbuf *, int, struct sctp_asconf_ack_chunk *,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_auth.c
--- a/head/sys/netinet/sctp_auth.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_auth.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.c 228907 2011-12-27 10:16:24Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.c 235828 2012-05-23 11:26:28Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp.h>
@@ -284,16 +284,16 @@
 	uint32_t i;
 
 	if (key == NULL) {
-		printf("%s: [Null key]\n", str);
+		SCTP_PRINTF("%s: [Null key]\n", str);
 		return;
 	}
-	printf("%s: len %u, ", str, key->keylen);
+	SCTP_PRINTF("%s: len %u, ", str, key->keylen);
 	if (key->keylen) {
 		for (i = 0; i < key->keylen; i++)
-			printf("%02x", key->key[i]);
-		printf("\n");
+			SCTP_PRINTF("%02x", key->key[i]);
+		SCTP_PRINTF("\n");
 	} else {
-		printf("[Null key]\n");
+		SCTP_PRINTF("[Null key]\n");
 	}
 }
 
@@ -303,16 +303,16 @@
 	uint32_t i;
 
 	if (key == NULL) {
-		printf("%s: [Null key]\n", str);
+		SCTP_PRINTF("%s: [Null key]\n", str);
 		return;
 	}
-	printf("%s: len %u, ", str, key->keylen);
+	SCTP_PRINTF("%s: len %u, ", str, key->keylen);
 	if (key->keylen) {
 		for (i = 0; i < key->keylen; i++)
-			printf("%02x", key->key[i]);
-		printf("\n");
+			SCTP_PRINTF("%02x", key->key[i]);
+		SCTP_PRINTF("\n");
 	} else {
-		printf("[Null key]\n");
+		SCTP_PRINTF("[Null key]\n");
 	}
 }
 
@@ -1801,7 +1801,7 @@
 			 * shared_key_id, (void
 			 * *)stcb->asoc.authinfo.recv_keyid);
 			 */
-			sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY,
+			sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY,
 			    shared_key_id, stcb->asoc.authinfo.recv_keyid,
 			    SCTP_SO_NOT_LOCKED);
 		/* compute a new recv assoc key and cache it */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_auth.h
--- a/head/sys/netinet/sctp_auth.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_auth.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,10 +31,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#ifndef __SCTP_AUTH_H__
-#define __SCTP_AUTH_H__
+#ifndef _NETINET_SCTP_AUTH_H_
+#define _NETINET_SCTP_AUTH_H_
 
 
 /* digest lengths */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_bsd_addr.c
--- a/head/sys/netinet/sctp_bsd_addr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_bsd_addr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.c 232866 2012-03-12 15:05:17Z rrs $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.c 237540 2012-06-24 21:25:54Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
@@ -424,11 +422,12 @@
 
 #ifdef SCTP_PACKET_LOGGING
 void
-sctp_packet_log(struct mbuf *m, int length)
+sctp_packet_log(struct mbuf *m)
 {
 	int *lenat, thisone;
 	void *copyto;
 	uint32_t *tick_tock;
+	int length;
 	int total_len;
 	int grabbed_lock = 0;
 	int value, newval, thisend, thisbegin;
@@ -438,6 +437,7 @@
 	 * (value) -ticks of log      (ticks) o -ip packet o -as logged -
 	 * where this started (thisbegin) x <--end points here
 	 */
+	length = SCTP_HEADER_LEN(m);
 	total_len = SCTP_SIZE32((length + (4 * sizeof(int))));
 	/* Log a packet to the buffer. */
 	if (total_len > SCTP_PACKET_LOG_SIZE) {
@@ -483,7 +483,7 @@
 	}
 	/* Sanity check */
 	if (thisend >= SCTP_PACKET_LOG_SIZE) {
-		printf("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n",
+		SCTP_PRINTF("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n",
 		    thisbegin,
 		    thisend,
 		    SCTP_BASE_VAR(packet_log_writers),
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_bsd_addr.h
--- a/head/sys/netinet/sctp_bsd_addr.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_bsd_addr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,10 +31,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.h 237540 2012-06-24 21:25:54Z tuexen $");
 
-#ifndef __sctp_bsd_addr_h__
-#define __sctp_bsd_addr_h__
+#ifndef _NETINET_SCTP_BSD_ADDR_H_
+#define _NETINET_SCTP_BSD_ADDR_H_
+
 #include <netinet/sctp_pcb.h>
 
 #if defined(_KERNEL) || defined(__Userspace__)
@@ -52,7 +53,7 @@
 
 #ifdef  SCTP_PACKET_LOGGING
 
-void sctp_packet_log(struct mbuf *m, int length);
+void sctp_packet_log(struct mbuf *m);
 int sctp_copy_out_packet_log(uint8_t * target, int length);
 
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_cc_functions.c
--- a/head/sys/netinet/sctp_cc_functions.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_cc_functions.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,9 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_cc_functions.c 235828 2012-05-23 11:26:28Z tuexen $");
+
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
@@ -44,8 +47,6 @@
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_dtrace_declare.h>
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_cc_functions.c 228907 2011-12-27 10:16:24Z tuexen $");
 
 #define SHIFT_MPTCP_MULTI_N 40
 #define SHIFT_MPTCP_MULTI_Z 16
@@ -1594,9 +1595,7 @@
 
 	cur_val = net->cwnd >> 10;
 	indx = SCTP_HS_TABLE_SIZE - 1;
-#ifdef SCTP_DEBUG
-	printf("HS CC CAlled.\n");
-#endif
+
 	if (cur_val < sctp_cwnd_adjust[0].cwnd) {
 		/* normal mode */
 		if (net->net_ack > net->mtu) {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_constants.h
--- a/head/sys/netinet/sctp_constants.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_constants.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_constants.h,v 1.17 2005/03/06 16:04:17 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 233660 2012-03-29 13:36:53Z rrs $");
-
-#ifndef __sctp_constants_h__
-#define __sctp_constants_h__
+#ifndef _NETINET_SCTP_CONSTANTS_H_
+#define _NETINET_SCTP_CONSTANTS_H_
 
 /* IANA assigned port number for SCTP over UDP encapsulation */
 /* For freebsd we cannot bind the port at
@@ -348,7 +346,7 @@
 #define SCTP_NO_FR_UNLESS_SEGMENT_SMALLER 1
 
 /* default max I can burst out after a fast retransmit, 0 disables it */
-#define SCTP_DEF_MAX_BURST 0
+#define SCTP_DEF_MAX_BURST 4
 #define SCTP_DEF_HBMAX_BURST 4
 #define SCTP_DEF_FRMAX_BURST 4
 
@@ -460,18 +458,6 @@
 #define SCTP_HAS_NAT_SUPPORT            0xc007
 #define SCTP_NAT_VTAGS                  0xc008
 
-/* Notification error codes */
-#define SCTP_NOTIFY_DATAGRAM_UNSENT	0x0001
-#define SCTP_NOTIFY_DATAGRAM_SENT	0x0002
-#define SCTP_FAILED_THRESHOLD		0x0004
-#define SCTP_HEARTBEAT_SUCCESS		0x0008
-#define SCTP_RESPONSE_TO_USER_REQ	0x0010
-#define SCTP_INTERNAL_ERROR		0x0020
-#define SCTP_SHUTDOWN_GUARD_EXPIRES	0x0040
-#define SCTP_RECEIVED_SACK		0x0080
-#define SCTP_PEER_FAULTY		0x0100
-#define SCTP_ICMP_REFUSED		0x0200
-
 /* bits for TOS field */
 #define SCTP_ECT0_BIT		0x02
 #define SCTP_ECT1_BIT		0x01
@@ -755,35 +741,29 @@
 #define SCTP_NOTIFY_ASSOC_DOWN                   2
 #define SCTP_NOTIFY_INTERFACE_DOWN               3
 #define SCTP_NOTIFY_INTERFACE_UP                 4
-#define SCTP_NOTIFY_DG_FAIL                      5
-#define SCTP_NOTIFY_STRDATA_ERR                  6
-#define SCTP_NOTIFY_ASSOC_ABORTED                7
-#define SCTP_NOTIFY_PEER_OPENED_STREAM           8
-#define SCTP_NOTIFY_STREAM_OPENED_OK             9
+#define SCTP_NOTIFY_SENT_DG_FAIL                 5
+#define SCTP_NOTIFY_UNSENT_DG_FAIL               6
+#define SCTP_NOTIFY_SPECIAL_SP_FAIL              7
+#define SCTP_NOTIFY_ASSOC_LOC_ABORTED            8
+#define SCTP_NOTIFY_ASSOC_REM_ABORTED            9
 #define SCTP_NOTIFY_ASSOC_RESTART               10
-#define SCTP_NOTIFY_HB_RESP                     11
-#define SCTP_NOTIFY_ASCONF_SUCCESS              12
-#define SCTP_NOTIFY_ASCONF_FAILED               13
-#define SCTP_NOTIFY_PEER_SHUTDOWN               14
-#define SCTP_NOTIFY_ASCONF_ADD_IP               15
-#define SCTP_NOTIFY_ASCONF_DELETE_IP            16
-#define SCTP_NOTIFY_ASCONF_SET_PRIMARY          17
-#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 18
-#define SCTP_NOTIFY_INTERFACE_CONFIRMED         20
-#define SCTP_NOTIFY_STR_RESET_RECV              21
-#define SCTP_NOTIFY_STR_RESET_SEND              22
-#define SCTP_NOTIFY_STR_RESET_FAILED_OUT        23
-#define SCTP_NOTIFY_STR_RESET_FAILED_IN         24
-#define SCTP_NOTIFY_AUTH_NEW_KEY                25
-#define SCTP_NOTIFY_AUTH_FREE_KEY               26
-#define SCTP_NOTIFY_SPECIAL_SP_FAIL             27
-#define SCTP_NOTIFY_NO_PEER_AUTH                28
-#define SCTP_NOTIFY_SENDER_DRY                  29
-#define SCTP_NOTIFY_STR_RESET_ADD_OK            30
-#define SCTP_NOTIFY_STR_RESET_ADD_FAIL          31
-#define SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK   32
-#define SCTP_NOTIFY_MAX                         32
-
+#define SCTP_NOTIFY_PEER_SHUTDOWN               11
+#define SCTP_NOTIFY_ASCONF_ADD_IP               12
+#define SCTP_NOTIFY_ASCONF_DELETE_IP            13
+#define SCTP_NOTIFY_ASCONF_SET_PRIMARY          14
+#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 15
+#define SCTP_NOTIFY_INTERFACE_CONFIRMED         16
+#define SCTP_NOTIFY_STR_RESET_RECV              17
+#define SCTP_NOTIFY_STR_RESET_SEND              18
+#define SCTP_NOTIFY_STR_RESET_FAILED_OUT        19
+#define SCTP_NOTIFY_STR_RESET_FAILED_IN         20
+#define SCTP_NOTIFY_STR_RESET_DENIED_OUT        21
+#define SCTP_NOTIFY_STR_RESET_DENIED_IN         22
+#define SCTP_NOTIFY_AUTH_NEW_KEY                23
+#define SCTP_NOTIFY_AUTH_FREE_KEY               24
+#define SCTP_NOTIFY_NO_PEER_AUTH                25
+#define SCTP_NOTIFY_SENDER_DRY                  26
+#define SCTP_NOTIFY_REMOTE_ERROR                27
 
 /* This is the value for messages that are NOT completely
  * copied down where we will start to split the message.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_crc32.c
--- a/head/sys/netinet/sctp_crc32.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_crc32.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,11 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $	 */
-
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.c 235828 2012-05-23 11:26:28Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp.h>
@@ -124,7 +121,9 @@
 sctp_delayed_cksum(struct mbuf *m, uint32_t offset)
 {
 #if defined(SCTP_WITH_NO_CSUM)
+#ifdef INVARIANTS
 	panic("sctp_delayed_cksum() called when using no SCTP CRC.");
+#endif
 #else
 	uint32_t checksum;
 
@@ -134,7 +133,7 @@
 	offset += offsetof(struct sctphdr, checksum);
 
 	if (offset + sizeof(uint32_t) > (uint32_t) (m->m_len)) {
-		printf("sctp_delayed_cksum(): m->len: %d,  off: %d.\n",
+		SCTP_PRINTF("sctp_delayed_cksum(): m->len: %d,  off: %d.\n",
 		    (uint32_t) m->m_len, offset);
 		/*
 		 * XXX this shouldn't happen, but if it does, the correct
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_crc32.h
--- a/head/sys/netinet/sctp_crc32.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_crc32.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_crc32.h,v 1.5 2004/08/17 04:06:16 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.h 228653 2011-12-17 19:21:40Z tuexen $");
-
-#ifndef __crc32c_h__
-#define __crc32c_h__
+#ifndef _NETINET_SCTP_CRC32_H_
+#define _NETINET_SCTP_CRC32_H_
 
 #if defined(_KERNEL)
 #if !defined(SCTP_WITH_NO_CSUM)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_dtrace_declare.h
--- a/head/sys/netinet/sctp_dtrace_declare.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_dtrace_declare.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,6 @@
 /*-
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,9 +28,13 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_declare.h 228653 2011-12-17 19:21:40Z tuexen $");
-#ifndef __sctp_dtrace_declare_h__
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_declare.h 235828 2012-05-23 11:26:28Z tuexen $");
+
+#ifndef _NETINET_SCTP_DTRACE_DECLARE_H_
+#define _NETINET_SCTP_DTRACE_DECLARE_H_
+
 #include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_dtrace_define.h
--- a/head/sys/netinet/sctp_dtrace_define.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_dtrace_define.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,6 @@
 /*-
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,9 +28,13 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_define.h 228653 2011-12-17 19:21:40Z tuexen $");
-#ifndef __sctp_dtrace_define_h__
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_define.h 235828 2012-05-23 11:26:28Z tuexen $");
+
+#ifndef _NETINET_SCTP_DTRACE_DEFINE_H_
+#define _NETINET_SCTP_DTRACE_DEFINE_H_
+
 #include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_header.h
--- a/head/sys/netinet/sctp_header.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_header.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_header.h,v 1.14 2005/03/06 16:04:17 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_header.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_header.h 233660 2012-03-29 13:36:53Z rrs $");
-
-#ifndef __sctp_header_h__
-#define __sctp_header_h__
+#ifndef _NETINET_SCTP_HEADER_H_
+#define _NETINET_SCTP_HEADER_H_
 
 #include <sys/time.h>
 #include <netinet/sctp.h>
@@ -499,12 +497,13 @@
 	uint16_t reserved;
 }                          SCTP_PACKED;
 
-#define SCTP_STREAM_RESET_NOTHING   0x00000000	/* Nothing for me to do */
-#define SCTP_STREAM_RESET_PERFORMED 0x00000001	/* Did it */
-#define SCTP_STREAM_RESET_REJECT    0x00000002	/* refused to do it */
-#define SCTP_STREAM_RESET_ERROR_STR 0x00000003	/* bad Stream no */
-#define SCTP_STREAM_RESET_TRY_LATER 0x00000004	/* collision, try again */
-#define SCTP_STREAM_RESET_BAD_SEQNO 0x00000005	/* bad str-reset seq no */
+#define SCTP_STREAM_RESET_RESULT_NOTHING_TO_DO   0x00000000	/* XXX: unused */
+#define SCTP_STREAM_RESET_RESULT_PERFORMED       0x00000001
+#define SCTP_STREAM_RESET_RESULT_DENIED          0x00000002
+#define SCTP_STREAM_RESET_RESULT_ERR__WRONG_SSN  0x00000003	/* XXX: unused */
+#define SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS 0x00000004
+#define SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO   0x00000005
+#define SCTP_STREAM_RESET_RESULT_IN_PROGRESS     0x00000006	/* XXX: unused */
 
 /*
  * convience structures, note that if you are making a request for specific
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_indata.c
--- a/head/sys/netinet/sctp_indata.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_indata.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_indata.c,v 1.36 2005/03/06 16:04:17 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_indata.c 234459 2012-04-19 12:43:19Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_indata.c 237715 2012-06-28 16:01:08Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
@@ -328,7 +326,7 @@
 	}
 	SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn);
 	if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
-		printf("gap:%x tsn:%x\n", gap, tsn);
+		SCTP_PRINTF("gap:%x tsn:%x\n", gap, tsn);
 		sctp_print_mapping_array(asoc);
 #ifdef INVARIANTS
 		panic("Things are really messed up now!!");
@@ -607,9 +605,7 @@
 			*ippp = ((control->sinfo_stream << 16) | control->sinfo_ssn);
 		}
 		stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+		sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return;
 
@@ -892,8 +888,7 @@
 
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 			} else if (asoc->fragmented_delivery_inprogress &&
 			    (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
@@ -924,8 +919,7 @@
 					*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 			} else if (asoc->fragmented_delivery_inprogress) {
 				/*
@@ -961,8 +955,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 				} else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) !=
 					    SCTP_DATA_UNORDERED &&
@@ -995,8 +988,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 				}
 			}
@@ -1090,8 +1082,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1127,9 +1118,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1166,9 +1155,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1202,9 +1189,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1247,9 +1232,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1289,9 +1272,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1328,9 +1309,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1367,9 +1346,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1531,7 +1508,7 @@
 		struct mbuf *op_err;
 
 		op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
-		sctp_abort_an_association(stcb->sctp_ep, stcb, 0, op_err, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return (0);
 	}
@@ -1552,7 +1529,7 @@
 		 */
 		if (stcb->sctp_socket->so_rcv.sb_cc) {
 			/* some to read, wake-up */
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 			struct socket *so;
 
 			so = SCTP_INP_SO(stcb->sctp_ep);
@@ -1568,7 +1545,7 @@
 			}
 #endif
 			sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 			SCTP_SOCKET_UNLOCK(so, 1);
 #endif
 		}
@@ -1678,8 +1655,7 @@
 
 		}
 		stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14;
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return (0);
 	}
@@ -1942,9 +1918,7 @@
 					*ippp = ((strmno << 16) | strmseq);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 				return (0);
 			} else {
@@ -1980,9 +1954,7 @@
 						*ippp = ((strmno << 16) | strmseq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return (0);
 				}
@@ -2027,9 +1999,7 @@
 						*ippp = ((strmno << 16) | strmseq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return (0);
 				}
@@ -2308,7 +2278,7 @@
 #ifdef INVARIANTS
 		for (i = 0; i < asoc->mapping_array_size; i++) {
 			if ((asoc->mapping_array[i]) || (asoc->nr_mapping_array[i])) {
-				printf("Error Mapping array's not clean at clear\n");
+				SCTP_PRINTF("Error Mapping array's not clean at clear\n");
 				sctp_print_mapping_array(asoc);
 			}
 		}
@@ -2330,7 +2300,7 @@
 #ifdef INVARIANTS
 			panic("impossible slide");
 #else
-			printf("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
+			SCTP_PRINTF("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
 			    lgap, slide_end, slide_from, at);
 			return;
 #endif
@@ -2339,7 +2309,7 @@
 #ifdef INVARIANTS
 			panic("would overrun buffer");
 #else
-			printf("Gak, would have overrun map end:%d slide_end:%d\n",
+			SCTP_PRINTF("Gak, would have overrun map end:%d slide_end:%d\n",
 			    asoc->mapping_array_size, slide_end);
 			slide_end = asoc->mapping_array_size;
 #endif
@@ -2546,8 +2516,11 @@
 
 int
 sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
-    struct sctphdr *sh, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
-    struct sctp_nets *net, uint32_t * high_tsn)
+    struct sockaddr *src, struct sockaddr *dst,
+    struct sctphdr *sh, struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t * high_tsn,
+    uint8_t use_mflowid, uint32_t mflowid,
+    uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_data_chunk *ch, chunk_buf;
 	struct sctp_association *asoc;
@@ -2654,8 +2627,10 @@
 
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
-				sctp_abort_association(inp, stcb, m, iphlen, sh,
-				    op_err, 0, net->port);
+				sctp_abort_association(inp, stcb, m, iphlen,
+				    src, dst, sh, op_err,
+				    use_mflowid, mflowid,
+				    vrf_id, port);
 				return (2);
 			}
 #ifdef SCTP_AUDITING_ENABLED
@@ -2719,7 +2694,12 @@
 					struct mbuf *op_err;
 
 					op_err = sctp_generate_invmanparam(SCTP_CAUSE_PROTOCOL_VIOLATION);
-					sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, 0, net->port);
+					sctp_abort_association(inp, stcb,
+					    m, iphlen,
+					    src, dst,
+					    sh, op_err,
+					    use_mflowid, mflowid,
+					    vrf_id, port);
 					return (2);
 				}
 				break;
@@ -2784,7 +2764,7 @@
 		/*
 		 * we need to report rwnd overrun drops.
 		 */
-		sctp_send_packet_dropped(stcb, net, *mm, iphlen, 0);
+		sctp_send_packet_dropped(stcb, net, *mm, length, iphlen, 0);
 	}
 	if (num_chunks) {
 		/*
@@ -3222,8 +3202,7 @@
 				if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
 					/* Yes so drop it */
 					if (tp1->data != NULL) {
-						(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+						(void)sctp_release_pr_sctp_chunk(stcb, tp1, 1,
 						    SCTP_SO_NOT_LOCKED);
 					}
 					continue;
@@ -3480,8 +3459,7 @@
 				if (tp1->snd_count > tp1->rec.data.timetodrop.tv_sec) {
 					/* Yes, so drop it */
 					if (tp1->data != NULL) {
-						(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+						(void)sctp_release_pr_sctp_chunk(stcb, tp1, 1,
 						    SCTP_SO_NOT_LOCKED);
 					}
 					/* Make sure to flag we had a FR */
@@ -3489,7 +3467,10 @@
 					continue;
 				}
 			}
-			/* printf("OK, we are now ready to FR this guy\n"); */
+			/*
+			 * SCTP_PRINTF("OK, we are now ready to FR this
+			 * guy\n");
+			 */
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
 				sctp_log_fr(tp1->rec.data.TSN_seq, tp1->snd_count,
 				    0, SCTP_FR_MARKED);
@@ -3557,7 +3538,7 @@
 			tot_retrans++;
 			/* mark the sending seq for possible subsequent FR's */
 			/*
-			 * printf("Marking TSN for FR new value %x\n",
+			 * SCTP_PRINTF("Marking TSN for FR new value %x\n",
 			 * (uint32_t)tpi->rec.data.TSN_seq);
 			 */
 			if (TAILQ_EMPTY(&asoc->send_queue)) {
@@ -3657,8 +3638,7 @@
 				/* Yes so drop it */
 				if (tp1->data) {
 					(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-					    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
-					    SCTP_SO_NOT_LOCKED);
+					    1, SCTP_SO_NOT_LOCKED);
 				}
 			} else {
 				/*
@@ -3709,11 +3689,10 @@
 
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		if (chk->sent < SCTP_DATAGRAM_RESEND) {
-			printf("Chk TSN:%u size:%d inflight cnt:%d\n",
+			SCTP_PRINTF("Chk TSN:%u size:%d inflight cnt:%d\n",
 			    chk->rec.data.TSN_seq,
 			    chk->send_size,
-			    chk->snd_count
-			    );
+			    chk->snd_count);
 			inflight++;
 		} else if (chk->sent == SCTP_DATAGRAM_RESEND) {
 			resend++;
@@ -3730,7 +3709,7 @@
 #ifdef INVARIANTS
 		panic("Flight size-express incorrect? \n");
 #else
-		printf("asoc->total_flight:%d cnt:%d\n",
+		SCTP_PRINTF("asoc->total_flight:%d cnt:%d\n",
 		    entry_flight, entry_cnt);
 
 		SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n",
@@ -3876,7 +3855,7 @@
 				*ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25);
 			}
 			stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
-			sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+			sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 			return;
 #endif
 		}
@@ -3895,7 +3874,7 @@
 		TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) {
 			if (SCTP_TSN_GE(cumack, tp1->rec.data.TSN_seq)) {
 				if (tp1->sent == SCTP_DATAGRAM_UNSENT) {
-					printf("Warning, an unsent is now acked?\n");
+					SCTP_PRINTF("Warning, an unsent is now acked?\n");
 				}
 				if (tp1->sent < SCTP_DATAGRAM_ACKED) {
 					/*
@@ -4005,7 +3984,7 @@
 	}
 	/* sa_ignore NO_NULL_CHK */
 	if (stcb->sctp_socket) {
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		struct socket *so;
 
 #endif
@@ -4014,7 +3993,7 @@
 			/* sa_ignore NO_NULL_CHK */
 			sctp_wakeup_log(stcb, 1, SCTP_WAKESND_FROM_SACK);
 		}
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		so = SCTP_INP_SO(stcb->sctp_ep);
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 		SCTP_TCB_UNLOCK(stcb);
@@ -4028,7 +4007,7 @@
 		}
 #endif
 		sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket);
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		SCTP_SOCKET_UNLOCK(so, 1);
 #endif
 	} else {
@@ -4050,7 +4029,7 @@
 					/* addr came good */
 					net->dest_state |= SCTP_ADDR_REACHABLE;
 					sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
-					    SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED);
+					    0, (void *)net, SCTP_SO_NOT_LOCKED);
 				}
 				if (net == stcb->asoc.primary_destination) {
 					if (stcb->asoc.alternate) {
@@ -4238,7 +4217,7 @@
 					*ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_24);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24;
-				sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED);