[Zrouter-src-freebsd] ZRouter.org: push to FreeBSD HEAD tree

zrouter-src-freebsd at zrouter.org zrouter-src-freebsd at zrouter.org
Wed Jul 25 14:36:09 UTC 2012


details:   http://zrouter.org/hg/FreeBSD/head//rev/fc630f3c8529
changeset: 493:fc630f3c8529
user:      Aleksandr Rybalko <ray at ddteam.net>
date:      Wed Jul 25 16:40:53 2012 +0300
description:
Lazy update

diffstat:

 head/sys/amd64/acpica/acpi_machdep.c           |     4 +-
 head/sys/amd64/acpica/acpi_switch.S            |   177 -
 head/sys/amd64/acpica/acpi_wakecode.S          |    39 +-
 head/sys/amd64/acpica/acpi_wakeup.c            |   420 ----
 head/sys/amd64/amd64/cpu_switch.S              |   166 +-
 head/sys/amd64/amd64/db_disasm.c               |   179 +-
 head/sys/amd64/amd64/fpu.c                     |   187 +-
 head/sys/amd64/amd64/genassym.c                |     9 +-
 head/sys/amd64/amd64/machdep.c                 |     9 +-
 head/sys/amd64/amd64/mem.c                     |     4 +-
 head/sys/amd64/amd64/minidump_machdep.c        |     8 +-
 head/sys/amd64/amd64/mp_machdep.c              |   110 +-
 head/sys/amd64/amd64/pmap.c                    |  1197 +++++++----
 head/sys/amd64/amd64/ptrace_machdep.c          |     9 +-
 head/sys/amd64/amd64/trap.c                    |    30 +-
 head/sys/amd64/amd64/vm_machdep.c              |     6 +-
 head/sys/amd64/conf/GENERIC                    |    10 +-
 head/sys/amd64/include/atomic.h                |    76 +-
 head/sys/amd64/include/cpufunc.h               |    40 +-
 head/sys/amd64/include/elf.h                   |     3 +-
 head/sys/amd64/include/fpu.h                   |     5 +-
 head/sys/amd64/include/in_cksum.h              |     6 +-
 head/sys/amd64/include/intr_machdep.h          |     4 +-
 head/sys/amd64/include/md_var.h                |     3 +-
 head/sys/amd64/include/pcb.h                   |    18 +-
 head/sys/amd64/include/pcpu.h                  |    24 +-
 head/sys/amd64/include/pmap.h                  |     5 +-
 head/sys/amd64/include/smp.h                   |     3 +-
 head/sys/amd64/include/vdso.h                  |     6 +
 head/sys/amd64/include/vmparam.h               |     4 +-
 head/sys/amd64/linux32/linux.h                 |     3 +-
 head/sys/amd64/linux32/linux32_dummy.c         |    11 +-
 head/sys/amd64/linux32/linux32_proto.h         |     8 +-
 head/sys/amd64/linux32/linux32_syscall.h       |     4 +-
 head/sys/amd64/linux32/linux32_syscalls.c      |     4 +-
 head/sys/amd64/linux32/linux32_sysent.c        |     4 +-
 head/sys/amd64/linux32/linux32_systrace_args.c |    10 +-
 head/sys/amd64/linux32/syscalls.master         |     6 +-
 head/sys/fs/cd9660/cd9660_node.c               |     5 +-
 head/sys/fs/cd9660/cd9660_vfsops.c             |     6 +-
 head/sys/fs/devfs/devfs_vnops.c                |    17 +-
 head/sys/fs/ext2fs/ext2_inode.c                |     6 +-
 head/sys/fs/ext2fs/ext2_lookup.c               |    60 +-
 head/sys/fs/ext2fs/ext2_vfsops.c               |     8 +-
 head/sys/fs/ext2fs/ext2_vnops.c                |     8 +-
 head/sys/fs/hpfs/hpfs_vnops.c                  |    25 +-
 head/sys/fs/msdosfs/denode.h                   |     4 +-
 head/sys/fs/msdosfs/msdosfs_denode.c           |    14 +-
 head/sys/fs/msdosfs/msdosfs_lookup.c           |     6 +-
 head/sys/fs/msdosfs/msdosfs_vnops.c            |    11 +-
 head/sys/fs/nandfs/bmap.c                      |   621 ++++++
 head/sys/fs/nandfs/bmap.h                      |    40 +
 head/sys/fs/nandfs/nandfs.h                    |   310 +++
 head/sys/fs/nandfs/nandfs_alloc.c              |   364 +++
 head/sys/fs/nandfs/nandfs_bmap.c               |   230 ++
 head/sys/fs/nandfs/nandfs_buffer.c             |    83 +
 head/sys/fs/nandfs/nandfs_cleaner.c            |   620 ++++++
 head/sys/fs/nandfs/nandfs_cpfile.c             |   776 +++++++
 head/sys/fs/nandfs/nandfs_dat.c                |   344 +++
 head/sys/fs/nandfs/nandfs_dir.c                |   314 +++
 head/sys/fs/nandfs/nandfs_fs.h                 |   565 +++++
 head/sys/fs/nandfs/nandfs_ifile.c              |   213 ++
 head/sys/fs/nandfs/nandfs_mount.h              |    50 +
 head/sys/fs/nandfs/nandfs_segment.c            |  1329 ++++++++++++
 head/sys/fs/nandfs/nandfs_subr.c               |  1120 ++++++++++
 head/sys/fs/nandfs/nandfs_subr.h               |   238 ++
 head/sys/fs/nandfs/nandfs_sufile.c             |   569 +++++
 head/sys/fs/nandfs/nandfs_vfsops.c             |  1590 +++++++++++++++
 head/sys/fs/nandfs/nandfs_vnops.c              |  2455 ++++++++++++++++++++++++
 head/sys/fs/nfs/nfs_commonacl.c                |     6 +-
 head/sys/fs/nfsclient/nfs_clbio.c              |    80 +-
 head/sys/fs/nfsclient/nfs_clnode.c             |    42 +-
 head/sys/fs/nfsclient/nfs_clvfsops.c           |     5 +-
 head/sys/fs/nfsclient/nfs_clvnops.c            |    23 +-
 head/sys/fs/nfsclient/nfsnode.h                |     3 +-
 head/sys/fs/nfsserver/nfs_nfsdport.c           |     9 +-
 head/sys/fs/nfsserver/nfs_nfsdstate.c          |    17 +-
 head/sys/fs/ntfs/ntfs.h                        |   318 +-
 head/sys/fs/ntfs/ntfs_subr.c                   |   170 +-
 head/sys/fs/ntfs/ntfs_subr.h                   |     4 +-
 head/sys/fs/ntfs/ntfs_vfsops.c                 |    84 +-
 head/sys/fs/ntfs/ntfs_vnops.c                  |   152 +-
 head/sys/fs/nullfs/null_vnops.c                |     5 +-
 head/sys/fs/portalfs/portal_vnops.c            |    10 +-
 head/sys/fs/smbfs/smbfs_node.c                 |    19 +-
 head/sys/fs/tmpfs/tmpfs_vnops.c                |     5 +-
 head/sys/fs/udf/udf_vfsops.c                   |     4 +-
 head/sys/fs/unionfs/union_subr.c               |    25 +-
 head/sys/fs/unionfs/union_vfsops.c             |    12 +-
 head/sys/fs/unionfs/union_vnops.c              |   305 +-
 head/sys/i386/acpica/acpi_machdep.c            |     4 +-
 head/sys/i386/acpica/acpi_wakecode.S           |   349 +-
 head/sys/i386/acpica/acpi_wakeup.c             |   371 ---
 head/sys/i386/conf/GENERIC                     |     8 +-
 head/sys/i386/conf/XEN                         |     4 +-
 head/sys/i386/i386/apic_vector.s               |    22 +-
 head/sys/i386/i386/bios.c                      |     6 +-
 head/sys/i386/i386/elf_machdep.c               |     7 +-
 head/sys/i386/i386/genassym.c                  |    15 +-
 head/sys/i386/i386/initcpu.c                   |     3 +-
 head/sys/i386/i386/machdep.c                   |    26 +-
 head/sys/i386/i386/mem.c                       |     4 +-
 head/sys/i386/i386/minidump_machdep.c          |     8 +-
 head/sys/i386/i386/mp_machdep.c                |   137 +-
 head/sys/i386/i386/pmap.c                      |   416 ++-
 head/sys/i386/i386/ptrace_machdep.c            |     4 +-
 head/sys/i386/i386/swtch.s                     |   111 +-
 head/sys/i386/i386/trap.c                      |    12 +-
 head/sys/i386/i386/vm86.c                      |     3 +-
 head/sys/i386/include/apicvar.h                |     5 +-
 head/sys/i386/include/atomic.h                 |    80 +-
 head/sys/i386/include/bootinfo.h               |    10 +-
 head/sys/i386/include/cpufunc.h                |    12 +-
 head/sys/i386/include/elf.h                    |     3 +-
 head/sys/i386/include/in_cksum.h               |     8 +-
 head/sys/i386/include/intr_machdep.h           |     4 +-
 head/sys/i386/include/md_var.h                 |     3 +-
 head/sys/i386/include/npx.h                    |     5 +-
 head/sys/i386/include/pcb.h                    |    17 +-
 head/sys/i386/include/pmap.h                   |     5 +-
 head/sys/i386/include/smp.h                    |     7 +-
 head/sys/i386/include/vdso.h                   |     6 +
 head/sys/i386/include/vmparam.h                |     5 +-
 head/sys/i386/isa/npx.c                        |    79 +-
 head/sys/i386/linux/linux.h                    |     3 +-
 head/sys/i386/linux/linux_dummy.c              |    11 +-
 head/sys/i386/xen/pmap.c                       |   220 +-
 head/sys/ia64/acpica/acpi_wakeup.c             |     9 +-
 head/sys/ia64/ia64/busdma_machdep.c            |    14 +-
 head/sys/ia64/ia64/machdep.c                   |   241 +-
 head/sys/ia64/ia64/mp_machdep.c                |    10 +-
 head/sys/ia64/ia64/nexus.c                     |    11 +-
 head/sys/ia64/ia64/physmem.c                   |   258 ++
 head/sys/ia64/ia64/pmap.c                      |    81 +-
 head/sys/ia64/include/_stdint.h                |     8 +-
 head/sys/ia64/include/_types.h                 |     6 +-
 head/sys/ia64/include/elf.h                    |     3 +-
 head/sys/ia64/include/in_cksum.h               |     6 +-
 head/sys/ia64/include/md_var.h                 |    13 +-
 head/sys/ia64/include/param.h                  |     5 +-
 head/sys/ia64/include/pcb.h                    |     6 +-
 head/sys/ia64/include/pmap.h                   |     3 +-
 head/sys/ia64/include/vdso.h                   |    41 +
 head/sys/kern/capabilities.conf                |     8 +-
 head/sys/kern/dtio_kdtrace.c                   |   232 ++
 head/sys/kern/imgact_aout.c                    |    15 +-
 head/sys/kern/imgact_elf.c                     |    33 +-
 head/sys/kern/imgact_gzip.c                    |     6 +-
 head/sys/kern/init_main.c                      |    37 +-
 head/sys/kern/init_sysent.c                    |    14 +-
 head/sys/kern/kern_acct.c                      |    25 +-
 head/sys/kern/kern_clock.c                     |     8 +-
 head/sys/kern/kern_conf.c                      |     9 +-
 head/sys/kern/kern_descrip.c                   |   552 ++--
 head/sys/kern/kern_event.c                     |    21 +-
 head/sys/kern/kern_exec.c                      |    67 +-
 head/sys/kern/kern_fork.c                      |    13 +-
 head/sys/kern/kern_jail.c                      |    23 +-
 head/sys/kern/kern_kthread.c                   |     4 +-
 head/sys/kern/kern_malloc.c                    |     8 +-
 head/sys/kern/kern_proc.c                      |    42 +-
 head/sys/kern/kern_racct.c                     |     7 +-
 head/sys/kern/kern_rangelock.c                 |   246 ++
 head/sys/kern/kern_sharedpage.c                |   240 ++
 head/sys/kern/kern_shutdown.c                  |    11 +-
 head/sys/kern/kern_sig.c                       |     7 +-
 head/sys/kern/kern_synch.c                     |    19 +-
 head/sys/kern/kern_tc.c                        |    86 +-
 head/sys/kern/kern_thr.c                       |     3 +-
 head/sys/kern/kern_thread.c                    |    11 +-
 head/sys/kern/kern_timeout.c                   |   359 +-
 head/sys/kern/sched_4bsd.c                     |    41 +-
 head/sys/kern/sched_ule.c                      |    40 +-
 head/sys/kern/subr_bus.c                       |     4 +-
 head/sys/kern/subr_devstat.c                   |    60 +-
 head/sys/kern/subr_dummy_vdso_tc.c             |    49 +
 head/sys/kern/subr_firmware.c                  |     4 +-
 head/sys/kern/subr_rman.c                      |    19 +-
 head/sys/kern/subr_sleepqueue.c                |    10 +-
 head/sys/kern/subr_smp.c                       |    17 +-
 head/sys/kern/subr_syscall.c                   |     8 +-
 head/sys/kern/subr_trap.c                      |     3 +-
 head/sys/kern/subr_turnstile.c                 |    12 +-
 head/sys/kern/subr_witness.c                   |    17 +-
 head/sys/kern/sys_capability.c                 |     6 +-
 head/sys/kern/sys_generic.c                    |     4 +-
 head/sys/kern/sys_procdesc.c                   |     6 +-
 head/sys/kern/sys_process.c                    |    10 +-
 head/sys/kern/syscalls.c                       |     4 +-
 head/sys/kern/syscalls.master                  |     6 +-
 head/sys/kern/systrace_args.c                  |    10 +-
 head/sys/kern/tty.c                            |    31 +-
 head/sys/kern/uipc_mqueue.c                    |     6 +-
 head/sys/kern/uipc_socket.c                    |     4 +-
 head/sys/kern/uipc_syscalls.c                  |    25 +-
 head/sys/kern/uipc_usrreq.c                    |     4 +-
 head/sys/kern/vfs_bio.c                        |    20 +-
 head/sys/kern/vfs_default.c                    |    19 +-
 head/sys/kern/vfs_subr.c                       |    15 +-
 head/sys/kern/vfs_syscalls.c                   |   302 +-
 head/sys/kern/vfs_vnops.c                      |   743 +++++-
 head/sys/netinet/icmp_var.h                    |     5 +-
 head/sys/netinet/if_ether.c                    |    15 +-
 head/sys/netinet/if_ether.h                    |    12 +-
 head/sys/netinet/igmp.c                        |    14 +-
 head/sys/netinet/in.c                          |     4 +-
 head/sys/netinet/in.h                          |     4 +-
 head/sys/netinet/in_pcb.c                      |     6 +-
 head/sys/netinet/in_pcb.h                      |     5 +-
 head/sys/netinet/in_var.h                      |     8 +-
 head/sys/netinet/ip.h                          |    27 +-
 head/sys/netinet/ip_carp.c                     |    11 +-
 head/sys/netinet/ip_fw.h                       |     2 +-
 head/sys/netinet/ip_icmp.c                     |     5 +-
 head/sys/netinet/ip_input.c                    |    11 +-
 head/sys/netinet/ip_mroute.c                   |     5 +-
 head/sys/netinet/ip_mroute.h                   |     3 +-
 head/sys/netinet/ip_output.c                   |    64 +-
 head/sys/netinet/ipfw/ip_dummynet.c            |     4 +-
 head/sys/netinet/ipfw/ip_fw_log.c              |   139 +-
 head/sys/netinet/ipfw/ip_fw_private.h          |     2 +-
 head/sys/netinet/ipfw/ip_fw_table.c            |    15 +-
 head/sys/netinet/libalias/alias_sctp.h         |     3 +-
 head/sys/netinet/libalias/libalias.3           |    16 +-
 head/sys/netinet/sctp.h                        |    80 +-
 head/sys/netinet/sctp_asconf.c                 |   189 +-
 head/sys/netinet/sctp_asconf.h                 |    12 +-
 head/sys/netinet/sctp_auth.c                   |    28 +-
 head/sys/netinet/sctp_auth.h                   |    10 +-
 head/sys/netinet/sctp_bsd_addr.c               |    14 +-
 head/sys/netinet/sctp_bsd_addr.h               |    13 +-
 head/sys/netinet/sctp_cc_functions.c           |    13 +-
 head/sys/netinet/sctp_constants.h              |    78 +-
 head/sys/netinet/sctp_crc32.c                  |    13 +-
 head/sys/netinet/sctp_crc32.h                  |    14 +-
 head/sys/netinet/sctp_dtrace_declare.h         |    12 +-
 head/sys/netinet/sctp_dtrace_define.h          |    12 +-
 head/sys/netinet/sctp_header.h                 |    27 +-
 head/sys/netinet/sctp_indata.c                 |   170 +-
 head/sys/netinet/sctp_indata.h                 |    22 +-
 head/sys/netinet/sctp_input.c                  |  1134 +++++-----
 head/sys/netinet/sctp_input.h                  |    24 +-
 head/sys/netinet/sctp_lock_bsd.h               |    15 +-
 head/sys/netinet/sctp_os.h                     |    12 +-
 head/sys/netinet/sctp_os_bsd.h                 |    45 +-
 head/sys/netinet/sctp_output.c                 |  1250 +++--------
 head/sys/netinet/sctp_output.h                 |    41 +-
 head/sys/netinet/sctp_pcb.c                    |   299 +--
 head/sys/netinet/sctp_pcb.h                    |    22 +-
 head/sys/netinet/sctp_peeloff.c                |    10 +-
 head/sys/netinet/sctp_peeloff.h                |    14 +-
 head/sys/netinet/sctp_ss_functions.c           |     8 +-
 head/sys/netinet/sctp_structs.h                |    14 +-
 head/sys/netinet/sctp_sysctl.c                 |    21 +-
 head/sys/netinet/sctp_sysctl.h                 |    17 +-
 head/sys/netinet/sctp_timer.c                  |    20 +-
 head/sys/netinet/sctp_timer.h                  |    11 +-
 head/sys/netinet/sctp_uio.h                    |    99 +-
 head/sys/netinet/sctp_usrreq.c                 |   180 +-
 head/sys/netinet/sctp_var.h                    |     8 +-
 head/sys/netinet/sctputil.c                    |   774 ++++---
 head/sys/netinet/sctputil.h                    |    41 +-
 head/sys/netinet/tcp_hostcache.c               |     4 +-
 head/sys/netinet/tcp_input.c                   |    61 +-
 head/sys/netinet/tcp_lro.c                     |   888 +++++---
 head/sys/netinet/tcp_lro.h                     |   123 +-
 head/sys/netinet/tcp_offload.c                 |   209 +-
 head/sys/netinet/tcp_offload.h                 |   364 +---
 head/sys/netinet/tcp_output.c                  |    69 +-
 head/sys/netinet/tcp_subr.c                    |    36 +-
 head/sys/netinet/tcp_syncache.c                |   147 +-
 head/sys/netinet/tcp_syncache.h                |    21 +-
 head/sys/netinet/tcp_timer.c                   |     7 +-
 head/sys/netinet/tcp_timewait.c                |    11 +-
 head/sys/netinet/tcp_usrreq.c                  |    77 +-
 head/sys/netinet/tcp_var.h                     |     4 +-
 head/sys/netinet/toecore.c                     |   575 +++++
 head/sys/netinet/toecore.h                     |   130 +
 head/sys/netinet/toedev.h                      |   162 -
 head/sys/netinet/udp_usrreq.c                  |    18 +-
 head/sys/pc98/conf/GENERIC                     |     4 +-
 head/sys/pc98/include/vdso.h                   |     6 +
 head/sys/pc98/pc98/machdep.c                   |    11 +-
 head/sys/powerpc/aim/locore32.S                |     9 +-
 head/sys/powerpc/aim/locore64.S                |     9 +-
 head/sys/powerpc/aim/mmu_oea.c                 |   165 +-
 head/sys/powerpc/aim/mmu_oea64.c               |   186 +-
 head/sys/powerpc/aim/moea64_native.c           |    47 +-
 head/sys/powerpc/aim/slb.c                     |     6 +-
 head/sys/powerpc/aim/swtch32.S                 |     5 +-
 head/sys/powerpc/aim/swtch64.S                 |     5 +-
 head/sys/powerpc/booke/locore.S                |    22 +-
 head/sys/powerpc/booke/machdep.c               |    82 +-
 head/sys/powerpc/booke/machdep_e500.c          |   158 +
 head/sys/powerpc/booke/machdep_ppc4xx.c        |   219 ++
 head/sys/powerpc/booke/platform_bare.c         |    63 +-
 head/sys/powerpc/booke/pmap.c                  |    52 +-
 head/sys/powerpc/booke/trap.c                  |     9 +-
 head/sys/powerpc/booke/trap_subr.S             |     4 +-
 head/sys/powerpc/conf/DEFAULTS                 |     4 +-
 head/sys/powerpc/conf/GENERIC                  |    11 +-
 head/sys/powerpc/conf/GENERIC64                |    22 +-
 head/sys/powerpc/conf/MPC85XX                  |     5 +-
 head/sys/powerpc/conf/NOTES                    |     5 +-
 head/sys/powerpc/include/_stdint.h             |     8 +-
 head/sys/powerpc/include/_types.h              |     6 +-
 head/sys/powerpc/include/atomic.h              |    61 +-
 head/sys/powerpc/include/cpu.h                 |     4 +-
 head/sys/powerpc/include/cpufunc.h             |    18 +-
 head/sys/powerpc/include/elf.h                 |     5 +-
 head/sys/powerpc/include/hid.h                 |    55 +-
 head/sys/powerpc/include/in_cksum.h            |     6 +-
 head/sys/powerpc/include/machdep.h             |    39 +
 head/sys/powerpc/include/pcpu.h                |     4 +-
 head/sys/powerpc/include/pio.h                 |    57 +-
 head/sys/powerpc/include/pmap.h                |    19 +-
 head/sys/powerpc/include/profile.h             |     9 +-
 head/sys/powerpc/include/psl.h                 |    30 +-
 head/sys/powerpc/include/pte.h                 |    29 +-
 head/sys/powerpc/include/spr.h                 |   228 +-
 head/sys/powerpc/include/tlb.h                 |    86 +-
 head/sys/powerpc/include/trap.h                |     4 +-
 head/sys/powerpc/include/ucontext.h            |     8 +-
 head/sys/powerpc/include/vdso.h                |    41 +
 head/sys/powerpc/include/vmparam.h             |     4 +-
 head/sys/powerpc/mpc85xx/fsl_sdhc.c            |  1306 ++++++++++++
 head/sys/powerpc/mpc85xx/fsl_sdhc.h            |   297 ++
 head/sys/powerpc/mpc85xx/i2c.c                 |     5 +-
 head/sys/powerpc/mpc85xx/lbc.c                 |   303 ++-
 head/sys/powerpc/mpc85xx/lbc.h                 |    62 +-
 head/sys/powerpc/mpc85xx/mpc85xx.c             |    13 +-
 head/sys/powerpc/mpc85xx/nexus.c               |    62 +-
 head/sys/powerpc/mpc85xx/openpic_fdt.c         |    93 -
 head/sys/powerpc/mpc85xx/pci_fdt.c             |    11 +-
 head/sys/powerpc/powermac/hrowpic.c            |     4 +-
 head/sys/powerpc/powerpc/atomic.S              |   137 -
 head/sys/powerpc/powerpc/bus_machdep.c         |    82 +-
 head/sys/powerpc/powerpc/cpu.c                 |    26 +-
 head/sys/powerpc/powerpc/db_trace.c            |     6 +-
 head/sys/powerpc/powerpc/gdb_machdep.c         |     4 +-
 head/sys/powerpc/powerpc/genassym.c            |    28 +-
 head/sys/powerpc/powerpc/mmu_if.m              |    12 +-
 head/sys/powerpc/powerpc/openpic_fdt.c         |    93 +
 head/sys/powerpc/powerpc/platform.c            |     6 +-
 head/sys/powerpc/powerpc/pmap_dispatch.c       |    24 +-
 head/sys/sparc64/conf/GENERIC                  |    35 +-
 head/sys/sparc64/include/_stdint.h             |     8 +-
 head/sys/sparc64/include/_types.h              |     6 +-
 head/sys/sparc64/include/elf.h                 |     3 +-
 head/sys/sparc64/include/in_cksum.h            |     6 +-
 head/sys/sparc64/include/intr_machdep.h        |     6 +-
 head/sys/sparc64/include/pcb.h                 |     4 +-
 head/sys/sparc64/include/pmap.h                |     5 +-
 head/sys/sparc64/include/vdso.h                |    34 +
 head/sys/sparc64/sparc64/intr_machdep.c        |     9 +-
 head/sys/sparc64/sparc64/machdep.c             |     4 +-
 head/sys/sparc64/sparc64/pmap.c                |   100 +-
 head/sys/sparc64/sparc64/tsb.c                 |     5 +-
 358 files changed, 25573 insertions(+), 9531 deletions(-)

diffs (53825 lines):

diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_machdep.c
--- a/head/sys/amd64/acpica/acpi_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/acpica/acpi_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/acpica/acpi_machdep.c 235556 2012-05-17 17:58:53Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,8 +44,6 @@
 
 #include <machine/nexusvar.h>
 
-SYSCTL_DECL(_debug_acpi);
-
 int acpi_resume_beep;
 TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep);
 SYSCTL_INT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_switch.S
--- a/head/sys/amd64/acpica/acpi_switch.S	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,177 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: head/sys/amd64/acpica/acpi_switch.S 230958 2012-02-03 21:24:28Z jkim $
- */
-
-#include <machine/asmacros.h>
-#include <machine/specialreg.h>
-
-#include "acpi_wakedata.h"
-#include "assym.s"
-
-#define	WAKEUP_CTX(member)	wakeup_ ## member - wakeup_ctx(%rsi)
-
-ENTRY(acpi_restorecpu)
-	/* Switch to KPML4phys. */
-	movq	%rdi, %cr3
-
-	/* Restore GDT. */
-	lgdt	WAKEUP_CTX(gdt)
-	jmp	1f
-1:
-
-	/* Fetch PCB. */
-	movq	WAKEUP_CTX(pcb), %rdi
-
-	/* Force kernel segment registers. */
-	movl	$KDSEL, %eax
-	movw	%ax, %ds
-	movw	%ax, %es
-	movw	%ax, %ss
-	movl	$KUF32SEL, %eax
-	movw	%ax, %fs
-	movl	$KUG32SEL, %eax
-	movw	%ax, %gs
-
-	movl	$MSR_FSBASE, %ecx
-	movl	PCB_FSBASE(%rdi), %eax
-	movl	4 + PCB_FSBASE(%rdi), %edx
-	wrmsr
-	movl	$MSR_GSBASE, %ecx
-	movl	PCB_GSBASE(%rdi), %eax
-	movl	4 + PCB_GSBASE(%rdi), %edx
-	wrmsr
-	movl	$MSR_KGSBASE, %ecx
-	movl	PCB_KGSBASE(%rdi), %eax
-	movl	4 + PCB_KGSBASE(%rdi), %edx
-	wrmsr
-
-	/* Restore EFER. */
-	movl	$MSR_EFER, %ecx
-	movl	WAKEUP_CTX(efer), %eax
-	wrmsr
-
-	/* Restore fast syscall stuff. */
-	movl	$MSR_STAR, %ecx
-	movl	WAKEUP_CTX(star), %eax
-	movl	4 + WAKEUP_CTX(star), %edx
-	wrmsr
-	movl	$MSR_LSTAR, %ecx
-	movl	WAKEUP_CTX(lstar), %eax
-	movl	4 + WAKEUP_CTX(lstar), %edx
-	wrmsr
-	movl	$MSR_CSTAR, %ecx
-	movl	WAKEUP_CTX(cstar), %eax
-	movl	4 + WAKEUP_CTX(cstar), %edx
-	wrmsr
-	movl	$MSR_SF_MASK, %ecx
-	movl	WAKEUP_CTX(sfmask), %eax
-	wrmsr
-
-	/* Restore CR0 except for FPU mode. */
-	movq	PCB_CR0(%rdi), %rax
-	andq	$~(CR0_EM | CR0_TS), %rax
-	movq	%rax, %cr0
-
-	/* Restore CR2 and CR4. */
-	movq	PCB_CR2(%rdi), %rax
-	movq	%rax, %cr2
-	movq	PCB_CR4(%rdi), %rax
-	movq	%rax, %cr4
-
-	/* Restore descriptor tables. */
-	lidt	PCB_IDT(%rdi)
-	lldt	PCB_LDT(%rdi)
-
-#define	SDT_SYSTSS	9
-#define	SDT_SYSBSY	11
-
-	/* Clear "task busy" bit and reload TR. */
-	movq	PCPU(TSS), %rax
-	andb	$(~SDT_SYSBSY | SDT_SYSTSS), 5(%rax)
-	movw	PCB_TR(%rdi), %ax
-	ltr	%ax
-
-#undef	SDT_SYSTSS
-#undef	SDT_SYSBSY
-
-	/* Restore debug registers. */
-	movq	PCB_DR0(%rdi), %rax
-	movq	%rax, %dr0
-	movq	PCB_DR1(%rdi), %rax
-	movq	%rax, %dr1
-	movq	PCB_DR2(%rdi), %rax
-	movq	%rax, %dr2
-	movq	PCB_DR3(%rdi), %rax
-	movq	%rax, %dr3
-	movq	PCB_DR6(%rdi), %rax
-	movq	%rax, %dr6
-	movq	PCB_DR7(%rdi), %rax
-	movq	%rax, %dr7
-
-	/* Restore FPU state. */
-	fninit
-	movq	WAKEUP_CTX(fpusave), %rbx
-	movq	WAKEUP_CTX(xsmask), %rax
-	testq	%rax, %rax
-	jz	1f
-	movq	%rax, %rdx
-	shrq	$32, %rdx
-	movl	$XCR0, %ecx
-/*	xsetbv	*/
-	.byte	0x0f, 0x01, 0xd1
-/*	xrstor	(%rbx) */
-	.byte	0x0f, 0xae, 0x2b
-	jmp	2f
-1:
-	fxrstor	(%rbx)
-2:
-
-	/* Reload CR0. */
-	movq	PCB_CR0(%rdi), %rax
-	movq	%rax, %cr0
-
-	/* Restore other callee saved registers. */
-	movq	PCB_R15(%rdi), %r15
-	movq	PCB_R14(%rdi), %r14
-	movq	PCB_R13(%rdi), %r13
-	movq	PCB_R12(%rdi), %r12
-	movq	PCB_RBP(%rdi), %rbp
-	movq	PCB_RSP(%rdi), %rsp
-	movq	PCB_RBX(%rdi), %rbx
-
-	/* Restore return address. */
-	movq	PCB_RIP(%rdi), %rax
-	movq	%rax, (%rsp)
-
-	/* Indicate the CPU is resumed. */
-	xorl	%eax, %eax
-	movl	%eax, WAKEUP_CTX(cpu)
-
-	ret
-END(acpi_restorecpu)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_wakecode.S
--- a/head/sys/amd64/acpica/acpi_wakecode.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/acpica/acpi_wakecode.S	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/acpica/acpi_wakecode.S 231787 2012-02-15 22:10:33Z jkim $
+ * $FreeBSD: head/sys/amd64/acpica/acpi_wakecode.S 237037 2012-06-13 22:53:56Z jkim $
  */
 
 #include <machine/asmacros.h>
@@ -219,10 +219,14 @@
 	mov	$bootdata64 - bootgdt, %eax
 	mov	%ax, %ds
 
-	/* Restore arguments and return. */
-	movq	wakeup_kpml4 - wakeup_start(%rbx), %rdi
-	movq	wakeup_ctx - wakeup_start(%rbx), %rsi
-	movq	wakeup_retaddr - wakeup_start(%rbx), %rax
+	/* Restore arguments. */
+	movq	wakeup_pcb - wakeup_start(%rbx), %rdi
+	movq	wakeup_ret - wakeup_start(%rbx), %rax
+
+	/* Restore GDT. */
+	lgdt	wakeup_gdt - wakeup_start(%rbx)
+
+	/* Jump to return address. */
 	jmp	*%rax
 
 	.data
@@ -268,34 +272,11 @@
 	.long	bootgdt - wakeup_start	/* Offset plus %ds << 4 */
 
 	ALIGN_DATA
-wakeup_retaddr:
-	.quad	0
-wakeup_kpml4:
-	.quad	0
-
-wakeup_ctx:
-	.quad	0
 wakeup_pcb:
 	.quad	0
-wakeup_fpusave:
+wakeup_ret:
 	.quad	0
 wakeup_gdt:
 	.word	0
 	.quad	0
-
-	ALIGN_DATA
-wakeup_efer:
-	.quad	0
-wakeup_star:
-	.quad	0
-wakeup_lstar:
-	.quad	0
-wakeup_cstar:
-	.quad	0
-wakeup_sfmask:
-	.quad	0
-wakeup_xsmask:
-	.quad	0
-wakeup_cpu:
-	.long	0
 dummy:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/acpica/acpi_wakeup.c
--- a/head/sys/amd64/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,420 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * Copyright (c) 2003 Peter Wemm
- * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/acpica/acpi_wakeup.c 233704 2012-03-30 17:03:06Z jkim $");
-
-#include <sys/param.h>
-#include <sys/bus.h>
-#include <sys/eventhandler.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/memrange.h>
-#include <sys/smp.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-
-#include <machine/clock.h>
-#include <machine/intr_machdep.h>
-#include <x86/mca.h>
-#include <machine/pcb.h>
-#include <machine/pmap.h>
-#include <machine/specialreg.h>
-#include <machine/md_var.h>
-
-#ifdef SMP
-#include <x86/apicreg.h>
-#include <machine/smp.h>
-#include <machine/vmparam.h>
-#endif
-
-#include <contrib/dev/acpica/include/acpi.h>
-
-#include <dev/acpica/acpivar.h>
-
-#include "acpi_wakecode.h"
-#include "acpi_wakedata.h"
-
-/* Make sure the code is less than a page and leave room for the stack. */
-CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
-
-extern int		acpi_resume_beep;
-extern int		acpi_reset_video;
-
-#ifdef SMP
-extern struct pcb	**susppcbs;
-extern void		**suspfpusave;
-#else
-static struct pcb	**susppcbs;
-static void		**suspfpusave;
-#endif
-
-int			acpi_restorecpu(uint64_t, vm_offset_t);
-
-static void		*acpi_alloc_wakeup_handler(void);
-static void		acpi_stop_beep(void *);
-
-#ifdef SMP
-static int		acpi_wakeup_ap(struct acpi_softc *, int);
-static void		acpi_wakeup_cpus(struct acpi_softc *, const cpuset_t *);
-#endif
-
-#define	WAKECODE_VADDR(sc)	((sc)->acpi_wakeaddr + (3 * PAGE_SIZE))
-#define	WAKECODE_PADDR(sc)	((sc)->acpi_wakephys + (3 * PAGE_SIZE))
-#define	WAKECODE_FIXUP(offset, type, val) do	{	\
-	type	*addr;					\
-	addr = (type *)(WAKECODE_VADDR(sc) + offset);	\
-	*addr = val;					\
-} while (0)
-
-static void
-acpi_stop_beep(void *arg)
-{
-
-	if (acpi_resume_beep != 0)
-		timer_spkr_release();
-}
-
-#ifdef SMP
-static int
-acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
-{
-	int		vector = (WAKECODE_PADDR(sc) >> 12) & 0xff;
-	int		apic_id = cpu_apic_ids[cpu];
-	int		ms;
-
-	WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[cpu]);
-	WAKECODE_FIXUP(wakeup_fpusave, void *, suspfpusave[cpu]);
-	WAKECODE_FIXUP(wakeup_gdt, uint16_t, susppcbs[cpu]->pcb_gdt.rd_limit);
-	WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t,
-	    susppcbs[cpu]->pcb_gdt.rd_base);
-	WAKECODE_FIXUP(wakeup_cpu, int, cpu);
-
-	/* do an INIT IPI: assert RESET */
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
-
-	/* wait for pending status end */
-	lapic_ipi_wait(-1);
-
-	/* do an INIT IPI: deassert RESET */
-	lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0);
-
-	/* wait for pending status end */
-	DELAY(10000);		/* wait ~10mS */
-	lapic_ipi_wait(-1);
-
-	/*
-	 * next we do a STARTUP IPI: the previous INIT IPI might still be
-	 * latched, (P5 bug) this 1st STARTUP would then terminate
-	 * immediately, and the previously started INIT IPI would continue. OR
-	 * the previous INIT IPI has already run. and this STARTUP IPI will
-	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
-	 * will run.
-	 */
-
-	/* do a STARTUP IPI */
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
-	    vector, apic_id);
-	lapic_ipi_wait(-1);
-	DELAY(200);		/* wait ~200uS */
-
-	/*
-	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
-	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
-	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
-	 * recognized after hardware RESET or INIT IPI.
-	 */
-
-	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
-	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
-	    vector, apic_id);
-	lapic_ipi_wait(-1);
-	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (*(int *)(WAKECODE_VADDR(sc) + wakeup_cpu) == 0)
-			return (1);	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return (0);		/* return FAILURE */
-}
-
-#define	WARMBOOT_TARGET		0
-#define	WARMBOOT_OFF		(KERNBASE + 0x0467)
-#define	WARMBOOT_SEG		(KERNBASE + 0x0469)
-
-#define	CMOS_REG		(0x70)
-#define	CMOS_DATA		(0x71)
-#define	BIOS_RESET		(0x0f)
-#define	BIOS_WARM		(0x0a)
-
-static void
-acpi_wakeup_cpus(struct acpi_softc *sc, const cpuset_t *wakeup_cpus)
-{
-	uint32_t	mpbioswarmvec;
-	int		cpu;
-	u_char		mpbiosreason;
-
-	/* save the current value of the warm-start vector */
-	mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF);
-	outb(CMOS_REG, BIOS_RESET);
-	mpbiosreason = inb(CMOS_DATA);
-
-	/* setup a vector to our boot code */
-	*((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
-	*((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4;
-	outb(CMOS_REG, BIOS_RESET);
-	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
-
-	/* Wake up each AP. */
-	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		if (!CPU_ISSET(cpu, wakeup_cpus))
-			continue;
-		if (acpi_wakeup_ap(sc, cpu) == 0) {
-			/* restore the warmstart vector */
-			*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
-			panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)",
-			    cpu, cpu_apic_ids[cpu]);
-		}
-	}
-
-	/* restore the warmstart vector */
-	*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
-
-	outb(CMOS_REG, BIOS_RESET);
-	outb(CMOS_DATA, mpbiosreason);
-}
-#endif
-
-int
-acpi_sleep_machdep(struct acpi_softc *sc, int state)
-{
-#ifdef SMP
-	cpuset_t	wakeup_cpus;
-#endif
-	register_t	rf;
-	ACPI_STATUS	status;
-	int		ret;
-
-	ret = -1;
-
-	if (sc->acpi_wakeaddr == 0ul)
-		return (ret);
-
-#ifdef SMP
-	wakeup_cpus = all_cpus;
-	CPU_CLR(PCPU_GET(cpuid), &wakeup_cpus);
-#endif
-
-	if (acpi_resume_beep != 0)
-		timer_spkr_acquire();
-
-	AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc));
-
-	rf = intr_disable();
-	intr_suspend();
-
-	if (savectx(susppcbs[0])) {
-		ctx_fpusave(suspfpusave[0]);
-#ifdef SMP
-		if (!CPU_EMPTY(&wakeup_cpus) &&
-		    suspend_cpus(wakeup_cpus) == 0) {
-			device_printf(sc->acpi_dev, "Failed to suspend APs\n");
-			goto out;
-		}
-#endif
-
-		WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0));
-		WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0));
-
-		WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[0]);
-		WAKECODE_FIXUP(wakeup_fpusave, void *, suspfpusave[0]);
-		WAKECODE_FIXUP(wakeup_gdt, uint16_t,
-		    susppcbs[0]->pcb_gdt.rd_limit);
-		WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t,
-		    susppcbs[0]->pcb_gdt.rd_base);
-		WAKECODE_FIXUP(wakeup_cpu, int, 0);
-
-		/* Call ACPICA to enter the desired sleep state */
-		if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
-			status = AcpiEnterSleepStateS4bios();
-		else
-			status = AcpiEnterSleepState(state, acpi_sleep_flags);
-
-		if (status != AE_OK) {
-			device_printf(sc->acpi_dev,
-			    "AcpiEnterSleepState failed - %s\n",
-			    AcpiFormatException(status));
-			goto out;
-		}
-
-		for (;;)
-			ia32_pause();
-	} else {
-		pmap_init_pat();
-		load_cr3(susppcbs[0]->pcb_cr3);
-		initializecpu();
-		PCPU_SET(switchtime, 0);
-		PCPU_SET(switchticks, ticks);
-#ifdef SMP
-		if (!CPU_EMPTY(&wakeup_cpus))
-			acpi_wakeup_cpus(sc, &wakeup_cpus);
-#endif
-		ret = 0;
-	}
-
-out:
-#ifdef SMP
-	if (!CPU_EMPTY(&wakeup_cpus))
-		restart_cpus(wakeup_cpus);
-#endif
-
-	mca_resume();
-	intr_resume();
-	intr_restore(rf);
-
-	AcpiSetFirmwareWakingVector(0);
-
-	if (ret == 0 && mem_range_softc.mr_op != NULL &&
-	    mem_range_softc.mr_op->reinit != NULL)
-		mem_range_softc.mr_op->reinit(&mem_range_softc);
-
-	return (ret);
-}
-
-static void *
-acpi_alloc_wakeup_handler(void)
-{
-	void		*wakeaddr;
-	int		i;
-
-	/*
-	 * Specify the region for our wakeup code.  We want it in the low 1 MB
-	 * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA
-	 * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT),
-	 * and ROM area (0xa0000 and above).  The temporary page tables must be
-	 * page-aligned.
-	 */
-	wakeaddr = contigmalloc(4 * PAGE_SIZE, M_DEVBUF, M_WAITOK, 0x500,
-	    0xa0000, PAGE_SIZE, 0ul);
-	if (wakeaddr == NULL) {
-		printf("%s: can't alloc wake memory\n", __func__);
-		return (NULL);
-	}
-	if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL,
-	    EVENTHANDLER_PRI_LAST) == NULL) {
-		printf("%s: can't register event handler\n", __func__);
-		contigfree(wakeaddr, 4 * PAGE_SIZE, M_DEVBUF);
-		return (NULL);
-	}
-	susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK);
-	suspfpusave = malloc(mp_ncpus * sizeof(void *), M_DEVBUF, M_WAITOK);
-	for (i = 0; i < mp_ncpus; i++) {
-		susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK);
-		suspfpusave[i] = alloc_fpusave(M_WAITOK);
-	}
-
-	return (wakeaddr);
-}
-
-void
-acpi_install_wakeup_handler(struct acpi_softc *sc)
-{
-	static void	*wakeaddr = NULL;
-	uint64_t	*pt4, *pt3, *pt2;
-	int		i;
-
-	if (wakeaddr != NULL)
-		return;
-
-	wakeaddr = acpi_alloc_wakeup_handler();
-	if (wakeaddr == NULL)
-		return;
-
-	sc->acpi_wakeaddr = (vm_offset_t)wakeaddr;
-	sc->acpi_wakephys = vtophys(wakeaddr);
-
-	bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode));
-
-	/* Patch GDT base address, ljmp targets and page table base address. */
-	WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + bootgdt);
-	WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_32);
-	WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
-	    WAKECODE_PADDR(sc) + wakeup_64);
-	WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys);
-
-	/* Save pointers to some global data. */
-	WAKECODE_FIXUP(wakeup_retaddr, void *, acpi_restorecpu);
-	WAKECODE_FIXUP(wakeup_kpml4, uint64_t, KPML4phys);
-	WAKECODE_FIXUP(wakeup_ctx, vm_offset_t,
-	    WAKECODE_VADDR(sc) + wakeup_ctx);
-	WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER));
-	WAKECODE_FIXUP(wakeup_star, uint64_t, rdmsr(MSR_STAR));
-	WAKECODE_FIXUP(wakeup_lstar, uint64_t, rdmsr(MSR_LSTAR));
-	WAKECODE_FIXUP(wakeup_cstar, uint64_t, rdmsr(MSR_CSTAR));
-	WAKECODE_FIXUP(wakeup_sfmask, uint64_t, rdmsr(MSR_SF_MASK));
-	WAKECODE_FIXUP(wakeup_xsmask, uint64_t, xsave_mask);
-
-	/* Build temporary page tables below realmode code. */
-	pt4 = wakeaddr;
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t);
-
-	/* Create the initial 1GB replicated page tables */
-	for (i = 0; i < 512; i++) {
-		/*
-		 * Each slot of the level 4 pages points
-		 * to the same level 3 page
-		 */
-		pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE);
-		pt4[i] |= PG_V | PG_RW | PG_U;
-
-		/*
-		 * Each slot of the level 3 pages points
-		 * to the same level 2 page
-		 */
-		pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE));
-		pt3[i] |= PG_V | PG_RW | PG_U;
-
-		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
-		pt2[i] = i * (2 * 1024 * 1024);
-		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
-	}
-
-	if (bootverbose)
-		device_printf(sc->acpi_dev, "wakeup code va %p pa %p\n",
-		    (void *)sc->acpi_wakeaddr, (void *)sc->acpi_wakephys);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/cpu_switch.S
--- a/head/sys/amd64/amd64/cpu_switch.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/cpu_switch.S	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/amd64/cpu_switch.S 232226 2012-02-27 17:28:22Z jhb $
+ * $FreeBSD: head/sys/amd64/amd64/cpu_switch.S 238450 2012-07-14 15:48:30Z kib $
  */
 
 #include <machine/asmacros.h>
@@ -122,8 +122,10 @@
 1:	movq	%rdx,%rcx
 	movl	xsave_mask,%eax
 	movl	xsave_mask+4,%edx
-/*	xsave	(%r8) */
-	.byte	0x41,0x0f,0xae,0x20
+	.globl	ctx_switch_xsave
+ctx_switch_xsave:
+	/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
+	xsave	(%r8)
 	movq	%rcx,%rdx
 2:	smsw	%ax
 	orb	$CR0_TS,%al
@@ -357,6 +359,30 @@
 	rdmsr
 	movl	%eax,PCB_KGSBASE(%rdi)
 	movl	%edx,PCB_KGSBASE+4(%rdi)
+	movl	$MSR_EFER,%ecx
+	rdmsr
+	movl	%eax,PCB_EFER(%rdi)
+	movl	%edx,PCB_EFER+4(%rdi)
+	movl	$MSR_STAR,%ecx
+	rdmsr
+	movl	%eax,PCB_STAR(%rdi)
+	movl	%edx,PCB_STAR+4(%rdi)
+	movl	$MSR_LSTAR,%ecx
+	rdmsr
+	movl	%eax,PCB_LSTAR(%rdi)
+	movl	%edx,PCB_LSTAR+4(%rdi)
+	movl	$MSR_CSTAR,%ecx
+	rdmsr
+	movl	%eax,PCB_CSTAR(%rdi)
+	movl	%edx,PCB_CSTAR+4(%rdi)
+	movl	$MSR_SF_MASK,%ecx
+	rdmsr
+	movl	%eax,PCB_SFMASK(%rdi)
+	movl	%edx,PCB_SFMASK+4(%rdi)
+	movl	xsave_mask,%eax
+	movl	%eax,PCB_XSMASK(%rdi)
+	movl	xsave_mask+4,%eax
+	movl	%eax,PCB_XSMASK+4(%rdi)
 
 	sgdt	PCB_GDT(%rdi)
 	sidt	PCB_IDT(%rdi)
@@ -370,6 +396,140 @@
 END(savectx)
 
 /*
+ * resumectx(pcb)
+ * Resuming processor state from pcb.
+ */     
+ENTRY(resumectx)
+	/* Switch to KPML4phys. */
+	movq	KPML4phys,%rax
+	movq	%rax,%cr3
+
+	/* Force kernel segment registers. */
+	movl	$KDSEL,%eax
+	movw	%ax,%ds
+	movw	%ax,%es
+	movw	%ax,%ss
+	movl	$KUF32SEL,%eax
+	movw	%ax,%fs
+	movl	$KUG32SEL,%eax
+	movw	%ax,%gs
+
+	movl	$MSR_FSBASE,%ecx
+	movl	PCB_FSBASE(%rdi),%eax
+	movl	4 + PCB_FSBASE(%rdi),%edx
+	wrmsr
+	movl	$MSR_GSBASE,%ecx
+	movl	PCB_GSBASE(%rdi),%eax
+	movl	4 + PCB_GSBASE(%rdi),%edx
+	wrmsr
+	movl	$MSR_KGSBASE,%ecx
+	movl	PCB_KGSBASE(%rdi),%eax
+	movl	4 + PCB_KGSBASE(%rdi),%edx
+	wrmsr
+
+	/* Restore EFER. */
+	movl	$MSR_EFER,%ecx
+	movl	PCB_EFER(%rdi),%eax
+	wrmsr
+
+	/* Restore fast syscall stuff. */
+	movl	$MSR_STAR,%ecx
+	movl	PCB_STAR(%rdi),%eax
+	movl	4 + PCB_STAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_LSTAR,%ecx
+	movl	PCB_LSTAR(%rdi),%eax
+	movl	4 + PCB_LSTAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_CSTAR,%ecx
+	movl	PCB_CSTAR(%rdi),%eax
+	movl	4 + PCB_CSTAR(%rdi),%edx
+	wrmsr
+	movl	$MSR_SF_MASK,%ecx
+	movl	PCB_SFMASK(%rdi),%eax
+	wrmsr
+
+	/* Restore CR0 except for FPU mode. */
+	movq	PCB_CR0(%rdi),%rax
+	andq	$~(CR0_EM | CR0_TS),%rax
+	movq	%rax,%cr0
+
+	/* Restore CR2, CR4 and CR3. */
+	movq	PCB_CR2(%rdi),%rax
+	movq	%rax,%cr2
+	movq	PCB_CR4(%rdi),%rax
+	movq	%rax,%cr4
+	movq	PCB_CR3(%rdi),%rax
+	movq	%rax,%cr3
+
+	/* Restore descriptor tables. */
+	lidt	PCB_IDT(%rdi)
+	lldt	PCB_LDT(%rdi)
+
+#define	SDT_SYSTSS	9
+#define	SDT_SYSBSY	11
+
+	/* Clear "task busy" bit and reload TR. */
+	movq	PCPU(TSS),%rax
+	andb	$(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
+	movw	PCB_TR(%rdi),%ax
+	ltr	%ax
+
+#undef	SDT_SYSTSS
+#undef	SDT_SYSBSY
+
+	/* Restore debug registers. */
+	movq	PCB_DR0(%rdi),%rax
+	movq	%rax,%dr0
+	movq	PCB_DR1(%rdi),%rax
+	movq	%rax,%dr1
+	movq	PCB_DR2(%rdi),%rax
+	movq	%rax,%dr2
+	movq	PCB_DR3(%rdi),%rax
+	movq	%rax,%dr3
+	movq	PCB_DR6(%rdi),%rax
+	movq	%rax,%dr6
+	movq	PCB_DR7(%rdi),%rax
+	movq	%rax,%dr7
+
+	/* Restore FPU state. */
+	fninit
+	movq	PCB_FPUSUSPEND(%rdi),%rbx
+	movq	PCB_XSMASK(%rdi),%rax
+	testq	%rax,%rax
+	jz	1f
+	movq	%rax,%rdx
+	shrq	$32,%rdx
+	movl	$XCR0,%ecx
+	xsetbv
+	xrstor	(%rbx)
+	jmp	2f
+1:
+	fxrstor	(%rbx)
+2:
+
+	/* Reload CR0. */
+	movq	PCB_CR0(%rdi),%rax
+	movq	%rax,%cr0
+
+	/* Restore other callee saved registers. */
+	movq	PCB_R15(%rdi),%r15
+	movq	PCB_R14(%rdi),%r14
+	movq	PCB_R13(%rdi),%r13
+	movq	PCB_R12(%rdi),%r12
+	movq	PCB_RBP(%rdi),%rbp
+	movq	PCB_RSP(%rdi),%rsp
+	movq	PCB_RBX(%rdi),%rbx
+
+	/* Restore return address. */
+	movq	PCB_RIP(%rdi),%rax
+	movq	%rax,(%rsp)
+
+	xorl	%eax,%eax
+	ret
+END(resumectx)
+
+/*
  * Wrapper around fpusave to care about TS0_CR.
  */
 ENTRY(ctx_fpusave)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/db_disasm.c
--- a/head/sys/amd64/amd64/db_disasm.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/db_disasm.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,12 +25,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/db_disasm.c 238166 2012-07-06 14:25:59Z jhb $");
 
 /*
  * Instruction disassembler.
  */
 #include <sys/param.h>
+#include <sys/libkern.h>
 
 #include <ddb/ddb.h>
 #include <ddb/db_access.h>
@@ -47,7 +48,9 @@
 #define	DBLR	5
 #define	EXTR	6
 #define	SDEP	7
-#define	NONE	8
+#define	ADEP	8
+#define	ESC	9
+#define	NONE	10
 
 /*
  * REX prefix and bits
@@ -67,6 +70,7 @@
 #define	Eb	4			/* address, byte size */
 #define	R	5			/* register, in 'reg' field */
 #define	Rw	6			/* word register, in 'reg' field */
+#define	Rq	39			/* quad register, in 'reg' field */
 #define	Ri	7			/* register in instruction */
 #define	S	8			/* segment reg, in 'reg' field */
 #define	Si	9			/* segment reg, in instruction */
@@ -120,6 +124,45 @@
 					   (or pointer to table) */
 };
 
+static const struct inst db_inst_0f388x[] = {
+/*80*/	{ "",	   TRUE,  SDEP,  op2(E, Rq),  "invept" },
+/*81*/	{ "",	   TRUE,  SDEP,  op2(E, Rq),  "invvpid" },
+/*82*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*83*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*84*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*85*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*86*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*87*/	{ "",	   FALSE, NONE,  0,	      0 },
+
+/*88*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*89*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8a*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8b*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8c*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8d*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8e*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*8f*/	{ "",	   FALSE, NONE,  0,	      0 },
+};
+
+static const struct inst * const db_inst_0f38[] = {
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	db_inst_0f388x,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0,
+	0
+};
+
 static const char * const db_Grp6[] = {
 	"sldt",
 	"str",
@@ -160,8 +203,8 @@
 	"",
 	"",
 	"",
-	"",
-	""
+	"vmptrld",
+	"vmptrst"
 };
 
 static const char * const db_Grp15[] = {
@@ -169,9 +212,9 @@
 	"fxrstor",
 	"ldmxcsr",
 	"stmxcsr",
-	"",
-	"",
-	"",
+	"xsave",
+	"xrstor",
+	"xsaveopt",
 	"clflush"
 };
 
@@ -236,7 +279,7 @@
 /*36*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*37*/	{ "getsec",FALSE, NONE,  0,	      0 },
 
-/*38*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*38*/	{ "",	   FALSE, ESC,  0,	      db_inst_0f38 },
 /*39*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*3a*/	{ "",	   FALSE, NONE,  0,	      0 },
 /*3b*/	{ "",	   FALSE, NONE,  0,	      0 },
@@ -266,6 +309,26 @@
 /*4f*/	{ "cmovnle",TRUE, NONE,  op2(E, R),   0 },
 };
 
+static const struct inst db_inst_0f7x[] = {
+/*70*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*71*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*72*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*73*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*74*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*75*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*76*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*77*/	{ "",	   FALSE, NONE,  0,	      0 },
+
+/*78*/	{ "vmread", TRUE, NONE,  op2(Rq, E),  0 },
+/*79*/	{ "vmwrite",TRUE, NONE,  op2(E, Rq),  0 },
+/*7a*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7b*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7c*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7d*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7e*/	{ "",	   FALSE, NONE,  0,	      0 },
+/*7f*/	{ "",	   FALSE, NONE,  0,	      0 },
+};
+
 static const struct inst db_inst_0f8x[] = {
 /*80*/	{ "jo",    FALSE, NONE,  op1(Dl),     0 },
 /*81*/	{ "jno",   FALSE, NONE,  op1(Dl),     0 },
@@ -373,7 +436,7 @@
 	db_inst_0f4x,
 	0,
 	0,
-	0,
+	db_inst_0f7x,
 	db_inst_0f8x,
 	db_inst_0f9x,
 	db_inst_0fax,
@@ -582,7 +645,7 @@
 /*0c*/	{ "or",    FALSE, BYTE,  op2(I, A),  0 },
 /*0d*/	{ "or",    FALSE, LONG,  op2(I, A),  0 },
 /*0e*/	{ "push",  FALSE, NONE,  op1(Si),    0 },
-/*0f*/	{ "",      FALSE, NONE,  0,	     0 },
+/*0f*/	{ "",      FALSE, ESC,   0,	     db_inst_0f },
 
 /*10*/	{ "adc",   TRUE,  BYTE,  op2(R, E),  0 },
 /*11*/	{ "adc",   TRUE,  LONG,  op2(R, E),  0 },
@@ -738,8 +801,8 @@
 /*96*/	{ "xchg",  FALSE, LONG,  op2(A, Ri),  0 },
 /*97*/	{ "xchg",  FALSE, LONG,  op2(A, Ri),  0 },
 
-/*98*/	{ "cbw",   FALSE, SDEP,  0,	      "cwde" },	/* cbw/cwde */
-/*99*/	{ "cwd",   FALSE, SDEP,  0,	      "cdq"  },	/* cwd/cdq */
+/*98*/	{ "cwde",  FALSE, SDEP,  0,	      "cbw" },
+/*99*/	{ "cdq",   FALSE, SDEP,  0,	      "cwd" },
 /*9a*/	{ "lcall", FALSE, NONE,  op1(OS),     0 },
 /*9b*/	{ "wait",  FALSE, NONE,  0,	      0 },
 /*9c*/	{ "pushf", FALSE, LONG,  0,	      0 },
@@ -822,7 +885,7 @@
 /*e0*/	{ "loopne",FALSE, NONE,  op1(Db),     0 },
 /*e1*/	{ "loope", FALSE, NONE,  op1(Db),     0 },
 /*e2*/	{ "loop",  FALSE, NONE,  op1(Db),     0 },
-/*e3*/	{ "jcxz",  FALSE, SDEP,  op1(Db),     "jecxz" },
+/*e3*/	{ "jrcxz", FALSE, ADEP,  op1(Db),     "jecxz" },
 /*e4*/	{ "in",    FALSE, BYTE,  op2(Ib, A),  0 },
 /*e5*/	{ "in",    FALSE, LONG,  op2(Ib, A) , 0 },
 /*e6*/	{ "out",   FALSE, BYTE,  op2(A, Ib),  0 },
@@ -1208,14 +1271,6 @@
 	    if (prefix) {
 		get_value_inc(inst, loc, 1, FALSE);
 	    }
-	    if (rep == TRUE) {
-		if (inst == 0x90) {
-		    db_printf("pause\n");
-		    return (loc);
-		}
-		db_printf("repe ");	/* XXX repe VS rep */
-		rep = FALSE;
-	    }
 	} while (prefix);
 
 	if (inst >= 0xd8 && inst <= 0xdf) {
@@ -1224,9 +1279,10 @@
 	    return (loc);
 	}
 
-	if (inst == 0x0f) {
+	ip = &db_inst_table[inst];
+	while (ip->i_size == ESC) {
 	    get_value_inc(inst, loc, 1, FALSE);
-	    ip = db_inst_0f[inst>>4];
+	    ip = ((const struct inst * const *)ip->i_extra)[inst>>4];
 	    if (ip == 0) {
 		ip = &db_bad_inst;
 	    }
@@ -1234,8 +1290,6 @@
 		ip = &ip[inst&0xf];
 	    }
 	}
-	else
-	    ip = &db_inst_table[inst];
 
 	if (ip->i_has_modrm) {
 	    get_value_inc(regmodrm, loc, 1, FALSE);
@@ -1269,6 +1323,26 @@
 	/* Special cases that don't fit well in the tables. */
 	if (ip->i_extra == db_Grp7 && f_mod(rex, regmodrm) == 3) {
 		switch (regmodrm) {
+		case 0xc1:
+			i_name = "vmcall";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc2:
+			i_name = "vmlaunch";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc3:
+			i_name = "vmresume";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xc4:
+			i_name = "vmxoff";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		case 0xc8:
 			i_name = "monitor";
 			i_size = NONE;
@@ -1279,11 +1353,26 @@
 			i_size = NONE;
 			i_mode = 0;
 			break;
+		case 0xd0:
+			i_name = "xgetbv";
+			i_size = NONE;
+			i_mode = 0;
+			break;
+		case 0xd1:
+			i_name = "xsetbv";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		case 0xf8:
 			i_name = "swapgs";
 			i_size = NONE;
 			i_mode = 0;
 			break;
+		case 0xf9:
+			i_name = "rdtscp";
+			i_size = NONE;
+			i_mode = 0;
+			break;
 		}
 	}
 	if (ip->i_extra == db_Grp15 && f_mod(rex, regmodrm) == 3) {
@@ -1292,8 +1381,42 @@
 		i_mode = 0;
 	}
 
+	/* Handle instructions identified by mandatory prefixes. */
+	if (rep == TRUE) {
+	    if (inst == 0x90) {
+		i_name = "pause";
+		i_size = NONE;
+		i_mode = 0;
+		rep = FALSE;
+	    } else if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+		f_reg(rex, regmodrm) == 0x6) {
+		i_name = "vmxon";
+		rep = FALSE;
+	    }
+	}
+	if (size == WORD) {
+	    if (ip->i_extra == db_Grp9 && f_mod(rex, regmodrm) != 3 &&
+		f_reg(rex, regmodrm) == 0x6) {
+		i_name = "vmclear";
+	    }
+	}
+	if (rex & REX_W) {
+	    if (strcmp(i_name, "cwde") == 0)
+		i_name = "cdqe";
+	    else if (strcmp(i_name, "cmpxchg8b") == 0)
+		i_name = "cmpxchg16b";
+	}
+
+	if (rep == TRUE)
+	    db_printf("repe ");	/* XXX repe VS rep */
+
 	if (i_size == SDEP) {
-	    if (size == WORD)
+	    if (size == LONG)
+		db_printf("%s", i_name);
+	    else
+		db_printf("%s", (const char *)ip->i_extra);
+	} else if (i_size == ADEP) {
+	    if (short_addr == FALSE)
 		db_printf("%s", i_name);
 	    else
 		db_printf("%s", (const char *)ip->i_extra);
@@ -1366,6 +1489,10 @@
 		    db_printf("%s", db_reg[rex != 0 ? 1 : 0][WORD][f_reg(rex, regmodrm)]);
 		    break;
 
+		case Rq:
+		    db_printf("%s", db_reg[rex != 0 ? 1 : 0][QUAD][f_reg(rex, regmodrm)]);
+		    break;
+
 		case Ri:
 		    db_printf("%s", db_reg[0][QUAD][f_rm(rex, inst)]);
 		    break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/fpu.c
--- a/head/sys/amd64/amd64/fpu.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/fpu.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/fpu.c 230766 2012-01-30 07:53:33Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/fpu.c 238671 2012-07-21 13:53:00Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -73,10 +73,7 @@
 #define	fxrstor(addr)		__asm __volatile("fxrstor %0" : : "m" (*(addr)))
 #define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
 #define	ldmxcsr(csr)		__asm __volatile("ldmxcsr %0" : : "m" (csr))
-#define	start_emulating()	__asm __volatile( \
-				    "smsw %%ax; orb %0,%%al; lmsw %%ax" \
-				    : : "n" (CR0_TS) : "ax")
-#define	stop_emulating()	__asm __volatile("clts")
+#define	stmxcsr(addr)		__asm __volatile("stmxcsr %0" : : "m" (*(addr)))
 
 static __inline void
 xrstor(char *addr, uint64_t mask)
@@ -85,9 +82,7 @@
 
 	low = mask;
 	hi = mask >> 32;
-	/* xrstor (%rdi) */
-	__asm __volatile(".byte	0x0f,0xae,0x2f" : :
-	    "a" (low), "d" (hi), "D" (addr));
+	__asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi));
 }
 
 static __inline void
@@ -97,20 +92,8 @@
 
 	low = mask;
 	hi = mask >> 32;
-	/* xsave (%rdi) */
-	__asm __volatile(".byte	0x0f,0xae,0x27" : :
-	    "a" (low), "d" (hi), "D" (addr) : "memory");
-}
-
-static __inline void
-xsetbv(uint32_t reg, uint64_t val)
-{
-	uint32_t low, hi;
-
-	low = val;
-	hi = val >> 32;
-	__asm __volatile(".byte 0x0f,0x01,0xd1" : :
-	    "c" (reg), "a" (low), "d" (hi));
+	__asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) :
+	    "memory");
 }
 
 #else	/* !(__GNUCLIKE_ASM && !lint) */
@@ -123,16 +106,14 @@
 void	fxsave(caddr_t addr);
 void	fxrstor(caddr_t addr);
 void	ldmxcsr(u_int csr);
-void	start_emulating(void);
-void	stop_emulating(void);
+void	stmxcsr(u_int csr);
 void	xrstor(char *addr, uint64_t mask);
 void	xsave(char *addr, uint64_t mask);
-void	xsetbv(uint32_t reg, uint64_t val);
 
 #endif	/* __GNUCLIKE_ASM && !lint */
 
-#define GET_FPU_CW(thread) ((thread)->td_pcb->pcb_save->sv_env.en_cw)
-#define GET_FPU_SW(thread) ((thread)->td_pcb->pcb_save->sv_env.en_sw)
+#define	start_emulating()	load_cr0(rcr0() | CR0_TS)
+#define	stop_emulating()	clts()
 
 CTASSERT(sizeof(struct savefpu) == 512);
 CTASSERT(sizeof(struct xstate_hdr) == 64);
@@ -141,7 +122,7 @@
 /*
  * This requirement is to make it easier for asm code to calculate
  * offset of the fpu save area from the pcb address. FPU save area
- * must by 64-bytes aligned.
+ * must be 64-byte aligned.
  */
 CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
 
@@ -150,10 +131,16 @@
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     NULL, 1, "Floating point instructions executed in hardware");
 
+static int use_xsaveopt;
 int use_xsave;			/* non-static for cpu_switch.S */
 uint64_t xsave_mask;		/* the same */
 static	struct savefpu *fpu_initialstate;
 
+struct xsave_area_elm_descr {
+	u_int	offset;
+	u_int	size;
+} *xsave_area_desc;
+
 void
 fpusave(void *addr)
 {
@@ -200,6 +187,17 @@
 	TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
 	xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 	xsave_mask &= xsave_mask_user;
+
+	cpuid_count(0xd, 0x1, cp);
+	if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
+		/*
+		 * Patch the XSAVE instruction in the cpu_switch code
+		 * to XSAVEOPT.  We assume that XSAVE encoding used
+		 * REX byte, and set the bit 4 of the r/m byte.
+		 */
+		ctx_switch_xsave[3] |= 0x10;
+		use_xsaveopt = 1;
+	}
 }
 
 /*
@@ -238,7 +236,7 @@
 
 	if (use_xsave) {
 		load_cr4(rcr4() | CR4_XSAVE);
-		xsetbv(XCR0, xsave_mask);
+		load_xcr(XCR0, xsave_mask);
 	}
 
 	/*
@@ -270,6 +268,7 @@
 fpuinitstate(void *arg __unused)
 {
 	register_t saveintr;
+	int cp[4], i, max_ext_n;
 
 	fpu_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF,
 	    M_WAITOK | M_ZERO);
@@ -291,6 +290,28 @@
 	 */
 	bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc));
 
+	/*
+	 * Create a table describing the layout of the CPU Extended
+	 * Save Area.
+	 */
+	if (use_xsaveopt) {
+		max_ext_n = flsl(xsave_mask);
+		xsave_area_desc = malloc(max_ext_n * sizeof(struct
+		    xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
+		/* x87 state */
+		xsave_area_desc[0].offset = 0;
+		xsave_area_desc[0].size = 160;
+		/* XMM */
+		xsave_area_desc[1].offset = 160;
+		xsave_area_desc[1].size = 288 - 160;
+
+		for (i = 2; i < max_ext_n; i++) {
+			cpuid_count(0xd, i, cp);
+			xsave_area_desc[i].offset = cp[1];
+			xsave_area_desc[i].size = cp[0];
+		}
+	}
+
 	start_emulating();
 	intr_restore(saveintr);
 }
@@ -306,7 +327,7 @@
 	critical_enter();
 	if (curthread == PCPU_GET(fpcurthread)) {
 		stop_emulating();
-		fpusave(PCPU_GET(curpcb)->pcb_save);
+		fpusave(curpcb->pcb_save);
 		start_emulating();
 		PCPU_SET(fpcurthread, 0);
 	}
@@ -492,25 +513,26 @@
 };
 
 /*
- * Preserve the FP status word, clear FP exceptions, then generate a SIGFPE.
+ * Read the FP status and control words, then generate si_code value
+ * for SIGFPE.  The error code chosen will be one of the
+ * FPE_... macros.  It will be sent as the second argument to old
+ * BSD-style signal handlers and as "siginfo_t->si_code" (second
+ * argument) to SA_SIGINFO signal handlers.
  *
- * Clearing exceptions is necessary mainly to avoid IRQ13 bugs.  We now
- * depend on longjmp() restoring a usable state.  Restoring the state
- * or examining it might fail if we didn't clear exceptions.
+ * Some time ago, we cleared the x87 exceptions with FNCLEX there.
+ * Clearing exceptions was necessary mainly to avoid IRQ13 bugs.  The
+ * usermode code which understands the FPU hardware enough to enable
+ * the exceptions, can also handle clearing the exception state in the
+ * handler.  The only consequence of not clearing the exception is the
+ * rethrow of the SIGFPE on return from the signal handler and
+ * reexecution of the corresponding instruction.
  *
- * The error code chosen will be one of the FPE_... macros. It will be
- * sent as the second argument to old BSD-style signal handlers and as
- * "siginfo_t->si_code" (second argument) to SA_SIGINFO signal handlers.
- *
- * XXX the FP state is not preserved across signal handlers.  So signal
- * handlers cannot afford to do FP unless they preserve the state or
- * longjmp() out.  Both preserving the state and longjmp()ing may be
- * destroyed by IRQ13 bugs.  Clearing FP exceptions is not an acceptable
- * solution for signals other than SIGFPE.
+ * For XMM traps, the exceptions were never cleared.
  */
 int
-fputrap()
+fputrap_x87(void)
 {
+	struct savefpu *pcb_save;
 	u_short control, status;
 
 	critical_enter();
@@ -521,19 +543,32 @@
 	 * wherever they are.
 	 */
 	if (PCPU_GET(fpcurthread) != curthread) {
-		control = GET_FPU_CW(curthread);
-		status = GET_FPU_SW(curthread);
+		pcb_save = curpcb->pcb_save;
+		control = pcb_save->sv_env.en_cw;
+		status = pcb_save->sv_env.en_sw;
 	} else {
 		fnstcw(&control);
 		fnstsw(&status);
 	}
 
-	if (PCPU_GET(fpcurthread) == curthread)
-		fnclex();
 	critical_exit();
 	return (fpetable[status & ((~control & 0x3f) | 0x40)]);
 }
 
+int
+fputrap_sse(void)
+{
+	u_int mxcsr;
+
+	critical_enter();
+	if (PCPU_GET(fpcurthread) != curthread)
+		mxcsr = curpcb->pcb_save->sv_env.en_mxcsr;
+	else
+		stmxcsr(&mxcsr);
+	critical_exit();
+	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
+}
+
 /*
  * Implement device not available (DNA) exception
  *
@@ -547,7 +582,6 @@
 void
 fpudna(void)
 {
-	struct pcb *pcb;
 
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == curthread) {
@@ -569,26 +603,31 @@
 	 * Record new context early in case frstor causes a trap.
 	 */
 	PCPU_SET(fpcurthread, curthread);
-	pcb = PCPU_GET(curpcb);
 
 	fpu_clean_state();
 
-	if ((pcb->pcb_flags & PCB_FPUINITDONE) == 0) {
+	if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) {
 		/*
 		 * This is the first time this thread has used the FPU or
 		 * the PCB doesn't contain a clean FPU state.  Explicitly
 		 * load an initial state.
+		 *
+		 * We prefer to restore the state from the actual save
+		 * area in PCB instead of directly loading from
+		 * fpu_initialstate, to ignite the XSAVEOPT
+		 * tracking engine.
 		 */
-		fpurestore(fpu_initialstate);
-		if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
-			fldcw(pcb->pcb_initial_fpucw);
-		if (PCB_USER_FPU(pcb))
-			set_pcb_flags(pcb,
+		bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size);
+		fpurestore(curpcb->pcb_save);
+		if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
+			fldcw(curpcb->pcb_initial_fpucw);
+		if (PCB_USER_FPU(curpcb))
+			set_pcb_flags(curpcb,
 			    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 		else
-			set_pcb_flags(pcb, PCB_FPUINITDONE);
+			set_pcb_flags(curpcb, PCB_FPUINITDONE);
 	} else
-		fpurestore(pcb->pcb_save);
+		fpurestore(curpcb->pcb_save);
 	critical_exit();
 }
 
@@ -614,6 +653,9 @@
 fpugetregs(struct thread *td)
 {
 	struct pcb *pcb;
+	uint64_t *xstate_bv, bit;
+	char *sa;
+	int max_ext_n, i;
 
 	pcb = td->td_pcb;
 	if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) {
@@ -631,6 +673,25 @@
 		return (_MC_FPOWNED_FPU);
 	} else {
 		critical_exit();
+		if (use_xsaveopt) {
+			/*
+			 * Handle partially saved state.
+			 */
+			sa = (char *)get_pcb_user_save_pcb(pcb);
+			xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) +
+			    offsetof(struct xstate_hdr, xstate_bv));
+			max_ext_n = flsl(xsave_mask);
+			for (i = 0; i < max_ext_n; i++) {
+				bit = 1 << i;
+				if ((*xstate_bv & bit) != 0)
+					continue;
+				bcopy((char *)fpu_initialstate +
+				    xsave_area_desc[i].offset,
+				    sa + xsave_area_desc[i].offset,
+				    xsave_area_desc[i].size);
+				*xstate_bv |= bit;
+			}
+		}
 		return (_MC_FPOWNED_PCB);
 	}
 }
@@ -900,16 +961,14 @@
 int
 fpu_kern_thread(u_int flags)
 {
-	struct pcb *pcb;
 
-	pcb = PCPU_GET(curpcb);
 	KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0,
 	    ("Only kthread may use fpu_kern_thread"));
-	KASSERT(pcb->pcb_save == get_pcb_user_save_pcb(pcb),
+	KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb),
 	    ("mangled pcb_save"));
-	KASSERT(PCB_USER_FPU(pcb), ("recursive call"));
+	KASSERT(PCB_USER_FPU(curpcb), ("recursive call"));
 
-	set_pcb_flags(pcb, PCB_KERNFPU);
+	set_pcb_flags(curpcb, PCB_KERNFPU);
 	return (0);
 }
 
@@ -919,5 +978,5 @@
 
 	if ((curthread->td_pflags & TDP_KTHREAD) == 0)
 		return (0);
-	return ((PCPU_GET(curpcb)->pcb_flags & PCB_KERNFPU) != 0);
+	return ((curpcb->pcb_flags & PCB_KERNFPU) != 0);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/genassym.c
--- a/head/sys/amd64/amd64/genassym.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/genassym.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/genassym.c 230426 2012-01-21 17:45:27Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/genassym.c 236772 2012-06-09 00:37:26Z iwasaki $");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
@@ -157,6 +157,13 @@
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
 ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu));
 ASSYM(PCB_USERFPU, sizeof(struct pcb));
+ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer));
+ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star));
+ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar));
+ASSYM(PCB_CSTAR, offsetof(struct pcb, pcb_cstar));
+ASSYM(PCB_SFMASK, offsetof(struct pcb, pcb_sfmask));
+ASSYM(PCB_XSMASK, offsetof(struct pcb, pcb_xsmask));
+ASSYM(PCB_FPUSUSPEND, offsetof(struct pcb, pcb_fpususpend));
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_FULL_IRET, PCB_FULL_IRET);
 ASSYM(PCB_DBREGS, PCB_DBREGS);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/machdep.c
--- a/head/sys/amd64/amd64/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/machdep.c 234105 2012-04-10 16:08:46Z marius $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/machdep.c 238623 2012-07-19 19:09:12Z kib $");
 
 #include "opt_atalk.h"
 #include "opt_atpic.h"
@@ -74,6 +74,7 @@
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
@@ -206,6 +207,8 @@
 
 struct mtx icu_lock;
 
+struct mem_range_softc mem_range_softc;
+
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 static void
@@ -296,12 +299,10 @@
 
 	cpu_setregs();
 
-#ifdef SMP
 	/*
 	 * Add BSP as an interrupt target.
 	 */
 	intr_add_cpu(0);
-#endif
 }
 
 /*
@@ -995,7 +996,7 @@
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
-		if (pcb == PCPU_GET(curpcb)) {
+		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/mem.c
--- a/head/sys/amd64/amd64/mem.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/mem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/mem.c 238310 2012-07-09 20:42:08Z jhb $");
 
 /*
  * Memory special file
@@ -72,8 +72,6 @@
  */
 MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors");
 
-struct mem_range_softc mem_range_softc;
-
 /* ARGSUSED */
 int
 memrw(struct cdev *dev, struct uio *uio, int flags)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/minidump_machdep.c
--- a/head/sys/amd64/amd64/minidump_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/minidump_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/minidump_machdep.c 230623 2012-01-27 20:18:31Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/minidump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_pmap.h"
 #include "opt_watchdog.h"
@@ -37,9 +37,7 @@
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/msgbuf.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
@@ -177,9 +175,9 @@
 			report_progress(progress, dumpsize);
 			counter &= (1<<24) - 1;
 		}
-#ifdef SW_WATCHDOG
+
 		wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		if (ptr) {
 			error = dump_write(di, ptr, 0, dumplo, len);
 			if (error)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/mp_machdep.c
--- a/head/sys/amd64/amd64/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 234208 2012-04-13 07:18:19Z avg $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/mp_machdep.c 237037 2012-06-13 22:53:56Z jkim $");
 
 #include "opt_cpu.h"
 #include "opt_kstack_pages.h"
@@ -100,7 +100,6 @@
 
 struct pcb stoppcbs[MAXCPU];
 struct pcb **susppcbs;
-void **suspfpusave;
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1;
@@ -982,6 +981,60 @@
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
+	ipi_startup(apic_id, vector);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return 1;	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return 0;		/* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
+    &ipi_range_size, 0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
 	/*
 	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
@@ -1032,52 +1085,8 @@
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (mp_naps > cpus)
-			return 1;	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return 0;		/* return FAILURE */
 }
 
-#ifdef COUNT_XINVLTLB_HITS
-u_int xhits_gbl[MAXCPU];
-u_int xhits_pg[MAXCPU];
-u_int xhits_rng[MAXCPU];
-static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
-    sizeof(xhits_gbl), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
-    sizeof(xhits_pg), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
-    sizeof(xhits_rng), "IU", "");
-
-u_int ipi_global;
-u_int ipi_page;
-u_int ipi_range;
-u_int ipi_range_size;
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW,
-    &ipi_range_size, 0, "");
-
-u_int ipi_masked_global;
-u_int ipi_masked_page;
-u_int ipi_masked_range;
-u_int ipi_masked_range_size;
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
-    &ipi_masked_global, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
-    &ipi_masked_page, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
-    &ipi_masked_range, 0, "");
-SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
-    &ipi_masked_range_size, 0, "");
-#endif /* COUNT_XINVLTLB_HITS */
-
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  */
@@ -1415,15 +1424,17 @@
 	cpu = PCPU_GET(cpuid);
 
 	if (savectx(susppcbs[cpu])) {
-		ctx_fpusave(suspfpusave[cpu]);
+		ctx_fpusave(susppcbs[cpu]->pcb_fpususpend);
 		wbinvd();
-		CPU_SET_ATOMIC(cpu, &stopped_cpus);
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
 	} else {
 		pmap_init_pat();
-		load_cr3(susppcbs[cpu]->pcb_cr3);
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we are resumed */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
 	/* Wait for resume */
@@ -1431,7 +1442,6 @@
 		ia32_pause();
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
-	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
 	/* Resume MCA and local APIC */
 	mca_resume();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/pmap.c
--- a/head/sys/amd64/amd64/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -77,7 +77,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 233954 2012-04-06 16:41:19Z alc $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 238610 2012-07-19 05:34:19Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -117,6 +117,7 @@
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
@@ -167,6 +168,39 @@
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
+#define	NPV_LIST_LOCKS	MAXCPU
+
+#define	PHYS_TO_PV_LIST_LOCK(pa)	\
+			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
+
+#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
+	struct rwlock **_lockp = (lockp);		\
+	struct rwlock *_new_lock;			\
+							\
+	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
+	if (_new_lock != *_lockp) {			\
+		if (*_lockp != NULL)			\
+			rw_wunlock(*_lockp);		\
+		*_lockp = _new_lock;			\
+		rw_wlock(*_lockp);			\
+	}						\
+} while (0)
+
+#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
+			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
+
+#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
+	struct rwlock **_lockp = (lockp);		\
+							\
+	if (*_lockp != NULL) {				\
+		rw_wunlock(*_lockp);			\
+		*_lockp = NULL;				\
+	}						\
+} while (0)
+
+#define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
+			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
+
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
@@ -199,9 +233,22 @@
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 
 /*
+ * Isolate the global pv list lock from data and other locks to prevent false
+ * sharing within the cache.
+ */
+static struct {
+	struct rwlock	lock;
+	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
+} pvh_global __aligned(CACHE_LINE_SIZE);
+
+#define	pvh_global_lock	pvh_global.lock
+
+/*
  * Data for the pv entry allocation mechanism
  */
-static long pv_entry_count;
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
+static struct mtx pv_chunks_mutex;
+static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 
 /*
@@ -215,11 +262,19 @@
  */
 static caddr_t crashdumpmap;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, boolean_t try);
-static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
-static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
-static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
+static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
+static int	popcnt_pc_map_elem(uint64_t elem);
+static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
+static void	reserve_pv_entries(pmap_t pmap, int needed,
+		    struct rwlock **lockp);
+static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
+static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
+static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+		    struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
@@ -227,12 +282,14 @@
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
+    vm_offset_t va, struct rwlock **lockp);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
     vm_offset_t va);
 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
-    vm_prot_t prot);
+    vm_prot_t prot, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
-    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
+    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
@@ -240,30 +297,32 @@
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
-static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
+static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp);
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-		vm_page_t *free);
+		vm_page_t *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
-		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
+		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free,
+		struct rwlock **lockp);
 static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     vm_page_t *free);
-static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
-		vm_offset_t va);
-static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
-    vm_page_t m);
+    vm_page_t m, struct rwlock **lockp);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
 
-static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
-static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
-
-static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
+static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
+		struct rwlock **lockp);
+static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
+		struct rwlock **lockp);
+static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
+		struct rwlock **lockp);
+
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
                 vm_page_t* free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
@@ -580,6 +639,11 @@
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
+ 	/*
+	 * Initialize the global pv list lock.
+	 */
+	rw_init(&pvh_global_lock, "pmap pv global");
+
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
@@ -744,6 +808,17 @@
 	}
 
 	/*
+	 * Initialize the pv chunk list mutex.
+	 */
+	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
+
+	/*
+	 * Initialize the pool of pv list locks.
+	 */
+	for (i = 0; i < NPV_LIST_LOCKS; i++)
+		rw_init(&pv_list_locks[i], "pmap pv list");
+
+	/*
 	 * Calculate the size of the pv head table for superpages.
 	 */
 	for (i = 0; phys_avail[i + 1]; i += 2);
@@ -1625,8 +1700,10 @@
 }
 
 /*
- * this routine is called if the page table page is not
- * mapped correctly.
+ * This routine is called if the desired page table page does not exist.
+ *
+ * If page table page allocation fails, this routine may sleep before
+ * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
@@ -1634,25 +1711,23 @@
  * race conditions.
  */
 static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, pdppg, pdpg;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
-
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
-		if (flags & M_WAITOK) {
+		if (lockp != NULL) {
+			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			rw_runlock(&pvh_global_lock);
 			VM_WAIT;
-			vm_page_lock_queues();
+			rw_rlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
@@ -1693,7 +1768,7 @@
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
-			    flags) == NULL) {
+			    lockp) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free_zero(m);
@@ -1726,7 +1801,7 @@
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
-			    flags) == NULL) {
+			    lockp) == NULL) {
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free_zero(m);
@@ -1740,7 +1815,7 @@
 			if ((*pdp & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
-				    flags) == NULL) {
+				    lockp) == NULL) {
 					--m->wire_count;
 					atomic_subtract_int(&cnt.v_wire_count,
 					    1);
@@ -1766,15 +1841,12 @@
 }
 
 static vm_page_t
-pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t pdpindex, ptepindex;
 	pdp_entry_t *pdpe;
 	vm_page_t pdpg;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
 retry:
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
@@ -1785,24 +1857,20 @@
 		/* Allocate a pd page. */
 		ptepindex = pmap_pde_pindex(va);
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
-		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
-		if (pdpg == NULL && (flags & M_WAITOK))
+		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+		if (pdpg == NULL && lockp != NULL)
 			goto retry;
 	}
 	return (pdpg);
 }
 
 static vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
+pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pd;
 	vm_page_t m;
 
-	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
-	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
-	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
-
 	/*
 	 * Calculate pagetable page index
 	 */
@@ -1818,7 +1886,7 @@
 	 * normal 4K page.
 	 */
 	if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
-		if (!pmap_demote_pde(pmap, pd, va)) {
+		if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
 			/*
 			 * Invalidation of the 2MB page mapping may have caused
 			 * the deallocation of the underlying PD page.
@@ -1839,8 +1907,8 @@
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
-		m = _pmap_allocpte(pmap, ptepindex, flags);
-		if (m == NULL && (flags & M_WAITOK))
+		m = _pmap_allocpte(pmap, ptepindex, lockp);
+		if (m == NULL && lockp != NULL)
 			goto retry;
 	}
 	return (m);
@@ -1993,7 +2061,7 @@
 pv_to_chunk(pv_entry_t pv)
 {
 
-	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
+	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
 }
 
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
@@ -2002,10 +2070,7 @@
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
-static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
-
-SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
-	"Current number of pv entries");
+static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
@@ -2019,80 +2084,159 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
-static long pv_entry_frees, pv_entry_allocs;
+static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
+SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
+	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
+ *
+ * Returns NULL if PV entries were reclaimed from the specified pmap.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
+	vm_page_t free, m, m_pc;
+	uint64_t inuse;
+	int bit, field, freed;
+	
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&new_tail);
+	mtx_lock(&pv_chunks_mutex);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
+			/* Avoid deadlock and lock recursion. */
+			if (pmap > locked_pmap) {
+				RELEASE_PV_LIST_LOCK(lockp);
+				PMAP_LOCK(pmap);
+			} else if (pmap != locked_pmap &&
+			    !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+				mtx_lock(&pv_chunks_mutex);
+				continue;
+			}
+		}
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfq(inuse);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va = pv->pv_va;
+				pde = pmap_pde(pmap, va);
+				if ((*pde & PG_PS) != 0)
+					continue;
+				pte = pmap_pde_to_pte(pde, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list) &&
+				    (m->flags & PG_FICTITIOUS) == 0) {
+					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+					if (TAILQ_EMPTY(&pvh->pv_list)) {
+						vm_page_aflag_clear(m,
+						    PGA_WRITEABLE);
+					}
+				}
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, *pde, &free);	
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+			mtx_lock(&pv_chunks_mutex);
 			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
-			/* Avoid deadlock and lock recursion. */
-			if (pmap > locked_pmap)
-				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
-				continue;
-			pmap_resident_count_dec(pmap, 1);
-			pde = pmap_pde(pmap, va);
-			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
-			    " a 2mpage in page %p's pv list", m));
-			pte = pmap_pde_to_pte(pde, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#lx", tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, *pde, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list) &&
-		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+		/* Every freed mapping is for a 4 KB page. */
+		pmap_resident_count_dec(pmap, freed);
+		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
+		    pc->pc_map[2] == PC_FREE2) {
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
+			dump_drop_page(m_pc->phys_addr);
+			mtx_lock(&pv_chunks_mutex);
+			break;
+		}
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		mtx_lock(&pv_chunks_mutex);
+		/* One freed pv entry in locked_pmap is sufficient. */
+		if (pmap == locked_pmap)
+			break;
 	}
+	TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
 /*
@@ -2101,15 +2245,14 @@
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	PV_STAT(pv_entry_frees++);
-	PV_STAT(pv_entry_spare++);
-	pv_entry_count--;
+	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
+	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 	pc = pv_to_chunk(pv);
 	idx = pv - &pc->pc_pventry[0];
 	field = idx / 64;
@@ -2125,9 +2268,20 @@
 		return;
 	}
 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-	PV_STAT(pv_entry_spare -= _NPCPV);
-	PV_STAT(pc_chunk_count--);
-	PV_STAT(pc_chunk_frees++);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
+	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(m->phys_addr);
@@ -2136,22 +2290,24 @@
 }
 
 /*
- * get a new pv_entry, allocating a block from the system
- * when needed.
+ * Returns a new PV entry, allocating a new PV chunk from the system when
+ * needed.  If this PV chunk allocation fails and a PV list lock pointer was
+ * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
+ * returned.
+ *
+ * The given PV list lock may be released.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, boolean_t try)
+get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	PV_STAT(pv_entry_allocs++);
-	pq = NULL;
+	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2171,52 +2327,130 @@
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
-			pv_entry_count++;
-			PV_STAT(pv_entry_spare--);
+			PV_STAT(atomic_add_long(&pv_entry_count, 1));
+			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
-	m = vm_page_alloc(NULL, 0, (pq == &vm_page_queues[PQ_ACTIVE] ?
-	    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ |
+	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
-		if (try) {
+		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to inactive
-		 * pages.  After that, if a pv chunk entry is still needed,
-		 * destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: allocation failed");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = reclaim_pv_chunk(pmap, lockp);
+		if (m == NULL)
+			goto retry;
 	}
-	PV_STAT(pc_chunk_count++);
-	PV_STAT(pc_chunk_allocs++);
+	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
+	mtx_lock(&pv_chunks_mutex);
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+	mtx_unlock(&pv_chunks_mutex);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
-	pv_entry_count++;
-	PV_STAT(pv_entry_spare += _NPCPV - 1);
+	PV_STAT(atomic_add_long(&pv_entry_count, 1));
+	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
+ * Returns the number of one bits within the given PV chunk map element.
+ */
+static int
+popcnt_pc_map_elem(uint64_t elem)
+{
+	int count;
+
+	/*
+	 * This simple method of counting the one bits performs well because
+	 * the given element typically contains more zero bits than one bits.
+	 */
+	count = 0;
+	for (; elem != 0; elem &= elem - 1)
+		count++;
+	return (count);
+}
+
+/*
+ * Ensure that the number of spare PV entries in the specified pmap meets or
+ * exceeds the given count, "needed".
+ *
+ * The given PV list lock may be released.
+ */
+static void
+reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+{
+	struct pch new_tail;
+	struct pv_chunk *pc;
+	int avail, free;
+	vm_page_t m;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
+
+	/*
+	 * Newly allocated PV chunks must be stored in a private list until
+	 * the required number of PV chunks have been allocated.  Otherwise,
+	 * reclaim_pv_chunk() could recycle one of these chunks.  In
+	 * contrast, these chunks must be added to the pmap upon allocation.
+	 */
+	TAILQ_INIT(&new_tail);
+retry:
+	avail = 0;
+	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
+		if ((cpu_feature2 & CPUID2_POPCNT) == 0) {
+			free = popcnt_pc_map_elem(pc->pc_map[0]);
+			free += popcnt_pc_map_elem(pc->pc_map[1]);
+			free += popcnt_pc_map_elem(pc->pc_map[2]);
+		} else {
+			free = popcntq(pc->pc_map[0]);
+			free += popcntq(pc->pc_map[1]);
+			free += popcntq(pc->pc_map[2]);
+		}
+		if (free == 0)
+			break;
+		avail += free;
+		if (avail >= needed)
+			break;
+	}
+	for (; avail < needed; avail += _NPCPV) {
+		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+		    VM_ALLOC_WIRED);
+		if (m == NULL) {
+			m = reclaim_pv_chunk(pmap, lockp);
+			if (m == NULL)
+				goto retry;
+		}
+		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
+		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
+		dump_add_page(m->phys_addr);
+		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
+		pc->pc_pmap = pmap;
+		pc->pc_map[0] = PC_FREE0;
+		pc->pc_map[1] = PC_FREE1;
+		pc->pc_map[2] = PC_FREE2;
+		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
+	}
+	if (!TAILQ_EMPTY(&new_tail)) {
+		mtx_lock(&pv_chunks_mutex);
+		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
+		mtx_unlock(&pv_chunks_mutex);
+	}
+}
+
+/*
  * First find and then remove the pv entry for the specified pmap and virtual
  * address from the specified pv list.  Returns the pv entry if found and NULL
  * otherwise.  This operation can be performed on pv lists for either 4KB or
@@ -2227,7 +2461,7 @@
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2243,20 +2477,26 @@
  * entries for each of the 4KB page mappings.
  */
 static void
-pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
+	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
-
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	int bit, field;
+
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
-	 * page's pv list.
+	 * page's pv list.  Once this transfer begins, the pv list lock
+	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
@@ -2265,14 +2505,37 @@
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
+	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 	va_last = va + NBPDR - PAGE_SIZE;
-	do {
-		m++;
-		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("pmap_pv_demote_pde: page %p is not managed", m));
-		va += PAGE_SIZE;
-		pmap_insert_entry(pmap, va, m);
-	} while (va < va_last);
+	for (;;) {
+		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
+		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
+		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
+		for (field = 0; field < _NPCM; field++) {
+			while (pc->pc_map[field]) {
+				bit = bsfq(pc->pc_map[field]);
+				pc->pc_map[field] &= ~(1ul << bit);
+				pv = &pc->pc_pventry[field * 64 + bit];
+				va += PAGE_SIZE;
+				pv->pv_va = va;
+				m++;
+				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+			    ("pmap_pv_demote_pde: page %p is not managed", m));
+				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+				if (va == va_last)
+					goto out;
+			}
+		}
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+out:
+	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
+	}
+	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
+	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 }
 
 /*
@@ -2281,23 +2544,25 @@
  * for the 2MB page mapping.
  */
 static void
-pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
+	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
-	 * Transfer the first page's pv entry for this mapping to the
-	 * 2mpage's pv list.  Aside from avoiding the cost of a call
-	 * to get_pv_entry(), a transfer avoids the possibility that
-	 * get_pv_entry() calls pmap_collect() and that pmap_collect()
-	 * removes one of the mappings that is being promoted.
+	 * Transfer the first page's pv entry for this mapping to the 2mpage's
+	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
+	 * a transfer avoids the possibility that get_pv_entry() calls
+	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
+	 * mappings that is being promoted.
 	 */
 	m = PHYS_TO_VM_PAGE(pa);
 	va = trunc_2mpage(va);
@@ -2329,48 +2594,22 @@
 	free_pv_entry(pmap, pv);
 }
 
-static void
-pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
-{
-	struct md_page *pvh;
-
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pmap_pvh_free(&m->md, pmap, va);
-	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
-		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
-		if (TAILQ_EMPTY(&pvh->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
-	}
-}
-
 /*
- * Create a pv entry for page at pa for
- * (pmap, va).
+ * Conditionally create the PV entry for a 4KB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
  */
-static void
-pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
+static boolean_t
+pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
+    struct rwlock **lockp)
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	pv = get_pv_entry(pmap, FALSE);
-	pv->pv_va = va;
-	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-}
-
-/*
- * Conditionally create a pv entry.
- */
-static boolean_t
-pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
-{
-	pv_entry_t pv;
-
-	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 		return (TRUE);
 	} else
@@ -2378,17 +2617,22 @@
 }
 
 /*
- * Create the pv entry for a 2MB page mapping.
+ * Conditionally create the PV entry for a 2MB page mapping if the required
+ * memory can be allocated without resorting to reclamation.
  */
 static boolean_t
-pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
+pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
+    struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	if ((pv = get_pv_entry(pmap, TRUE)) != NULL) {
+	rw_assert(&pvh_global_lock, RA_LOCKED);
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	/* Pass NULL instead of the lock pointer to disable reclamation. */
+	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
 		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 		pvh = pa_to_pvh(pa);
 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list);
 		return (TRUE);
@@ -2417,6 +2661,20 @@
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
+	struct rwlock *lock;
+	boolean_t rv;
+
+	lock = NULL;
+	rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
+	return (rv);
+}
+
+static boolean_t
+pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp)
+{
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	vm_paddr_t mptepa;
@@ -2451,7 +2709,8 @@
 		    DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 			free = NULL;
-			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
+			pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free,
+			    lockp);
 			pmap_invalidate_page(pmap, trunc_2mpage(va));
 			pmap_free_zero_pages(free);
 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
@@ -2491,6 +2750,17 @@
 		pmap_fill_ptp(firstpte, newpte);
 
 	/*
+	 * The spare PV entries must be reserved prior to demoting the
+	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
+	 * of the PDE and the PV lists will be inconsistent, which can result
+	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
+	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
+	 * PV entry for the 2MB page mapping that is being demoted.
+	 */
+	if ((oldpde & PG_MANAGED) != 0)
+		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
+
+	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
@@ -2509,18 +2779,12 @@
 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 
 	/*
-	 * Demote the pv entry.  This depends on the earlier demotion
-	 * of the mapping.  Specifically, the (re)creation of a per-
-	 * page pv entry might trigger the execution of pmap_collect(),
-	 * which might reclaim a newly (re)created per-page pv entry
-	 * and destroy the associated mapping.  In order to destroy
-	 * the mapping, the PDE must have already changed from mapping
-	 * the 2mpage to referencing the page table page.
+	 * Demote the PV entry.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
-		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
-
-	pmap_pde_demotions++;
+		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
+
+	atomic_add_long(&pmap_pde_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
@@ -2531,7 +2795,7 @@
  */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
-    vm_page_t *free)
+    vm_page_t *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
@@ -2553,6 +2817,7 @@
 		pmap_invalidate_page(kernel_pmap, sva);
 	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 	if (oldpde & PG_MANAGED) {
+		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
@@ -2568,7 +2833,7 @@
 		}
 	}
 	if (pmap == kernel_pmap) {
-		if (!pmap_demote_pde(pmap, pdq, sva))
+		if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp))
 			panic("pmap_remove_pde: failed demotion");
 	} else {
 		mpte = pmap_lookup_pt_page(pmap, sva);
@@ -2590,8 +2855,9 @@
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
-    pd_entry_t ptepde, vm_page_t *free)
+    pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp)
 {
+	struct md_page *pvh;
 	pt_entry_t oldpte;
 	vm_page_t m;
 
@@ -2606,7 +2872,14 @@
 			vm_page_dirty(m);
 		if (oldpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
-		pmap_remove_entry(pmap, m, va);
+		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
+		pmap_pvh_free(&m->md, pmap, va);
+		if (TAILQ_EMPTY(&m->md.pv_list) &&
+		    (m->flags & PG_FICTITIOUS) == 0) {
+			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+			if (TAILQ_EMPTY(&pvh->pv_list))
+				vm_page_aflag_clear(m, PGA_WRITEABLE);
+		}
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
@@ -2617,6 +2890,7 @@
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 {
+	struct rwlock *lock;
 	pt_entry_t *pte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -2625,7 +2899,10 @@
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		return;
-	pmap_remove_pte(pmap, pte, va, *pde, free);
+	lock = NULL;
+	pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
 	pmap_invalidate_page(pmap, va);
 }
 
@@ -2638,6 +2915,7 @@
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
+	struct rwlock *lock;
 	vm_offset_t va, va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
@@ -2654,7 +2932,7 @@
 
 	anyvalid = 0;
 
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 
 	/*
@@ -2670,6 +2948,7 @@
 		}
 	}
 
+	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 
 		if (pmap->pm_stats.resident_count == 0)
@@ -2722,9 +3001,10 @@
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
-				pmap_remove_pde(pmap, pde, sva, &free);
+				pmap_remove_pde(pmap, pde, sva, &free, &lock);
 				continue;
-			} else if (!pmap_demote_pde(pmap, pde, sva)) {
+			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
+			    &lock)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
@@ -2753,7 +3033,8 @@
 				anyvalid = 1;
 			else if (va == va_next)
 				va = sva;
-			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) {
+			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free,
+			    &lock)) {
 				sva += PAGE_SIZE;
 				break;
 			}
@@ -2761,10 +3042,12 @@
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
+	if (lock != NULL)
+		rw_wunlock(lock);
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();	
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -2796,7 +3079,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	free = NULL;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2835,7 +3118,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_free_zero_pages(free);
 }
 
@@ -2956,12 +3239,12 @@
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
-					if (!mtx_trylock(&vm_page_queue_mtx)) {
+					if (!rw_try_rlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all(
 							    pmap);
 						PMAP_UNLOCK(pmap);
-						vm_page_lock_queues();
+						rw_rlock(&pvh_global_lock);
 						goto resume;
 					}
 				}
@@ -3012,7 +3295,7 @@
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked)
-		vm_page_unlock_queues();
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3024,7 +3307,8 @@
  * identical characteristics. 
  */
 static void
-pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
+pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
+    struct rwlock **lockp)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
@@ -3042,7 +3326,7 @@
 setpde:
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
-		pmap_pde_p_failures++;
+		atomic_add_long(&pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
@@ -3067,7 +3351,7 @@
 setpte:
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
-			pmap_pde_p_failures++;
+			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
@@ -3086,7 +3370,7 @@
 			    " in pmap %p", oldpteva, pmap);
 		}
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
-			pmap_pde_p_failures++;
+			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
@@ -3111,7 +3395,7 @@
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
-		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
+		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 
 	/*
 	 * Propagate the PAT index to its proper position.
@@ -3127,7 +3411,7 @@
 	else
 		pde_store(pde, PG_PS | newpde);
 
-	pmap_pde_promotions++;
+	atomic_add_long(&pmap_pde_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 }
@@ -3148,6 +3432,7 @@
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
     vm_prot_t prot, boolean_t wired)
 {
+	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	pt_entry_t newpte, origpte;
@@ -3161,115 +3446,16 @@
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
+	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
+	    va >= kmi.clean_eva,
+	    ("pmap_enter: managed mapping within the clean submap"));
 	KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 ||
 	    VM_OBJECT_LOCKED(m->object),
 	    ("pmap_enter: page %p is not busy", m));
-
-	mpte = NULL;
-
-	vm_page_lock_queues();
-	PMAP_LOCK(pmap);
-
-	/*
-	 * In the case that a page table page is not
-	 * resident, we are creating it here.
-	 */
-	if (va < VM_MAXUSER_ADDRESS)
-		mpte = pmap_allocpte(pmap, va, M_WAITOK);
-
-	pde = pmap_pde(pmap, va);
-	if (pde != NULL && (*pde & PG_V) != 0) {
-		if ((*pde & PG_PS) != 0)
-			panic("pmap_enter: attempted pmap_enter on 2MB page");
-		pte = pmap_pde_to_pte(pde, va);
-	} else
-		panic("pmap_enter: invalid page directory va=%#lx", va);
-
 	pa = VM_PAGE_TO_PHYS(m);
-	om = NULL;
-	origpte = *pte;
-	opa = origpte & PG_FRAME;
-
-	/*
-	 * Mapping has not changed, must be protection or wiring change.
-	 */
-	if (origpte && (opa == pa)) {
-		/*
-		 * Wiring change, just update stats. We don't worry about
-		 * wiring PT pages as they remain resident as long as there
-		 * are valid mappings in them. Hence, if a user page is wired,
-		 * the PT page will be also.
-		 */
-		if (wired && ((origpte & PG_W) == 0))
-			pmap->pm_stats.wired_count++;
-		else if (!wired && (origpte & PG_W))
-			pmap->pm_stats.wired_count--;
-
-		/*
-		 * Remove extra pte reference
-		 */
-		if (mpte)
-			mpte->wire_count--;
-
-		if (origpte & PG_MANAGED) {
-			om = m;
-			pa |= PG_MANAGED;
-		}
-		goto validate;
-	} 
-
-	pv = NULL;
-
-	/*
-	 * Mapping has changed, invalidate old range and fall through to
-	 * handle validating new mapping.
-	 */
-	if (opa) {
-		if (origpte & PG_W)
-			pmap->pm_stats.wired_count--;
-		if (origpte & PG_MANAGED) {
-			om = PHYS_TO_VM_PAGE(opa);
-			pv = pmap_pvh_remove(&om->md, pmap, va);
-		}
-		if (mpte != NULL) {
-			mpte->wire_count--;
-			KASSERT(mpte->wire_count > 0,
-			    ("pmap_enter: missing reference to page table page,"
-			     " va: 0x%lx", va));
-		}
-	} else
-		pmap_resident_count_inc(pmap, 1);
-
-	/*
-	 * Enter on the PV list if part of our managed memory.
-	 */
-	if ((m->oflags & VPO_UNMANAGED) == 0) {
-		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
-		    ("pmap_enter: managed mapping within the clean submap"));
-		if (pv == NULL)
-			pv = get_pv_entry(pmap, FALSE);
-		pv->pv_va = va;
-		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-		pa |= PG_MANAGED;
-	} else if (pv != NULL)
-		free_pv_entry(pmap, pv);
-
-	/*
-	 * Increment counters
-	 */
-	if (wired)
-		pmap->pm_stats.wired_count++;
-
-validate:
-	/*
-	 * Now validate mapping with desired protection/wiring.
-	 */
 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
-	if ((prot & VM_PROT_WRITE) != 0) {
+	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
-		if ((newpte & PG_MANAGED) != 0)
-			vm_page_aflag_set(m, PGA_WRITEABLE);
-	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (wired)
@@ -3279,40 +3465,143 @@
 	if (pmap == kernel_pmap)
 		newpte |= PG_G;
 
+	mpte = om = NULL;
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
+	PMAP_LOCK(pmap);
+
 	/*
-	 * if the mapping or permission bits are different, we need
-	 * to update the pte.
+	 * In the case that a page table page is not
+	 * resident, we are creating it here.
 	 */
-	if ((origpte & ~(PG_M|PG_A)) != newpte) {
-		newpte |= PG_A;
-		if ((access & VM_PROT_WRITE) != 0)
-			newpte |= PG_M;
-		if (origpte & PG_V) {
-			invlva = FALSE;
-			origpte = pte_load_store(pte, newpte);
-			if (origpte & PG_A) {
-				if (origpte & PG_MANAGED)
-					vm_page_aflag_set(om, PGA_REFERENCED);
-				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
-				    PG_NX) == 0 && (newpte & PG_NX)))
-					invlva = TRUE;
+retry:
+	pde = pmap_pde(pmap, va);
+	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
+	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
+		pte = pmap_pde_to_pte(pde, va);
+		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
+			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
+			mpte->wire_count++;
+		}
+	} else if (va < VM_MAXUSER_ADDRESS) {
+		/*
+		 * Here if the pte page isn't mapped, or if it has been
+		 * deallocated.
+		 */
+		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock);
+		goto retry;
+	} else
+		panic("pmap_enter: invalid page directory va=%#lx", va);
+
+	origpte = *pte;
+	opa = origpte & PG_FRAME;
+
+	/*
+	 * Is the specified virtual address already mapped?
+	 */
+	if ((origpte & PG_V) != 0) {
+		/*
+		 * Wiring change, just update stats. We don't worry about
+		 * wiring PT pages as they remain resident as long as there
+		 * are valid mappings in them. Hence, if a user page is wired,
+		 * the PT page will be also.
+		 */
+		if (wired && (origpte & PG_W) == 0)
+			pmap->pm_stats.wired_count++;
+		else if (!wired && (origpte & PG_W))
+			pmap->pm_stats.wired_count--;
+
+		/*
+		 * Remove the extra PT page reference.
+		 */
+		if (mpte != NULL) {
+			mpte->wire_count--;
+			KASSERT(mpte->wire_count > 0,
+			    ("pmap_enter: missing reference to page table page,"
+			     " va: 0x%lx", va));
+		}
+
+		/*
+		 * Has the mapping changed?
+		 */
+		if (opa == pa) {
+			/*
+			 * No, might be a protection or wiring change.
+			 */
+			if ((origpte & PG_MANAGED) != 0) {
+				newpte |= PG_MANAGED;
+				om = m;
 			}
-			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
-				if ((origpte & PG_MANAGED) != 0)
-					vm_page_dirty(om);
-				if ((newpte & PG_RW) == 0)
-					invlva = TRUE;
-			}
-			if ((origpte & PG_MANAGED) != 0 &&
+			if ((origpte & ~(PG_M | PG_A)) == newpte)
+				goto unchanged;
+			goto validate;
+		} else {
+			/*
+			 * Yes, fall through to validate the new mapping.
+			 */
+			if ((origpte & PG_MANAGED) != 0)
+				om = PHYS_TO_VM_PAGE(opa);
+		}
+	} else {
+		/*
+		 * Increment the counters.
+		 */
+		if (wired)
+			pmap->pm_stats.wired_count++;
+		pmap_resident_count_inc(pmap, 1);
+	}
+
+	/*
+	 * Enter on the PV list if part of our managed memory.
+	 */
+	if ((m->oflags & VPO_UNMANAGED) == 0) {
+		newpte |= PG_MANAGED;
+		pv = get_pv_entry(pmap, &lock);
+		pv->pv_va = va;
+		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
+		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+	}
+
+validate:
+
+	/*
+	 * Update the PTE.
+	 */
+	newpte |= PG_A;
+	if ((access & VM_PROT_WRITE) != 0)
+		newpte |= PG_M;
+	if ((newpte & (PG_MANAGED | PG_RW)) == (PG_MANAGED | PG_RW))
+		vm_page_aflag_set(m, PGA_WRITEABLE);
+	if ((origpte & PG_V) != 0) {
+		invlva = FALSE;
+		origpte = pte_load_store(pte, newpte);
+		if ((origpte & PG_A) != 0 && (opa != pa ||
+		    ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0)))
+			invlva = TRUE;
+		if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
+			if ((origpte & PG_MANAGED) != 0)
+				vm_page_dirty(om);
+			if ((newpte & PG_RW) == 0)
+				invlva = TRUE;
+		}
+		if (opa != pa && (origpte & PG_MANAGED) != 0) {
+			if ((origpte & PG_A) != 0)
+				vm_page_aflag_set(om, PGA_REFERENCED);
+			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
+			pmap_pvh_free(&om->md, pmap, va);
+			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
-			if (invlva)
-				pmap_invalidate_page(pmap, va);
-		} else
-			pte_store(pte, newpte);
-	}
+		}
+		if (invlva)
+			pmap_invalidate_page(pmap, va);
+	} else
+		pte_store(pte, newpte);
+
+unchanged:
 
 	/*
 	 * If both the page table page and the reservation are fully
@@ -3321,9 +3610,11 @@
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
-		pmap_promote_pde(pmap, pde, va);
-
-	vm_page_unlock_queues();
+		pmap_promote_pde(pmap, pde, va, &lock);
+
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3334,14 +3625,15 @@
  * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
  */
 static boolean_t
-pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
+pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
+    struct rwlock **lockp)
 {
 	pd_entry_t *pde, newpde;
 	vm_page_t free, mpde;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
+	if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
@@ -3364,7 +3656,8 @@
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
-		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
+		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m),
+		    lockp)) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) {
 				pmap_invalidate_page(pmap, va);
@@ -3390,7 +3683,7 @@
 	 */
 	pde_store(pde, newpde);
 
-	pmap_pde_mappings++;
+	atomic_add_long(&pmap_pde_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
@@ -3412,6 +3705,7 @@
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
+	struct rwlock *lock;
 	vm_offset_t va;
 	vm_page_t m, mpte;
 	vm_pindex_t diff, psize;
@@ -3420,21 +3714,24 @@
 	psize = atop(end - start);
 	mpte = NULL;
 	m = m_start;
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		va = start + ptoa(diff);
 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 		    (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
 		    pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
-		    pmap_enter_pde(pmap, va, m, prot))
+		    pmap_enter_pde(pmap, va, m, prot, &lock))
 			m = &m[NBPDR / PAGE_SIZE - 1];
 		else
 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
-			    mpte);
+			    mpte, &lock);
 		m = TAILQ_NEXT(m, listq);
 	}
-	vm_page_unlock_queues();
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3450,17 +3747,21 @@
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
-
-	vm_page_lock_queues();
+	struct rwlock *lock;
+
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
-	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
-	vm_page_unlock_queues();
+	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
-    vm_prot_t prot, vm_page_t mpte)
+    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	vm_page_t free;
 	pt_entry_t *pte;
@@ -3469,7 +3770,7 @@
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
@@ -3494,7 +3795,9 @@
 
 			/*
 			 * If the page table page is mapped, we just increment
-			 * the hold count, and activate it.
+			 * the hold count, and activate it.  Otherwise, we
+			 * attempt to allocate a page table page.  If this
+			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (ptepa && (*ptepa & PG_V) != 0) {
 				if (*ptepa & PG_PS)
@@ -3502,8 +3805,11 @@
 				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
-				mpte = _pmap_allocpte(pmap, ptepindex,
-				    M_NOWAIT);
+				/*
+				 * Pass NULL instead of the PV list lock
+				 * pointer, because we don't intend to sleep.
+				 */
+				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
@@ -3526,7 +3832,7 @@
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
-	    !pmap_try_insert_pv_entry(pmap, va, m)) {
+	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
@@ -3629,7 +3935,7 @@
 		PMAP_LOCK(pmap);
 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
 		    size; pa += NBPDR) {
-			pdpg = pmap_allocpde(pmap, addr, M_NOWAIT);
+			pdpg = pmap_allocpde(pmap, addr, NULL);
 			if (pdpg == NULL) {
 				/*
 				 * The creation of mappings below is only an
@@ -3647,7 +3953,7 @@
 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
 				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
-				pmap_pde_mappings++;
+				atomic_add_long(&pmap_pde_mappings, 1);
 			} else {
 				/* Continue on if the PDE is already valid. */
 				pdpg->wire_count--;
@@ -3673,9 +3979,9 @@
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
-	boolean_t are_queues_locked;
-
-	are_queues_locked = FALSE;
+	boolean_t pv_lists_locked;
+
+	pv_lists_locked = FALSE;
 
 	/*
 	 * Wiring is not a hardware characteristic so there is no need to
@@ -3686,11 +3992,11 @@
 	pde = pmap_pde(pmap, va);
 	if ((*pde & PG_PS) != 0) {
 		if (!wired != ((*pde & PG_W) == 0)) {
-			if (!are_queues_locked) {
-				are_queues_locked = TRUE;
-				if (!mtx_trylock(&vm_page_queue_mtx)) {
+			if (!pv_lists_locked) {
+				pv_lists_locked = TRUE;
+				if (!rw_try_rlock(&pvh_global_lock)) {
 					PMAP_UNLOCK(pmap);
-					vm_page_lock_queues();
+					rw_rlock(&pvh_global_lock);
 					goto retry;
 				}
 			}
@@ -3708,8 +4014,8 @@
 		atomic_clear_long(pte, PG_W);
 	}
 out:
-	if (are_queues_locked)
-		vm_page_unlock_queues();
+	if (pv_lists_locked)
+		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3725,6 +4031,7 @@
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
+	struct rwlock *lock;
 	vm_page_t   free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
@@ -3733,7 +4040,8 @@
 	if (dst_addr != src_addr)
 		return;
 
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
@@ -3777,7 +4085,7 @@
 			continue;
 			
 		if (srcptepaddr & PG_PS) {
-			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
+			dstmpde = pmap_allocpde(dst_pmap, addr, NULL);
 			if (dstmpde == NULL)
 				break;
 			pde = (pd_entry_t *)
@@ -3785,7 +4093,7 @@
 			pde = &pde[pmap_pde_index(addr)];
 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
-			    PG_PS_FRAME))) {
+			    PG_PS_FRAME, &lock))) {
 				*pde = srcptepaddr & ~PG_W;
 				pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
 			} else
@@ -3815,14 +4123,15 @@
 				    dstmpte->pindex == pmap_pde_pindex(addr))
 					dstmpte->wire_count++;
 				else if ((dstmpte = pmap_allocpte(dst_pmap,
-				    addr, M_NOWAIT)) == NULL)
+				    addr, NULL)) == NULL)
 					goto out;
 				dst_pte = (pt_entry_t *)
 				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 				dst_pte = &dst_pte[pmap_pte_index(addr)];
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
-				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
+				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
+				    &lock)) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
@@ -3849,7 +4158,9 @@
 		}
 	}
 out:
-	vm_page_unlock_queues();
+	if (lock != NULL)
+		rw_wunlock(lock);
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
@@ -3923,6 +4234,7 @@
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
+	struct rwlock *lock;
 	pv_entry_t pv;
 	int loops = 0;
 	boolean_t rv;
@@ -3930,7 +4242,9 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
@@ -3952,7 +4266,8 @@
 				break;
 		}
 	}
-	vm_page_unlock_queues();
+	rw_runlock(lock);
+	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -3970,13 +4285,13 @@
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	count = pmap_pvh_wired_mappings(&m->md, count);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 	        count);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -3992,7 +4307,7 @@
 	pt_entry_t *pte;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
@@ -4011,15 +4326,19 @@
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
+	struct rwlock *lock;
 	boolean_t rv;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_rlock(&pvh_global_lock);
+	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
+	rw_rlock(lock);
 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
-	vm_page_unlock_queues();
+	rw_runlock(lock);
+	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4041,21 +4360,23 @@
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
-	int field, idx;
+	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
-	int allfree;
+	int allfree, field, freed, idx;
 
 	if (pmap != PCPU_GET(curpmap)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-	vm_page_lock_queues();
+	lock = NULL;
+	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
+		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfq(inuse);
 				bitmask = 1UL << bit;
@@ -4109,10 +4430,9 @@
 						vm_page_dirty(m);
 				}
 
+				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+
 				/* Mark free */
-				PV_STAT(pv_entry_frees++);
-				PV_STAT(pv_entry_spare++);
-				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				if ((tpte & PG_PS) != 0) {
 					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
@@ -4120,7 +4440,8 @@
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
-							if (TAILQ_EMPTY(&mt->md.pv_list))
+							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
+							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
@@ -4136,7 +4457,8 @@
 				} else {
 					pmap_resident_count_dec(pmap, 1);
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-					if (TAILQ_EMPTY(&m->md.pv_list) &&
+					if ((m->aflags & PGA_WRITEABLE) != 0 &&
+					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
@@ -4144,21 +4466,21 @@
 					}
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
+				freed++;
 			}
 		}
+		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
+		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
+		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
-			dump_drop_page(m->phys_addr);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
+			free_pv_chunk(pc);
 		}
 	}
+	if (lock != NULL)
+		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -4186,11 +4508,11 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_modified_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4207,7 +4529,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4258,11 +4580,11 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_referenced_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4278,7 +4600,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4317,7 +4639,7 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4335,8 +4657,9 @@
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
-		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
-		    " a 2mpage in page %p's pv list", m));
+		KASSERT((*pde & PG_PS) == 0,
+		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
+		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 retry:
 		oldpte = *pte;
@@ -4351,7 +4674,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4379,7 +4702,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4437,7 +4760,7 @@
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 out:
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
@@ -4467,7 +4790,7 @@
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4516,7 +4839,7 @@
 		}
 		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4536,7 +4859,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -4576,7 +4899,7 @@
 		}
 		PMAP_UNLOCK(pmap);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/ptrace_machdep.c
--- a/head/sys/amd64/amd64/ptrace_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/ptrace_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/ptrace_machdep.c 232520 2012-03-04 20:24:28Z tijl $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/ptrace_machdep.c 238669 2012-07-21 13:06:37Z kib $");
 
 #include "opt_compat.h"
 
@@ -50,6 +50,7 @@
 
 	switch (req) {
 	case PT_GETXSTATE:
+		fpugetregs(td);
 		savefpu = (char *)(get_pcb_user_save_td(td) + 1);
 		error = copyout(savefpu, addr,
 		    cpu_max_ext_state_size - sizeof(struct savefpu));
@@ -62,8 +63,10 @@
 		}
 		savefpu = malloc(data, M_TEMP, M_WAITOK);
 		error = copyin(addr, savefpu, data);
-		if (error == 0)
+		if (error == 0) {
+			fpugetregs(td);
 			error = fpusetxstate(td, savefpu, data);
+		}
 		free(savefpu, M_TEMP);
 		break;
 
@@ -89,11 +92,13 @@
 
 	switch (req) {
 	case PT_I386_GETXMMREGS:
+		fpugetregs(td);
 		error = copyout(get_pcb_user_save_td(td), addr,
 		    sizeof(*fpstate));
 		break;
 
 	case PT_I386_SETXMMREGS:
+		fpugetregs(td);
 		fpstate = get_pcb_user_save_td(td);
 		error = copyin(addr, fpstate, sizeof(*fpstate));
 		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/trap.c
--- a/head/sys/amd64/amd64/trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/trap.c 233781 2012-04-02 15:07:22Z jhb $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/trap.c 238623 2012-07-19 19:09:12Z kib $");
 
 /*
  * AMD64 Trap and System call handling
@@ -328,7 +328,7 @@
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
-			ucode = fputrap();
+			ucode = fputrap_x87();
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
@@ -442,7 +442,9 @@
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
-			ucode = 0; /* XXX */
+			ucode = fputrap_sse();
+			if (ucode == -1)
+				goto userout;
 			i = SIGFPE;
 			break;
 		}
@@ -518,9 +520,8 @@
 				frame->tf_rip = (long)fsbase_load_fault;
 				goto out;
 			}
-			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
-				frame->tf_rip =
-				    (long)PCPU_GET(curpcb)->pcb_onfault;
+			if (curpcb->pcb_onfault != NULL) {
+				frame->tf_rip = (long)curpcb->pcb_onfault;
 				goto out;
 			}
 			break;
@@ -706,7 +707,7 @@
 		 * it normally, and panic immediately.
 		 */
 		if (!usermode && (td->td_intr_nesting_level != 0 ||
-		    PCPU_GET(curpcb)->pcb_onfault == NULL)) {
+		    curpcb->pcb_onfault == NULL)) {
 			trap_fatal(frame, eva);
 			return (-1);
 		}
@@ -762,8 +763,8 @@
 nogo:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
-		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
-			frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault;
+		    curpcb->pcb_onfault != NULL) {
+			frame->tf_rip = (long)curpcb->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
@@ -972,4 +973,15 @@
 	     syscallname(td->td_proc, sa.code)));
 
 	syscallret(td, error, &sa);
+
+	/*
+	 * If the user-supplied value of %rip is not a canonical
+	 * address, then some CPUs will trigger a ring 0 #GP during
+	 * the sysret instruction.  However, the fault handler would
+	 * execute in ring 0 with the user's %gs and %rsp which would
+	 * not be safe.  Instead, use the full return path which
+	 * catches the problem safely.
+	 */
+	if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
+		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/amd64/vm_machdep.c
--- a/head/sys/amd64/amd64/vm_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/amd64/vm_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/amd64/vm_machdep.c 231441 2012-02-10 21:26:25Z kib $");
+__FBSDID("$FreeBSD: head/sys/amd64/amd64/vm_machdep.c 238623 2012-07-19 19:09:12Z kib $");
 
 #include "opt_isa.h"
 #include "opt_cpu.h"
@@ -90,6 +90,10 @@
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
+CTASSERT((struct thread **)OFFSETOF_CURTHREAD ==
+    &((struct pcpu *)NULL)->pc_curthread);
+CTASSERT((struct pcb **)OFFSETOF_CURPCB == &((struct pcpu *)NULL)->pc_curpcb);
+
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/conf/GENERIC
--- a/head/sys/amd64/conf/GENERIC	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/conf/GENERIC	Wed Jul 25 16:40:53 2012 +0300
@@ -16,7 +16,7 @@
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
-# $FreeBSD: head/sys/amd64/conf/GENERIC 234504 2012-04-20 21:37:42Z brooks $
+# $FreeBSD: head/sys/amd64/conf/GENERIC 237901 2012-07-01 08:10:49Z delphij $
 
 cpu		HAMMER
 ident		GENERIC
@@ -28,6 +28,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
@@ -44,6 +45,7 @@
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_PART_GPT		# GUID Partition Tables.
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD32	# Compatible with i386 binaries
 options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
@@ -66,6 +68,7 @@
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_FRAME		# Ensure frames are compiled in
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
+options 	DDB_CTF			# Kernel ELF linker loads CTF data
 options 	INCLUDE_CONFIG_FILE     # Include this file in kernel
 
 # Debugging support.  Always need this:
@@ -75,7 +78,6 @@
 # For full debugger support use this instead:
 options 	DDB			# Support DDB.
 options 	GDB			# Support remote GDB.
-options 	DDB_CTF			# kernel ELF linker loads CTF data
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
@@ -150,6 +152,7 @@
 device		ips		# IBM (Adaptec) ServeRAID
 device		mly		# Mylex AcceleRAID/eXtremeRAID
 device		twa		# 3ware 9000 series PATA/SATA RAID
+device		tws		# LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
 
 # RAID controllers
 device		aac		# Adaptec FSA RAID
@@ -160,7 +163,6 @@
 #XXX pointer/int warnings
 #device		pst		# Promise Supertrak SX6000
 device		twe		# 3ware ATA RAID
-device		tws		# LSI 3ware 9750 SATA+SAS 6Gb/s RAID controller
 
 # atkbdc0 controls both the keyboard and the PS/2 mouse
 device		atkbdc		# AT keyboard controller
@@ -272,6 +274,8 @@
 device		ath_pci		# Atheros pci/cardbus glue
 device		ath_hal		# pci/cardbus chip support
 options 	AH_SUPPORT_AR5416	# enable AR5416 tx/rx descriptors
+options 	AH_AR5416_INTERRUPT_MITIGATION	# AR5416 interrupt mitigation
+options 	ATH_ENABLE_11N	# Enable 802.11n support for AR5416 and later
 device		ath_rate_sample	# SampleRate tx rate control for ath
 #device		bwi		# Broadcom BCM430x/BCM431x wireless NICs.
 #device		bwn		# Broadcom BCM43xx wireless NICs.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/atomic.h
--- a/head/sys/amd64/include/atomic.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/atomic.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/atomic.h 236456 2012-06-02 18:10:16Z kib $
  */
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
@@ -81,8 +81,9 @@
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 u_long	atomic_fetchadd_long(volatile u_long *p, u_long v);
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)			\
-u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p);	\
+#define	ATOMIC_LOAD(TYPE, LOP)					\
+u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
+#define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 #else /* !KLD_MODULE && __GNUCLIKE_ASM */
@@ -210,37 +211,43 @@
 	return (v);
 }
 
+/*
+ * We assume that a = b will do atomic loads and stores.  Due to the
+ * IA32 memory model, a simple store guarantees release semantics.
+ *
+ * However, loads may pass stores, so for atomic_load_acq we have to
+ * ensure a Store/Load barrier to do the load in SMP kernels.  We use
+ * "lock cmpxchg" as recommended by the AMD Software Optimization
+ * Guide, and not mfence.  For UP kernels, however, the cache of the
+ * single processor is always consistent, so we only need to take care
+ * of the compiler.
+ */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{							\
+	__asm __volatile("" : : : "memory");		\
+	*p = v;						\
+}							\
+struct __hack
+
 #if defined(_KERNEL) && !defined(SMP)
 
-/*
- * We assume that a = b will do atomic loads and stores.  However, on a
- * PentiumPro or higher, reads may pass writes, so for that case we have
- * to use a serializing instruction (i.e. with LOCK) to do the load in
- * SMP kernels.  For UP kernels, however, the cache of the single processor
- * is always consistent, so we only need to take care of compiler.
- */
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
 	u_##TYPE tmp;					\
 							\
 	tmp = *p;					\
-	__asm __volatile ("" : : : "memory");		\
+	__asm __volatile("" : : : "memory");		\
 	return (tmp);					\
 }							\
-							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile ("" : : : "memory");		\
-	*p = v;						\
-}							\
 struct __hack
 
 #else /* !(_KERNEL && !SMP) */
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -254,19 +261,6 @@
 							\
 	return (res);					\
 }							\
-							\
-/*							\
- * The XCHG instruction asserts LOCK automagically.	\
- */							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile(SOP				\
-	: "=m" (*p),			/* 0 */		\
-	  "+r" (v)			/* 1 */		\
-	: "m" (*p)			/* 2 */		\
-	: "memory");					\
-}							\
 struct __hack
 
 #endif /* _KERNEL && !SMP */
@@ -293,13 +287,19 @@
 ATOMIC_ASM(add,	     long,  "addq %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subq %1,%0",  "ir",  v);
 
-ATOMIC_STORE_LOAD(char,	"cmpxchgb %b0,%1", "xchgb %b1,%0");
-ATOMIC_STORE_LOAD(short,"cmpxchgw %w0,%1", "xchgw %w1,%0");
-ATOMIC_STORE_LOAD(int,	"cmpxchgl %0,%1",  "xchgl %1,%0");
-ATOMIC_STORE_LOAD(long,	"cmpxchgq %0,%1",  "xchgq %1,%0");
+ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
+ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
+ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
+ATOMIC_LOAD(long,  "cmpxchgq %0,%1");
+
+ATOMIC_STORE(char);
+ATOMIC_STORE(short);
+ATOMIC_STORE(int);
+ATOMIC_STORE(long);
 
 #undef ATOMIC_ASM
-#undef ATOMIC_STORE_LOAD
+#undef ATOMIC_LOAD
+#undef ATOMIC_STORE
 
 #ifndef WANT_FUNCTIONS
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/cpufunc.h
--- a/head/sys/amd64/include/cpufunc.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/cpufunc.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/cpufunc.h 232227 2012-02-27 17:28:47Z jhb $
+ * $FreeBSD: head/sys/amd64/include/cpufunc.h 238311 2012-07-09 20:55:39Z jhb $
  */
 
 /*
@@ -107,6 +107,13 @@
 }
 
 static __inline void
+clts(void)
+{
+
+	__asm __volatile("clts");
+}
+
+static __inline void
 disable_intr(void)
 {
 	__asm __volatile("cli" : : : "memory");
@@ -273,6 +280,15 @@
 	__asm __volatile("outw %0, %w1" : : "a" (data), "Nd" (port));
 }
 
+static __inline u_long
+popcntq(u_long mask)
+{
+	u_long result;
+
+	__asm __volatile("popcntq %1,%0" : "=r" (result) : "rm" (mask));
+	return (result);
+}
+
 static __inline void
 mfence(void)
 {
@@ -409,6 +425,25 @@
 	return (data);
 }
 
+static __inline u_long
+rxcr(u_int reg)
+{
+	u_int low, high;
+
+	__asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg));
+	return (low | ((uint64_t)high << 32));
+}
+
+static __inline void
+load_xcr(u_int reg, u_long val)
+{
+	u_int low, high;
+
+	low = val;
+	high = val >> 32;
+	__asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high));
+}
+
 /*
  * Global TLB flush (except for thise for pages marked PG_G)
  */
@@ -674,6 +709,9 @@
 int	breakpoint(void);
 u_int	bsfl(u_int mask);
 u_int	bsrl(u_int mask);
+void	clflush(u_long addr);
+void	clts(void);
+void	cpuid_count(u_int ax, u_int cx, u_int *p);
 void	disable_intr(void);
 void	do_cpuid(u_int ax, u_int *p);
 void	enable_intr(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/elf.h
--- a/head/sys/amd64/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -94,6 +94,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/fpu.h
--- a/head/sys/amd64/include/fpu.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/fpu.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.h	5.3 (Berkeley) 1/18/91
- * $FreeBSD: head/sys/amd64/include/fpu.h 233044 2012-03-16 20:24:30Z tijl $
+ * $FreeBSD: head/sys/amd64/include/fpu.h 238598 2012-07-18 15:43:47Z kib $
  */
 
 /*
@@ -62,7 +62,8 @@
 	    char *xfpustate, size_t xfpustate_size);
 int	fpusetxstate(struct thread *td, char *xfpustate,
 	    size_t xfpustate_size);
-int	fputrap(void);
+int	fputrap_sse(void);
+int	fputrap_x87(void);
 void	fpuuserinited(struct thread *td);
 struct fpu_kern_ctx *fpu_kern_alloc_ctx(u_int flags);
 void	fpu_kern_free_ctx(struct fpu_kern_ctx *ctx);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/in_cksum.h
--- a/head/sys/amd64/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/amd64/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -43,6 +43,7 @@
 
 #define in_cksum(m, len)	in_cksum_skip(m, len, 0)
 
+#if defined(IPVERSION) && (IPVERSION == 4)
 /*
  * It it useful to have an Internet checksum routine which is inlineable
  * and optimized specifically for the task of computing IP header checksums
@@ -69,9 +70,12 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short	in_addword(u_short sum, u_short b);
 u_short	in_pseudo(u_int sum, u_int b, u_int c);
 u_short	in_cksum_skip(struct mbuf *m, int len, int skip);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/intr_machdep.h
--- a/head/sys/amd64/include/intr_machdep.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/intr_machdep.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/intr_machdep.h 234207 2012-04-13 07:15:40Z avg $
+ * $FreeBSD: head/sys/amd64/include/intr_machdep.h 234989 2012-05-03 21:44:01Z attilio $
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
@@ -140,9 +140,7 @@
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
-#ifdef SMP
 void	intr_add_cpu(u_int cpu);
-#endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter, 
 			 driver_intr_t handler, void *arg, enum intr_type flags, 
 			 void **cookiep);    
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/md_var.h
--- a/head/sys/amd64/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/md_var.h 230426 2012-01-21 17:45:27Z kib $
+ * $FreeBSD: head/sys/amd64/include/md_var.h 238450 2012-07-14 15:48:30Z kib $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -57,6 +57,7 @@
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
+extern	char	ctx_switch_xsave[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pcb.h
--- a/head/sys/amd64/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)pcb.h	5.10 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/pcb.h 230426 2012-01-21 17:45:27Z kib $
+ * $FreeBSD: head/sys/amd64/include/pcb.h 237037 2012-06-13 22:53:56Z jkim $
  */
 
 #ifndef _AMD64_PCB_H_
@@ -91,9 +91,20 @@
 	/* local tss, with i/o bitmap; NULL for common */
 	struct amd64tss *pcb_tssp;
 
+	/* model specific registers */
+	register_t	pcb_efer;
+	register_t	pcb_star;
+	register_t	pcb_lstar;
+	register_t	pcb_cstar;
+	register_t	pcb_sfmask;
+	register_t	pcb_xsmask;
+
+	/* fpu context for suspend/resume */
+	void		*pcb_fpususpend;
+
 	struct savefpu	*pcb_save;
 
-	uint64_t	pcb_pad[2];
+	uint64_t	pcb_pad[3];
 };
 
 #ifdef _KERNEL
@@ -130,7 +141,8 @@
 }
 
 void	makectx(struct trapframe *, struct pcb *);
-int	savectx(struct pcb *);
+int	savectx(struct pcb *) __returns_twice;
+void	resumectx(struct pcb *);
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pcpu.h
--- a/head/sys/amd64/include/pcpu.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pcpu.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/include/pcpu.h 230260 2012-01-17 07:21:23Z kib $
+ * $FreeBSD: head/sys/amd64/include/pcpu.h 238723 2012-07-23 19:16:31Z kib $
  */
 
 #ifndef _MACHINE_PCPU_H_
@@ -216,16 +216,36 @@
 #define	PCPU_PTR(member)	__PCPU_PTR(pc_ ## member)
 #define	PCPU_SET(member, val)	__PCPU_SET(pc_ ## member, val)
 
+#define	OFFSETOF_CURTHREAD	0
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnull-dereference"
+#endif
 static __inline __pure2 struct thread *
 __curthread(void)
 {
 	struct thread *td;
 
-	__asm("movq %%gs:0,%0" : "=r" (td));
+	__asm("movq %%gs:%1,%0" : "=r" (td)
+	    : "m" (*(char *)OFFSETOF_CURTHREAD));
 	return (td);
 }
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
 #define	curthread		(__curthread())
 
+#define	OFFSETOF_CURPCB		32
+static __inline __pure2 struct pcb *
+__curpcb(void)
+{
+	struct pcb *pcb;
+
+	__asm("movq %%gs:%1,%0" : "=r" (pcb) : "m" (*(char *)OFFSETOF_CURPCB));
+	return (pcb);
+}
+#define	curpcb		(__curpcb())
+
 #define	IS_BSP()	(PCPU_GET(cpuid) == 0)
 
 #else /* !lint || defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/pmap.h
--- a/head/sys/amd64/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/pmap.h 222813 2011-06-07 08:46:13Z attilio $
+ * $FreeBSD: head/sys/amd64/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -295,7 +295,7 @@
 	pmap_t			pc_pmap;
 	TAILQ_ENTRY(pv_chunk)	pc_list;
 	uint64_t		pc_map[_NPCM];	/* bitmap; 1 = free */
-	uint64_t		pc_spare[2];
+	TAILQ_ENTRY(pv_chunk)	pc_lru;
 	struct pv_entry		pc_pventry[_NPCPV];
 };
 
@@ -309,6 +309,7 @@
 extern vm_offset_t virtual_end;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
 void	pmap_bootstrap(vm_paddr_t *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/smp.h
--- a/head/sys/amd64/include/smp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/smp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -6,7 +6,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: head/sys/amd64/include/smp.h 222853 2011-06-08 08:12:15Z avg $
+ * $FreeBSD: head/sys/amd64/include/smp.h 236938 2012-06-12 00:14:54Z iwasaki $
  *
  */
 
@@ -59,6 +59,7 @@
 void	cpustop_handler(void);
 void	cpususpend_handler(void);
 void	init_secondary(void);
+void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 void 	ipi_bitmap_handler(struct trapframe frame);
 void	ipi_cpu(int cpu, u_int ipi);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/amd64/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/amd64/include/vdso.h 237433 2012-06-22 07:06:40Z kib $ */
+
+#include <x86/vdso.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/include/vmparam.h
--- a/head/sys/amd64/include/vmparam.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/include/vmparam.h	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)vmparam.h	5.9 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/amd64/include/vmparam.h 221855 2011-05-13 19:35:01Z mdf $
+ * $FreeBSD: head/sys/amd64/include/vmparam.h 234743 2012-04-27 22:27:21Z rmh $
  */
 
 
@@ -54,7 +54,7 @@
  */
 #define	MAXTSIZ		(128UL*1024*1024)	/* max text size */
 #ifndef DFLDSIZ
-#define	DFLDSIZ		(128UL*1024*1024)	/* initial data size limit */
+#define	DFLDSIZ		(32768UL*1024*1024)	/* initial data size limit */
 #endif
 #ifndef MAXDSIZ
 #define	MAXDSIZ		(32768UL*1024*1024)	/* max data size */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux.h
--- a/head/sys/amd64/linux32/linux.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/linux32/linux.h 230132 2012-01-15 13:23:18Z uqs $
+ * $FreeBSD: head/sys/amd64/linux32/linux.h 235063 2012-05-05 19:42:38Z netchild $
  */
 
 #ifndef _AMD64_LINUX_H_
@@ -42,6 +42,7 @@
 #define	ldebug(name)	isclr(linux_debug_map, LINUX_SYS_linux_ ## name)
 #define	ARGS(nm, fmt)	"linux(%ld): "#nm"("fmt")\n", (long)td->td_proc->p_pid
 #define	LMSG(fmt)	"linux(%ld): "fmt"\n", (long)td->td_proc->p_pid
+#define	LINUX_DTRACE	linuxulator32
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_LINUX);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_dummy.c
--- a/head/sys/amd64/linux32/linux32_dummy.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_dummy.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,16 +27,25 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_dummy.c 234352 2012-04-16 21:22:02Z jkim $");
+__FBSDID("$FreeBSD: head/sys/amd64/linux32/linux32_dummy.c 235063 2012-05-05 19:42:38Z netchild $");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
+#include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
+/* DTrace init */
+LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
+
 DUMMY(stime);
 DUMMY(olduname);
 DUMMY(syslog);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_proto.h
--- a/head/sys/amd64/linux32/linux32_proto.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_proto.h	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call prototypes.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_proto.h 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_proto.h 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #ifndef _LINUX_SYSPROTO_H_
@@ -60,8 +60,8 @@
 };
 struct linux_execve_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
-	char argp_l_[PADL_(u_int32_t *)]; u_int32_t * argp; char argp_r_[PADR_(u_int32_t *)];
-	char envp_l_[PADL_(u_int32_t *)]; u_int32_t * envp; char envp_r_[PADR_(u_int32_t *)];
+	char argp_l_[PADL_(uint32_t *)]; uint32_t * argp; char argp_r_[PADR_(uint32_t *)];
+	char envp_l_[PADL_(uint32_t *)]; uint32_t * envp; char envp_r_[PADR_(uint32_t *)];
 };
 struct linux_chdir_args {
 	char path_l_[PADL_(char *)]; char * path; char path_r_[PADR_(char *)];
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_syscall.h
--- a/head/sys/amd64/linux32/linux32_syscall.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_syscall.h	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call numbers.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_syscall.h 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_syscall.h 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #define	LINUX_SYS_exit	1
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_syscalls.c
--- a/head/sys/amd64/linux32/linux32_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_syscalls.c 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_syscalls.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 const char *linux_syscallnames[] = {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_sysent.c
--- a/head/sys/amd64/linux32/linux32_sysent.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_sysent.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_sysent.c 234360 2012-04-16 23:17:29Z jkim $
- * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim 
+ * $FreeBSD: head/sys/amd64/linux32/linux32_sysent.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #include "opt_compat.h"
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/linux32_systrace_args.c
--- a/head/sys/amd64/linux32/linux32_systrace_args.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/linux32_systrace_args.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,7 +2,7 @@
  * System call argument to DTrace register array converstion.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/amd64/linux32/linux32_systrace_args.c 234360 2012-04-16 23:17:29Z jkim $
+ * $FreeBSD: head/sys/amd64/linux32/linux32_systrace_args.c 236027 2012-05-25 21:52:57Z ed $
  * This file is part of the DTrace syscall provider.
  */
 
@@ -94,8 +94,8 @@
 	case 11: {
 		struct linux_execve_args *p = params;
 		uarg[0] = (intptr_t) p->path; /* char * */
-		uarg[1] = (intptr_t) p->argp; /* u_int32_t * */
-		uarg[2] = (intptr_t) p->envp; /* u_int32_t * */
+		uarg[1] = (intptr_t) p->argp; /* uint32_t * */
+		uarg[2] = (intptr_t) p->envp; /* uint32_t * */
 		*n_args = 3;
 		break;
 	}
@@ -2401,10 +2401,10 @@
 			p = "char *";
 			break;
 		case 1:
-			p = "u_int32_t *";
+			p = "uint32_t *";
 			break;
 		case 2:
-			p = "u_int32_t *";
+			p = "uint32_t *";
 			break;
 		default:
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/amd64/linux32/syscalls.master
--- a/head/sys/amd64/linux32/syscalls.master	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/amd64/linux32/syscalls.master	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
- $FreeBSD: head/sys/amd64/linux32/syscalls.master 234359 2012-04-16 23:16:18Z jkim $
+ $FreeBSD: head/sys/amd64/linux32/syscalls.master 236026 2012-05-25 21:50:48Z ed $
 
 ;	@(#)syscalls.master	8.1 (Berkeley) 7/19/93
 ; System call name/number master file (or rather, slave, from LINUX).
@@ -54,8 +54,8 @@
 				    l_int mode); }
 9	AUE_LINK	STD	{ int linux_link(char *path, char *to); }
 10	AUE_UNLINK	STD	{ int linux_unlink(char *path); }
-11	AUE_EXECVE	STD	{ int linux_execve(char *path, u_int32_t *argp, \
-				    u_int32_t *envp); }
+11	AUE_EXECVE	STD	{ int linux_execve(char *path, uint32_t *argp, \
+				    uint32_t *envp); }
 12	AUE_CHDIR	STD	{ int linux_chdir(char *path); }
 13	AUE_NULL	STD	{ int linux_time(l_time_t *tm); }
 14	AUE_MKNOD	STD	{ int linux_mknod(char *path, l_int mode, \
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/cd9660/cd9660_node.c
--- a/head/sys/fs/cd9660/cd9660_node.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/cd9660/cd9660_node.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_node.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -65,7 +65,6 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
-	struct thread *td = ap->a_td;
 	struct iso_node *ip = VTOI(vp);
 	int error = 0;
 
@@ -74,7 +73,7 @@
 	 * so that it can be reused immediately.
 	 */
 	if (ip->inode.iso_mode == 0)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return error;
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/cd9660/cd9660_vfsops.c
--- a/head/sys/fs/cd9660/cd9660_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/cd9660/cd9660_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_vfsops.c 232485 2012-03-04 09:48:58Z kevlo $");
+__FBSDID("$FreeBSD: head/sys/fs/cd9660/cd9660_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -133,7 +133,7 @@
 	int error;
 	accmode_t accmode;
 	struct nameidata ndp;
-	struct iso_mnt *imp = 0;
+	struct iso_mnt *imp = NULL;
 
 	td = curthread;
 
@@ -214,7 +214,7 @@
 	int iso_bsize;
 	int iso_blknum;
 	int joliet_level;
-	struct iso_volume_descriptor *vdp = 0;
+	struct iso_volume_descriptor *vdp = NULL;
 	struct iso_primary_descriptor *pri = NULL;
 	struct iso_sierra_primary_descriptor *pri_sierra = NULL;
 	struct iso_supplementary_descriptor *sup = NULL;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/devfs/devfs_vnops.c
--- a/head/sys/fs/devfs/devfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/devfs/devfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
- * $FreeBSD: head/sys/fs/devfs/devfs_vnops.c 231949 2012-02-21 01:05:12Z kib $
+ * $FreeBSD: head/sys/fs/devfs/devfs_vnops.c 238029 2012-07-02 21:01:03Z kib $
  */
 
 /*
@@ -1170,18 +1170,14 @@
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
-
+	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		vfs_timestamp(&dev->si_atime);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
-	fp->f_nextoff = uio->uio_offset;
+	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
@@ -1648,8 +1644,7 @@
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
+	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	resid = uio->uio_resid;
 
@@ -1661,9 +1656,7 @@
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
-	fp->f_nextoff = uio->uio_offset;
+	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_inode.c
--- a/head/sys/fs/ext2fs/ext2_inode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_inode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_inode.c	8.5 (Berkeley) 12/30/93
- * $FreeBSD: head/sys/fs/ext2fs/ext2_inode.c 228583 2011-12-16 15:47:43Z pfg $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_inode.c 234607 2012-04-23 14:10:34Z trasz $
  */
 
 #include <sys/param.h>
@@ -249,7 +249,7 @@
 	bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof(newblks));
 	bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof(oldblks));
 	oip->i_size = osize;
-	error = vtruncbuf(ovp, cred, td, length, (int)fs->e2fs_bsize);
+	error = vtruncbuf(ovp, cred, length, (int)fs->e2fs_bsize);
 	if (error && (allerror == 0))
 		allerror = error;
 	vnode_pager_setsize(ovp, length);
@@ -498,7 +498,7 @@
 	 * so that it can be reused immediately.
 	 */
 	if (ip->i_mode == 0)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_lookup.c
--- a/head/sys/fs/ext2fs/ext2_lookup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_lookup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ufs_lookup.c	8.6 (Berkeley) 4/1/94
- * $FreeBSD: head/sys/fs/ext2fs/ext2_lookup.c 231949 2012-02-21 01:05:12Z kib $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_lookup.c 235508 2012-05-16 15:53:38Z pfg $
  */
 
 #include <sys/param.h>
@@ -115,6 +115,8 @@
 
 static int	ext2_dirbadentry(struct vnode *dp, struct ext2fs_direct_2 *de,
 		    int entryoffsetinblock);
+static int	ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp,
+		    struct componentname *cnp, ino_t *dd_ino);
 
 /*
  * Vnode op for reading directories.
@@ -285,7 +287,14 @@
 		struct componentname *a_cnp;
 	} */ *ap;
 {
-	struct vnode *vdp;		/* vnode for directory being searched */
+
+	return (ext2_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
+}
+
+static int
+ext2_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp,
+    ino_t *dd_ino)
+{
 	struct inode *dp;		/* inode for directory being searched */
 	struct buf *bp;			/* a buffer of directory entries */
 	struct ext2fs_direct_2 *ep;	/* the current directory entry */
@@ -305,22 +314,22 @@
 	doff_t enduseful;		/* pointer past last used dir slot */
 	u_long bmask;			/* block offset mask */
 	int namlen, error;
-	struct vnode **vpp = ap->a_vpp;
-	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
-	ino_t ino;
+	ino_t ino, ino1;
 	int ltype;
 
-	int	DIRBLKSIZ = VTOI(ap->a_dvp)->i_e2fs->e2fs_bsize;
+	int	DIRBLKSIZ = VTOI(vdp)->i_e2fs->e2fs_bsize;
 
+	if (vpp != NULL)
+		*vpp = NULL;
+
+	dp = VTOI(vdp);
+	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+restart:
 	bp = NULL;
 	slotoffset = -1;
-	*vpp = NULL;
-	vdp = ap->a_dvp;
-	dp = VTOI(vdp);
-	bmask = VFSTOEXT2(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
@@ -536,10 +545,12 @@
 	 * Insert name into cache (as non-existent) if appropriate.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
-		cache_enter(vdp, *vpp, cnp);
+		cache_enter(vdp, NULL, cnp);
 	return (ENOENT);
 
 found:
+	if (dd_ino != NULL)
+		*dd_ino = ino;
 	if (numdirpasses == 2)
 		nchstats.ncs_pass2++;
 	/*
@@ -582,6 +593,8 @@
 			dp->i_count = 0;
 		else
 			dp->i_count = dp->i_offset - prevoff;
+		if (dd_ino != NULL)
+			return (0);
 		if (dp->i_number == ino) {
 			VREF(vdp);
 			*vpp = vdp;
@@ -622,6 +635,8 @@
 		 */
 		if (dp->i_number == ino)
 			return (EISDIR);
+		if (dd_ino != NULL)
+			return (0);
 		if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE,
 		    &tdp)) != 0)
 			return (error);
@@ -629,6 +644,8 @@
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
+	if (dd_ino != NULL)
+		return (0);
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
@@ -655,8 +672,27 @@
 		VOP_UNLOCK(pdp, 0);	/* race to get the inode */
 		error = VFS_VGET(vdp->v_mount, ino, cnp->cn_lkflags, &tdp);
 		vn_lock(pdp, ltype | LK_RETRY);
-		if (error != 0)
+		if (pdp->v_iflag & VI_DOOMED) {
+			if (error == 0)
+				vput(tdp);
+			error = ENOENT;
+		}
+		if (error)
 			return (error);
+		/*
+		 * Recheck that ".." entry in the vdp directory points
+		 * to the inode we looked up before vdp lock was
+		 * dropped.
+		 */
+		error = ext2_lookup_ino(pdp, NULL, cnp, &ino1);
+		if (error) {
+			vput(tdp);
+			return (error);
+		}
+		if (ino1 != ino) {
+			vput(tdp);
+			goto restart;
+		}
 		*vpp = tdp;
 	} else if (dp->i_number == ino) {
 		VREF(vdp);	/* we want ourself, ie "." */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_vfsops.c
--- a/head/sys/fs/ext2fs/ext2_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ffs_vfsops.c	8.8 (Berkeley) 4/18/94
- * $FreeBSD: head/sys/fs/ext2fs/ext2_vfsops.c 234386 2012-04-17 16:28:22Z mckusick $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 #include <sys/param.h>
@@ -112,7 +112,7 @@
 	struct vfsoptlist *opts;
 	struct vnode *devvp;
 	struct thread *td;
-	struct ext2mount *ump = 0;
+	struct ext2mount *ump = NULL;
 	struct m_ext2fs *fs;
 	struct nameidata nd, *ndp = &nd;
 	accmode_t accmode;
@@ -767,7 +767,7 @@
 	ump = VFSTOEXT2(mp);
 	fs = ump->um_e2fs;
 	if (fs->e2fs->e2fs_magic != E2FS_MAGIC)
-		panic("ext2fs_statvfs");
+		panic("ext2fs_statfs");
 
 	/*
 	 * Compute the overhead (FS structures)
@@ -830,7 +830,6 @@
 	/*
 	 * Write back each (modified) inode.
 	 */
-	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_type == VNON) {
@@ -847,7 +846,6 @@
 		}
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
 		if (error) {
-			MNT_ILOCK(mp);
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ext2fs/ext2_vnops.c
--- a/head/sys/fs/ext2fs/ext2_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ext2fs/ext2_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *
  *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
  *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
- * $FreeBSD: head/sys/fs/ext2fs/ext2_vnops.c 234203 2012-04-13 05:48:31Z jh $
+ * $FreeBSD: head/sys/fs/ext2fs/ext2_vnops.c 235508 2012-05-16 15:53:38Z pfg $
  */
 
 #include "opt_suiddir.h"
@@ -1336,7 +1336,11 @@
 	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
 	    cnp->cn_thread);
 	cache_purge(ITOV(ip));
-	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+		VOP_UNLOCK(vp, 0);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	}
 out:
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/hpfs/hpfs_vnops.c
--- a/head/sys/fs/hpfs/hpfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/hpfs/hpfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/fs/hpfs/hpfs_vnops.c 235984 2012-05-25 09:16:59Z gleb $
  */
 
 #include <sys/param.h>
@@ -528,7 +528,7 @@
 		}
 
 		if (vap->va_size < hp->h_fn.fn_size) {
-			error = vtruncbuf(vp, cred, td, vap->va_size, DEV_BSIZE);
+			error = vtruncbuf(vp, cred, vap->va_size, DEV_BSIZE);
 			if (error)
 				return (error);
 			error = hpfs_truncate(hp, vap->va_size);
@@ -576,7 +576,7 @@
 	}
 
 	if (hp->h_flag & H_INVAL) {
-		vrecycle(vp, ap->a_td);
+		vrecycle(vp);
 		return (0);
 	}
 
@@ -797,10 +797,21 @@
 }
 
 
-static struct dirent hpfs_de_dot =
-	{ 0, sizeof(struct dirent), DT_DIR, 1, "." };
-static struct dirent hpfs_de_dotdot =
-	{ 0, sizeof(struct dirent), DT_DIR, 2, ".." };
+static struct dirent hpfs_de_dot = {
+	.d_fileno = 0,
+	.d_reclen = sizeof(struct dirent),
+	.d_type = DT_DIR,
+	.d_namlen = 1,
+	.d_name = "."
+};
+static struct dirent hpfs_de_dotdot = {
+	.d_fileno = 0,
+	.d_reclen = sizeof(struct dirent),
+	.d_type = DT_DIR,
+	.d_namlen = 2,
+	.d_name = ".."
+};
+
 int
 hpfs_readdir(ap)
 	struct vop_readdir_args /* {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/denode.h
--- a/head/sys/fs/msdosfs/denode.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/denode.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD$ */
+/* $FreeBSD: head/sys/fs/msdosfs/denode.h 234605 2012-04-23 13:21:28Z trasz $ */
 /*	$NetBSD: denode.h,v 1.25 1997/11/17 15:36:28 ws Exp $	*/
 
 /*-
@@ -276,6 +276,6 @@
 int createde(struct denode *dep, struct denode *ddep, struct denode **depp, struct componentname *cnp);
 int deupdat(struct denode *dep, int waitfor);
 int removede(struct denode *pdep, struct denode *dep);
-int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred, struct thread *td);
+int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred);
 int doscheckpath( struct denode *source, struct denode *target);
 #endif	/* _KERNEL */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_denode.c
--- a/head/sys/fs/msdosfs/msdosfs_denode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_denode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_denode.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_denode.c 234607 2012-04-23 14:10:34Z trasz $ */
 /*	$NetBSD: msdosfs_denode.c,v 1.28 1998/02/10 14:10:00 mrg Exp $	*/
 
 /*-
@@ -326,12 +326,11 @@
  * Truncate the file described by dep to the length specified by length.
  */
 int
-detrunc(dep, length, flags, cred, td)
+detrunc(dep, length, flags, cred)
 	struct denode *dep;
 	u_long length;
 	int flags;
 	struct ucred *cred;
-	struct thread *td;
 {
 	int error;
 	int allerror;
@@ -426,7 +425,7 @@
 	dep->de_FileSize = length;
 	if (!isadir)
 		dep->de_flag |= DE_UPDATE | DE_MODIFIED;
-	allerror = vtruncbuf(DETOV(dep), cred, td, length, pmp->pm_bpcluster);
+	allerror = vtruncbuf(DETOV(dep), cred, length, pmp->pm_bpcluster);
 #ifdef MSDOSFS_DEBUG
 	if (allerror)
 		printf("detrunc(): vtruncbuf error %d\n", allerror);
@@ -504,7 +503,7 @@
 		error = extendfile(dep, count, NULL, NULL, DE_CLEAR);
 		if (error) {
 			/* truncate the added clusters away again */
-			(void) detrunc(dep, dep->de_FileSize, 0, cred, NULL);
+			(void) detrunc(dep, dep->de_FileSize, 0, cred);
 			return (error);
 		}
 	}
@@ -584,7 +583,6 @@
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
-	struct thread *td = ap->a_td;
 	int error = 0;
 
 #ifdef MSDOSFS_DEBUG
@@ -607,7 +605,7 @@
 	       dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY);
 #endif
 	if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
-		error = detrunc(dep, (u_long) 0, 0, NOCRED, td);
+		error = detrunc(dep, (u_long) 0, 0, NOCRED);
 		dep->de_flag |= DE_UPDATE;
 		dep->de_Name[0] = SLOT_DELETED;
 	}
@@ -623,6 +621,6 @@
 	       vrefcnt(vp), dep->de_Name[0]);
 #endif
 	if (dep->de_Name[0] == SLOT_DELETED || dep->de_Name[0] == SLOT_EMPTY)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_lookup.c
--- a/head/sys/fs/msdosfs/msdosfs_lookup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_lookup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_lookup.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_lookup.c 238697 2012-07-22 15:40:31Z kevlo $ */
 /*	$NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $	*/
 
 /*-
@@ -108,7 +108,7 @@
 	struct denode *dp;
 	struct denode *tdp;
 	struct msdosfsmount *pmp;
-	struct buf *bp = 0;
+	struct buf *bp = NULL;
 	struct direntry *dep = NULL;
 	u_char dosfilename[12];
 	int flags = cnp->cn_flags;
@@ -649,7 +649,7 @@
 		dirclust = de_clcount(pmp, diroffset);
 		error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR);
 		if (error) {
-			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED, NULL);
+			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED);
 			return error;
 		}
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/msdosfs/msdosfs_vnops.c
--- a/head/sys/fs/msdosfs/msdosfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/msdosfs/msdosfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_vnops.c 231998 2012-02-22 13:01:17Z kib $ */
+/* $FreeBSD: head/sys/fs/msdosfs/msdosfs_vnops.c 234605 2012-04-23 13:21:28Z trasz $ */
 /*	$NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $	*/
 
 /*-
@@ -476,7 +476,7 @@
 			 */
 			break;
 		}
-		error = detrunc(dep, vap->va_size, 0, cred, td);
+		error = detrunc(dep, vap->va_size, 0, cred);
 		if (error)
 			return error;
 	}
@@ -835,11 +835,11 @@
 errexit:
 	if (error) {
 		if (ioflag & IO_UNIT) {
-			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED, NULL);
+			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		} else {
-			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL);
+			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED);
 			if (uio->uio_resid != resid)
 				error = 0;
 		}
@@ -1429,7 +1429,6 @@
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *ip, *dp;
-	struct thread *td = cnp->cn_thread;
 	int error;
 
 	ip = VTODE(vp);
@@ -1467,7 +1466,7 @@
 	/*
 	 * Truncate the directory that is being deleted.
 	 */
-	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred, td);
+	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred);
 	cache_purge(vp);
 
 out:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/bmap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/bmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,621 @@
+/*-
+ * Copyright (c) 2012 Semihalf
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/bmap.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/ktr.h>
+#include <sys/kdb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+#include "bmap.h"
+
+static int bmap_getlbns(struct nandfs_node *, nandfs_lbn_t,
+    struct nandfs_indir *, int *);
+
+int
+bmap_lookup(struct nandfs_node *node, nandfs_lbn_t lblk, nandfs_daddr_t *vblk)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR + 1], *ap;
+	nandfs_daddr_t daddr;
+	struct buf *bp;
+	int error;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lblk %jx enter\n", __func__, node, lblk));
+	ip = &node->nn_inode;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	if (num == 0) {
+		*vblk = ip->i_db[lblk];
+		return (0);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx trying ip->i_ib[%x]\n", __func__,
+	    node, lblk, ap->in_off));
+	daddr = ip->i_ib[ap->in_off];
+	for (bp = NULL, ++ap; --num; ap++) {
+		if (daddr == 0) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with "
+			    "vblk 0\n", __func__, node, lblk));
+			*vblk = 0;
+			return (0);
+		}
+		if (ap->in_lbn == lblk) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx ap->in_lbn=%jx "
+			    "returning address of indirect block (%jx)\n",
+			    __func__, node, lblk, ap->in_lbn, daddr));
+			*vblk = daddr;
+			return (0);
+		}
+
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx reading block "
+		    "ap->in_lbn=%jx\n", __func__, node, lblk, ap->in_lbn));
+
+		error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off];
+		brelse(bp);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx returning with %jx\n", __func__,
+	    node, lblk, daddr));
+	*vblk = daddr;
+
+	return (0);
+}
+
+int
+bmap_dirty_meta(struct nandfs_node *node, nandfs_lbn_t lblk, int force)
+{
+	struct nandfs_indir a[NIADDR+1], *ap;
+#ifdef DEBUG
+	nandfs_daddr_t daddr;
+#endif
+	struct buf *bp;
+	int error;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx\n", __func__, node, lblk));
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	/*
+	 * Direct block, nothing to do
+	 */
+	if (num == 0)
+		return (0);
+
+	DPRINTF(BMAP, ("%s: node %p reading blocks\n", __func__, node));
+
+	for (bp = NULL, ++ap; --num; ap++) {
+		error = nandfs_bread_meta(node, ap->in_lbn, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+#ifdef DEBUG
+		daddr = ((nandfs_daddr_t *)bp->b_data)[ap->in_off];
+		MPASS(daddr != 0 || node->nn_ino == 3);
+#endif
+
+		error = nandfs_dirty_buf_meta(bp, force);
+		if (error)
+			return (error);
+	}
+
+	return (0);
+}
+
+int
+bmap_insert_block(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t vblk)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR+1], *ap;
+	struct buf *bp;
+	nandfs_daddr_t daddr;
+	int error;
+	int num, *nump, i;
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx\n", __func__, node, lblk,
+	    vblk));
+
+	ip = &node->nn_inode;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lblk, ap, nump);
+	if (error)
+		return (error);
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx got num=%d\n", __func__,
+	    node, lblk, vblk, num));
+
+	if (num == 0) {
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx direct block\n", __func__,
+		    node, lblk));
+		ip->i_db[lblk] = vblk;
+		return (0);
+	}
+
+	DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect block level %d\n",
+	    __func__, node, lblk, ap->in_off));
+
+	if (num == 1) {
+		DPRINTF(BMAP, ("%s: node %p lblk=%jx indirect block: inserting "
+		    "%jx as vblk for indirect block %d\n", __func__, node,
+		    lblk, vblk, ap->in_off));
+		ip->i_ib[ap->in_off] = vblk;
+		return (0);
+	}
+
+	bp = NULL;
+	daddr = ip->i_ib[a[0].in_off];
+	for (i = 1; i < num; i++) {
+		if (bp)
+			brelse(bp);
+		if (daddr == 0) {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx create "
+			    "block %jx %d\n", __func__, node, lblk, vblk,
+			    a[i].in_lbn, a[i].in_off));
+			error = nandfs_bcreate_meta(node, a[i].in_lbn, NOCRED,
+			    0, &bp);
+			if (error)
+				return (error);
+		} else {
+			DPRINTF(BMAP, ("%s: node %p lblk=%jx vblk=%jx read "
+			    "block %jx %d\n", __func__, node, daddr, vblk,
+			    a[i].in_lbn, a[i].in_off));
+			error = nandfs_bread_meta(node, a[i].in_lbn, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		}
+		daddr = ((nandfs_daddr_t *)bp->b_data)[a[i].in_off];
+	}
+	i--;
+
+	DPRINTF(BMAP,
+	    ("%s: bmap node %p lblk=%jx vblk=%jx inserting vblk level %d at "
+	    "offset %d at %jx\n", __func__, node, lblk, vblk, i, a[i].in_off,
+	    daddr));
+
+	if (!bp) {
+		nandfs_error("%s: cannot find indirect block\n", __func__);
+		return (-1);
+	}
+	((nandfs_daddr_t *)bp->b_data)[a[i].in_off] = vblk;
+
+	error = nandfs_dirty_buf_meta(bp, 0);
+	if (error) {
+		nandfs_warning("%s: dirty failed buf: %p\n", __func__, bp);
+		return (error);
+	}
+	DPRINTF(BMAP, ("%s: exiting node %p lblk=%jx vblk=%jx\n", __func__,
+	    node, lblk, vblk));
+
+	return (error);
+}
+
+CTASSERT(NIADDR <= 3);
+#define SINGLE	0	/* index of single indirect block */
+#define DOUBLE	1	/* index of double indirect block */
+#define TRIPLE	2	/* index of triple indirect block */
+
+static __inline nandfs_lbn_t
+lbn_offset(struct nandfs_device *fsdev, int level)
+{
+	nandfs_lbn_t res;
+
+	for (res = 1; level > 0; level--)
+		res *= MNINDIR(fsdev);
+	return (res);
+}
+
+static nandfs_lbn_t
+blocks_inside(struct nandfs_device *fsdev, int level, struct nandfs_indir *nip)
+{
+	nandfs_lbn_t blocks;
+
+	for (blocks = 1; level >= SINGLE; level--, nip++) {
+		MPASS(nip->in_off >= 0 && nip->in_off < MNINDIR(fsdev));
+		blocks += nip->in_off * lbn_offset(fsdev, level);
+	}
+
+	return (blocks);
+}
+
+static int
+bmap_truncate_indirect(struct nandfs_node *node, int level, nandfs_lbn_t *left,
+    int *cleaned, struct nandfs_indir *ap, struct nandfs_indir *fp,
+    nandfs_daddr_t *copy)
+{
+	struct buf *bp;
+	nandfs_lbn_t i, lbn, nlbn, factor, tosub;
+	struct nandfs_device *fsdev;
+	int error, lcleaned, modified;
+
+	DPRINTF(BMAP, ("%s: node %p level %d left %jx\n", __func__,
+	    node, level, *left));
+
+	fsdev = node->nn_nandfsdev;
+
+	MPASS(ap->in_off >= 0 && ap->in_off < MNINDIR(fsdev));
+
+	factor = lbn_offset(fsdev, level);
+	lbn = ap->in_lbn;
+
+	error = nandfs_bread_meta(node, lbn, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	bcopy(bp->b_data, copy, fsdev->nd_blocksize);
+	bqrelse(bp);
+
+	modified = 0;
+
+	i = ap->in_off;
+
+	if (ap != fp)
+		ap++;
+	for (nlbn = lbn + 1 - i * factor; i >= 0 && *left > 0; i--,
+	    nlbn += factor) {
+		lcleaned = 0;
+
+		DPRINTF(BMAP,
+		    ("%s: node %p i=%jx nlbn=%jx left=%jx ap=%p vblk %jx\n",
+		    __func__, node, i, nlbn, *left, ap, copy[i]));
+
+		if (copy[i] == 0) {
+			tosub = blocks_inside(fsdev, level - 1, ap);
+			if (tosub > *left)
+				tosub = 0;
+
+			*left -= tosub;
+		} else {
+			if (level > SINGLE) {
+				if (ap == fp)
+					ap->in_lbn = nlbn;
+
+				error = bmap_truncate_indirect(node, level - 1,
+				    left, &lcleaned, ap, fp,
+				    copy + MNINDIR(fsdev));
+				if (error)
+					return (error);
+			} else {
+				error = nandfs_bdestroy(node, copy[i]);
+				if (error)
+					return (error);
+				lcleaned = 1;
+				*left -= 1;
+			}
+		}
+
+		if (lcleaned) {
+			if (level > SINGLE) {
+				error = nandfs_vblock_end(fsdev, copy[i]);
+				if (error)
+					return (error);
+			}
+			copy[i] = 0;
+			modified++;
+		}
+
+		ap = fp;
+	}
+
+	if (i == -1)
+		*cleaned = 1;
+
+	error = nandfs_bread_meta(node, lbn, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	if (modified)
+		bcopy(copy, bp->b_data, fsdev->nd_blocksize);
+
+	error = nandfs_dirty_buf_meta(bp, 0);
+	if (error)
+		return (error);
+
+	return (error);
+}
+
+int
+bmap_truncate_mapping(struct nandfs_node *node, nandfs_lbn_t lastblk,
+    nandfs_lbn_t todo)
+{
+	struct nandfs_inode *ip;
+	struct nandfs_indir a[NIADDR + 1], f[NIADDR], *ap;
+	nandfs_daddr_t indir_lbn[NIADDR];
+	nandfs_daddr_t *copy;
+	int error, level;
+	nandfs_lbn_t left, tosub;
+	struct nandfs_device *fsdev;
+	int cleaned, i;
+	int num, *nump;
+
+	DPRINTF(BMAP, ("%s: node %p lastblk %jx truncating by %jx\n", __func__,
+	    node, lastblk, todo));
+
+	ip = &node->nn_inode;
+	fsdev = node->nn_nandfsdev;
+
+	ap = a;
+	nump = #
+
+	error = bmap_getlbns(node, lastblk, ap, nump);
+	if (error)
+		return (error);
+
+	indir_lbn[SINGLE] = -NDADDR;
+	indir_lbn[DOUBLE] = indir_lbn[SINGLE] - MNINDIR(fsdev) - 1;
+	indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - MNINDIR(fsdev)
+	    * MNINDIR(fsdev) - 1;
+
+	for (i = 0; i < NIADDR; i++) {
+		f[i].in_off = MNINDIR(fsdev) - 1;
+		f[i].in_lbn = 0xdeadbeef;
+	}
+
+	left = todo;
+
+#ifdef DEBUG
+	a[num].in_off = -1;
+#endif
+
+	ap++;
+	num -= 2;
+
+	if (num < 0)
+		goto direct;
+
+	copy = malloc(MNINDIR(fsdev) * sizeof(nandfs_daddr_t) * (num + 1),
+	    M_NANDFSTEMP, M_WAITOK);
+
+	for (level = num; level >= SINGLE && left > 0; level--) {
+		cleaned = 0;
+
+		if (ip->i_ib[level] == 0) {
+			tosub = blocks_inside(fsdev, level, ap);
+			if (tosub > left)
+				left = 0;
+			else
+				left -= tosub;
+		} else {
+			if (ap == f)
+				ap->in_lbn = indir_lbn[level];
+			error = bmap_truncate_indirect(node, level, &left,
+			    &cleaned, ap, f, copy);
+			if (error) {
+				nandfs_error("%s: error %d when truncate "
+				    "at level %d\n", __func__, error, level);
+				return (error);
+			}
+		}
+
+		if (cleaned) {
+			nandfs_vblock_end(fsdev, ip->i_ib[level]);
+			ip->i_ib[level] = 0;
+		}
+
+		ap = f;
+	}
+
+	free(copy, M_NANDFSTEMP);
+
+direct:
+	if (num < 0)
+		i = lastblk;
+	else
+		i = NDADDR - 1;
+
+	for (; i >= 0 && left > 0; i--) {
+		if (ip->i_db[i] != 0) {
+			error = nandfs_bdestroy(node, ip->i_db[i]);
+			if (error) {
+				nandfs_error("%s: cannot destroy "
+				    "block %jx, error %d\n", __func__,
+				    (uintmax_t)ip->i_db[i], error);
+				return (error);
+			}
+			ip->i_db[i] = 0;
+		}
+
+		left--;
+	}
+
+	KASSERT(left == 0,
+	    ("truncated wrong number of blocks (%jd should be 0)", left));
+
+	return (error);
+}
+
+nandfs_lbn_t
+get_maxfilesize(struct nandfs_device *fsdev)
+{
+	struct nandfs_indir f[NIADDR];
+	nandfs_lbn_t max;
+	int i;
+
+	max = NDADDR;
+
+	for (i = 0; i < NIADDR; i++) {
+		f[i].in_off = MNINDIR(fsdev) - 1;
+		max += blocks_inside(fsdev, i, f);
+	}
+
+	max *= fsdev->nd_blocksize;
+
+	return (max);
+}
+
+/*
+ * This is ufs_getlbns with minor modifications.
+ */
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block.  The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ib and
+ * once with the offset into the page itself.
+ */
+static int
+bmap_getlbns(struct nandfs_node *node, nandfs_lbn_t bn, struct nandfs_indir *ap, int *nump)
+{
+	nandfs_daddr_t blockcnt;
+	nandfs_lbn_t metalbn, realbn;
+	struct nandfs_device *fsdev;
+	int i, numlevels, off;
+
+	fsdev = node->nn_nandfsdev;
+
+	DPRINTF(BMAP, ("%s: node %p bn=%jx mnindir=%zd enter\n", __func__,
+	    node, bn, MNINDIR(fsdev)));
+
+	*nump = 0;
+	numlevels = 0;
+	realbn = bn;
+
+	if (bn < 0)
+		bn = -bn;
+
+	/* The first NDADDR blocks are direct blocks. */
+	if (bn < NDADDR)
+		return (0);
+
+	/*
+	 * Determine the number of levels of indirection.  After this loop
+	 * is done, blockcnt indicates the number of data blocks possible
+	 * at the previous level of indirection, and NIADDR - i is the number
+	 * of levels of indirection needed to locate the requested block.
+	 */
+	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
+		DPRINTF(BMAP, ("%s: blockcnt=%jd i=%d bn=%jd\n", __func__,
+		    blockcnt, i, bn));
+		if (i == 0)
+			return (EFBIG);
+		blockcnt *= MNINDIR(fsdev);
+		if (bn < blockcnt)
+			break;
+	}
+
+	/* Calculate the address of the first meta-block. */
+	if (realbn >= 0)
+		metalbn = -(realbn - bn + NIADDR - i);
+	else
+		metalbn = -(-realbn - bn + NIADDR - i);
+
+	/*
+	 * At each iteration, off is the offset into the bap array which is
+	 * an array of disk addresses at the current level of indirection.
+	 * The logical block number and the offset in that block are stored
+	 * into the argument array.
+	 */
+	ap->in_lbn = metalbn;
+	ap->in_off = off = NIADDR - i;
+
+	DPRINTF(BMAP, ("%s: initial: ap->in_lbn=%jx ap->in_off=%d\n", __func__,
+	    metalbn, off));
+
+	ap++;
+	for (++numlevels; i <= NIADDR; i++) {
+		/* If searching for a meta-data block, quit when found. */
+		if (metalbn == realbn)
+			break;
+
+		blockcnt /= MNINDIR(fsdev);
+		off = (bn / blockcnt) % MNINDIR(fsdev);
+
+		++numlevels;
+		ap->in_lbn = metalbn;
+		ap->in_off = off;
+
+		DPRINTF(BMAP, ("%s: in_lbn=%jx in_off=%d\n", __func__,
+		    ap->in_lbn, ap->in_off));
+		++ap;
+
+		metalbn -= -1 + off * blockcnt;
+	}
+	if (nump)
+		*nump = numlevels;
+
+	DPRINTF(BMAP, ("%s: numlevels=%d\n", __func__, numlevels));
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/bmap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/bmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2012 Semihalf
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/fs/nandfs/bmap.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _BMAP_H
+#define _BMAP_H
+
+#include "nandfs_fs.h"
+
+int bmap_lookup(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t *);
+int bmap_insert_block(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t);
+int bmap_truncate_mapping(struct nandfs_node *, nandfs_lbn_t, nandfs_lbn_t);
+int bmap_dirty_meta(struct nandfs_node *, nandfs_lbn_t, int);
+
+nandfs_lbn_t get_maxfilesize(struct nandfs_device *);
+
+#endif /* _BMAP_H */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,310 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_H_
+#define _FS_NANDFS_NANDFS_H_
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <sys/queue.h>
+#include <sys/uio.h>
+#include <sys/mutex.h>
+
+#include <sys/disk.h>
+#include <sys/kthread.h>
+#include "nandfs_fs.h"
+
+MALLOC_DECLARE(M_NANDFSTEMP);
+
+/* Debug categories */
+#define	NANDFS_DEBUG_VOLUMES		0x000001
+#define	NANDFS_DEBUG_BLOCK		0x000004
+#define	NANDFS_DEBUG_LOCKING		0x000008
+#define	NANDFS_DEBUG_NODE		0x000010
+#define	NANDFS_DEBUG_LOOKUP		0x000020
+#define	NANDFS_DEBUG_READDIR		0x000040
+#define	NANDFS_DEBUG_TRANSLATE		0x000080
+#define	NANDFS_DEBUG_STRATEGY		0x000100
+#define	NANDFS_DEBUG_READ		0x000200
+#define	NANDFS_DEBUG_WRITE		0x000400
+#define	NANDFS_DEBUG_IFILE		0x000800
+#define	NANDFS_DEBUG_ATTR		0x001000
+#define	NANDFS_DEBUG_EXTATTR		0x002000
+#define	NANDFS_DEBUG_ALLOC		0x004000
+#define	NANDFS_DEBUG_CPFILE		0x008000
+#define	NANDFS_DEBUG_DIRHASH		0x010000
+#define	NANDFS_DEBUG_NOTIMPL		0x020000
+#define	NANDFS_DEBUG_SHEDULE		0x040000
+#define	NANDFS_DEBUG_SEG		0x080000
+#define	NANDFS_DEBUG_SYNC		0x100000
+#define	NANDFS_DEBUG_PARANOIA		0x200000
+#define	NANDFS_DEBUG_VNCALL		0x400000
+#define	NANDFS_DEBUG_BUF		0x1000000
+#define	NANDFS_DEBUG_BMAP		0x2000000
+#define	NANDFS_DEBUG_DAT		0x4000000
+#define	NANDFS_DEBUG_GENERIC		0x8000000
+#define	NANDFS_DEBUG_CLEAN		0x10000000
+
+extern int nandfs_verbose;
+
+#define	DPRINTF(name, arg) { \
+		if (nandfs_verbose & NANDFS_DEBUG_##name) {\
+			printf arg;\
+		};\
+	}
+#define	DPRINTFIF(name, cond, arg) { \
+		if (nandfs_verbose & NANDFS_DEBUG_##name) { \
+			if (cond) printf arg;\
+		};\
+	}
+
+#define	VFSTONANDFS(mp)    ((struct nandfsmount *)((mp)->mnt_data))
+#define	VTON(vp) ((struct nandfs_node *)(vp)->v_data)
+#define	NTOV(xp) ((xp)->nn_vnode)
+
+int nandfs_init(struct vfsconf *);
+int nandfs_uninit(struct vfsconf *);
+
+extern struct vop_vector nandfs_vnodeops;
+extern struct vop_vector nandfs_system_vnodeops;
+
+struct nandfs_node;
+
+/* Structure and derivatives */
+struct nandfs_mdt {
+	uint32_t	entries_per_block;
+	uint32_t	entries_per_group;
+	uint32_t	blocks_per_group;
+	uint32_t	groups_per_desc_block;	/* desc is super group */
+	uint32_t	blocks_per_desc_block;	/* desc is super group */
+};
+
+struct nandfs_segment {
+	LIST_ENTRY(nandfs_segment) seg_link;
+
+	struct nandfs_device	*fsdev;
+
+	TAILQ_HEAD(, buf)	 segsum;
+	TAILQ_HEAD(, buf)	 data;
+
+	uint64_t		 seg_num;
+	uint64_t		 seg_next;
+	uint64_t		 start_block;
+	uint32_t		 num_blocks;
+
+	uint32_t		 nblocks;
+	uint32_t		 nbinfos;
+	uint32_t		 segsum_blocks;
+	uint32_t		 segsum_bytes;
+	uint32_t		 bytes_left;
+	char			*current_off;
+};
+
+struct nandfs_seginfo {
+	LIST_HEAD( ,nandfs_segment)	seg_list;
+	struct nandfs_segment		*curseg;
+	struct nandfs_device		*fsdev;
+	uint32_t			blocks;
+	uint8_t				reiterate;
+};
+
+#define	NANDFS_FSSTOR_FAILED	1
+struct nandfs_fsarea {
+	int	offset;
+	int	flags;
+	int	last_used;
+};
+
+extern int nandfs_cleaner_enable;
+extern int nandfs_cleaner_interval;
+extern int nandfs_cleaner_segments;
+
+struct nandfs_device {
+	struct vnode		*nd_devvp;
+	struct g_consumer	*nd_gconsumer;
+
+	struct thread		*nd_syncer;
+	struct thread		*nd_cleaner;
+	int			nd_syncer_exit;
+	int			nd_cleaner_exit;
+
+	int			nd_is_nand;
+
+	struct nandfs_fsarea	nd_fsarea[NANDFS_NFSAREAS];
+	int			nd_last_fsarea;
+
+	STAILQ_HEAD(nandfs_mnts, nandfsmount)	nd_mounts;
+	SLIST_ENTRY(nandfs_device)		nd_next_device;
+
+	/* FS structures */
+	struct nandfs_fsdata		nd_fsdata;
+	struct nandfs_super_block	nd_super;
+	struct nandfs_segment_summary	nd_last_segsum;
+	struct nandfs_super_root	nd_super_root;
+	struct nandfs_node	*nd_dat_node;
+	struct nandfs_node	*nd_cp_node;
+	struct nandfs_node	*nd_su_node;
+	struct nandfs_node	*nd_gc_node;
+
+	struct nandfs_mdt	nd_dat_mdt;
+	struct nandfs_mdt	nd_ifile_mdt;
+
+	struct timespec		nd_ts;
+
+	/* Synchronization */
+	struct mtx		nd_mutex;
+	struct mtx		nd_sync_mtx;
+	struct cv		nd_sync_cv;
+	struct mtx		nd_clean_mtx;
+	struct cv		nd_clean_cv;
+	struct lock		nd_seg_const;
+
+	struct nandfs_seginfo	*nd_seginfo;
+
+	/* FS geometry */
+	uint64_t		nd_devsize;
+	uint64_t		nd_maxfilesize;
+	uint32_t		nd_blocksize;
+	uint32_t		nd_erasesize;
+
+	uint32_t		nd_devblocksize;
+
+	/* Segment usage */
+	uint64_t		nd_clean_segs;
+	uint64_t		*nd_free_base;
+	uint64_t		nd_free_count;
+	uint64_t		nd_dirty_bufs;
+
+	/* Running values */
+	uint64_t		nd_seg_sequence;
+	uint64_t		nd_seg_num;
+	uint64_t		nd_next_seg_num;
+	uint64_t		nd_last_pseg;
+	uint64_t		nd_last_cno;
+	uint64_t		nd_last_ino;
+	uint64_t		nd_fakevblk;
+
+	int			nd_mount_state;
+	int			nd_refcnt;
+	int			nd_syncing;
+	int			nd_cleaning;
+};
+
+extern SLIST_HEAD(_nandfs_devices, nandfs_device) nandfs_devices;
+
+#define	NANDFS_FORCE_SYNCER	0x1
+#define	NANDFS_UMOUNT		0x2
+
+#define	SYNCER_UMOUNT		0x0
+#define	SYNCER_VFS_SYNC		0x1
+#define	SYNCER_BDFLUSH		0x2
+#define	SYNCER_FFORCE		0x3
+#define	SYNCER_FSYNC		0x4
+#define	SYNCER_ROUPD		0x5
+
+static __inline int
+nandfs_writelockflags(struct nandfs_device *fsdev, int flags)
+{
+	int error = 0;
+
+	if (lockstatus(&fsdev->nd_seg_const) != LK_EXCLUSIVE)
+		error = lockmgr(&fsdev->nd_seg_const, flags | LK_SHARED, NULL);
+
+	return (error);
+}
+
+static __inline void
+nandfs_writeunlock(struct nandfs_device *fsdev)
+{
+
+	if (lockstatus(&fsdev->nd_seg_const) != LK_EXCLUSIVE)
+		lockmgr(&(fsdev)->nd_seg_const, LK_RELEASE, NULL);
+}
+
+#define NANDFS_WRITELOCKFLAGS(fsdev, flags)	nandfs_writelockflags(fsdev, flags)
+
+#define NANDFS_WRITELOCK(fsdev) NANDFS_WRITELOCKFLAGS(fsdev, 0)
+
+#define NANDFS_WRITEUNLOCK(fsdev) nandfs_writeunlock(fsdev)
+
+#define NANDFS_WRITEASSERT(fsdev) lockmgr_assert(&(fsdev)->nd_seg_const, KA_LOCKED)
+
+/* Specific mountpoint; head or a checkpoint/snapshot */
+struct nandfsmount {
+	STAILQ_ENTRY(nandfsmount) nm_next_mount;
+
+	struct mount		*nm_vfs_mountp;
+	struct nandfs_device	*nm_nandfsdev;
+	struct nandfs_args	nm_mount_args;
+	struct nandfs_node	*nm_ifile_node;
+
+	uint8_t			nm_flags;
+	int8_t			nm_ronly;
+};
+
+struct nandfs_node {
+	struct vnode			*nn_vnode;
+	struct nandfsmount		*nn_nmp;
+	struct nandfs_device		*nn_nandfsdev;
+	struct lockf			*nn_lockf;
+
+	uint64_t			nn_ino;
+	struct nandfs_inode		nn_inode;
+
+	uint64_t			nn_diroff;
+	uint32_t			nn_flags;
+};
+
+#define	IN_ACCESS	0x0001	/* Inode access time update request  */
+#define	IN_CHANGE	0x0002	/* Inode change time update request  */
+#define	IN_UPDATE	0x0004	/* Inode was written to; update mtime*/
+#define	IN_MODIFIED	0x0008	/* node has been modified */
+#define	IN_RENAME	0x0010	/* node is being renamed. */
+
+/* File permissions. */
+#define	IEXEC		0000100	/* Executable. */
+#define	IWRITE		0000200	/* Writeable. */
+#define	IREAD		0000400	/* Readable. */
+#define	ISVTX		0001000	/* Sticky bit. */
+#define	ISGID		0002000	/* Set-gid. */
+#define	ISUID		0004000	/* Set-uid. */
+
+#define	PRINT_NODE_FLAGS \
+	"\10\1IN_ACCESS\2IN_CHANGE\3IN_UPDATE\4IN_MODIFIED\5IN_RENAME"
+
+#define	NANDFS_GATHER(x) ((x)->b_flags |= B_00800000)
+#define	NANDFS_UNGATHER(x) ((x)->b_flags &= ~B_00800000)
+#define	NANDFS_ISGATHERED(x) ((x)->b_flags & B_00800000)
+
+#endif /* !_FS_NANDFS_NANDFS_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_alloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_alloc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,364 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_alloc.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Compute the file-relative block number of descriptor block 'desc'
+ * (each descriptor block governs mdt->blocks_per_desc_block blocks).
+ */
+static void
+nandfs_get_desc_block_nr(struct nandfs_mdt *mdt, uint64_t desc,
+    uint64_t *desc_block)
+{
+
+	*desc_block = desc * mdt->blocks_per_desc_block;
+}
+
+/*
+ * Compute the file-relative block number of the bitmap block for 'group':
+ * skip to the containing descriptor block, step over it (+1), then over
+ * the groups that precede this one inside it.
+ */
+static void
+nandfs_get_group_block_nr(struct nandfs_mdt *mdt, uint64_t group,
+    uint64_t *group_block)
+{
+	uint64_t desc, group_off;
+
+	desc = group / mdt->groups_per_desc_block;
+	group_off = group % mdt->groups_per_desc_block;
+	*group_block = desc * mdt->blocks_per_desc_block +
+	    1 + group_off * mdt->blocks_per_group;
+}
+
+/*
+ * Initialize a freshly created descriptor block: every group in it starts
+ * with all of its entries free.
+ */
+static void
+init_desc_block(struct nandfs_mdt *mdt, uint8_t *block_data)
+{
+	struct nandfs_block_group_desc *desc;
+	uint32_t i;
+
+	desc = (struct nandfs_block_group_desc *) block_data;
+	for (i = 0; i < mdt->groups_per_desc_block; i++)
+		desc[i].bg_nfrees = mdt->entries_per_group;
+}
+
+/*
+ * Scan the allocation bitmaps of 'node' for a free entry, starting near
+ * req->entrynum and wrapping around once.  On success, req->entrynum is
+ * set to the free entry's number and the descriptor and bitmap buffers
+ * are returned held in req->bp_desc / req->bp_bitmap.  Descriptor and
+ * bitmap blocks beyond EOF are created on the fly.  Returns 0, ENOENT if
+ * no free entry exists, or an I/O error.
+ */
+int
+nandfs_find_free_entry(struct nandfs_mdt *mdt, struct nandfs_node *node,
+    struct nandfs_alloc_request *req)
+{
+	nandfs_daddr_t desc, group, maxgroup, maxdesc, pos = 0;
+	nandfs_daddr_t start_group, start_desc;
+	nandfs_daddr_t desc_block, group_block;
+	nandfs_daddr_t file_blocks;
+	struct nandfs_block_group_desc *descriptors;
+	struct buf *bp, *bp2;
+	uint32_t *mask, i, mcount, msize;
+	int error;
+
+	file_blocks = node->nn_inode.i_blocks;
+	/* Entry numbers are 32-bit, hence the 2^32 cap. */
+	maxgroup = 0x100000000ull / mdt->entries_per_group;
+	maxdesc = maxgroup / mdt->groups_per_desc_block;
+	start_group = req->entrynum / mdt->entries_per_group;
+	start_desc = start_group / mdt->groups_per_desc_block;
+
+	bp = bp2 = NULL;
+restart:
+	for (desc = start_desc; desc < maxdesc; desc++) {
+		nandfs_get_desc_block_nr(mdt, desc, &desc_block);
+
+		/* Drop the previous iteration's descriptor buffer. */
+		if (bp)
+			brelse(bp);
+		if (desc_block < file_blocks) {
+			error = nandfs_bread(node, desc_block, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		} else {
+			error = nandfs_bcreate(node, desc_block, NOCRED, 0,
+			    &bp);
+			if (error)
+				return (error);
+			file_blocks++;
+			init_desc_block(mdt, bp->b_data);
+		}
+
+		descriptors = (struct nandfs_block_group_desc *) bp->b_data;
+		for (group = start_group; group < mdt->groups_per_desc_block;
+		    group++) {
+			if (descriptors[group].bg_nfrees > 0) {
+				nandfs_get_group_block_nr(mdt, group,
+				    &group_block);
+
+				/* Drop the previous group's bitmap buffer. */
+				if (bp2)
+					brelse(bp2);
+				if (group_block < file_blocks) {
+					error = nandfs_bread(node, group_block,
+					    NOCRED, 0, &bp2);
+					if (error) {
+						brelse(bp2);
+						brelse(bp);
+						return (error);
+					}
+				} else {
+					error = nandfs_bcreate(node,
+					    group_block, NOCRED, 0, &bp2);
+					if (error) {
+						/*
+						 * Fix: release the held
+						 * descriptor buffer; it was
+						 * leaked on this path.
+						 */
+						brelse(bp);
+						return (error);
+					}
+					file_blocks++;
+				}
+				mask = (uint32_t *)bp2->b_data;
+				msize = (sizeof(uint32_t) * __CHAR_BIT);
+				mcount = mdt->entries_per_group / msize;
+				for (i = 0; i < mcount; i++) {
+					if (mask[i] == UINT32_MAX)
+						continue;
+
+					/* First clear bit in this word. */
+					pos = ffs(~mask[i]) - 1;
+					pos += (msize * i);
+					pos += (group * mdt->entries_per_group);
+					/*
+					 * Entries preceding this descriptor
+					 * block.  Fix: the original also
+					 * multiplied by 'group', dropping
+					 * the desc contribution entirely
+					 * whenever group == 0.
+					 */
+					pos += desc *
+					    mdt->groups_per_desc_block *
+					    mdt->entries_per_group;
+					goto found;
+				}
+			}
+		}
+		start_group = 0;
+	}
+
+	if (start_desc != 0) {
+		/* Wrap around: rescan from the start, up to where we began. */
+		maxdesc = start_desc;
+		start_desc = 0;
+		req->entrynum = 0;
+		goto restart;
+	}
+
+	/* Fix: drop buffers still held from the last iteration. */
+	if (bp)
+		brelse(bp);
+	if (bp2)
+		brelse(bp2);
+
+	return (ENOENT);
+
+found:
+	req->entrynum = pos;
+	req->bp_desc = bp;
+	req->bp_bitmap = bp2;
+	DPRINTF(ALLOC, ("%s: desc: %p bitmap: %p entry: %#jx\n",
+	    __func__, req->bp_desc, req->bp_bitmap, (uintmax_t)pos));
+
+	return (0);
+}
+
+/*
+ * Load the descriptor, bitmap and entry blocks covering req->entrynum
+ * into req->bp_desc / req->bp_bitmap / req->bp_entry (all returned held).
+ * On error every buffer read so far is released.
+ */
+int
+nandfs_find_entry(struct nandfs_mdt* mdt, struct nandfs_node *nnode,
+    struct nandfs_alloc_request *req)
+{
+	uint64_t dblock, bblock, eblock;
+	uint32_t offset;
+	int error;
+
+	/* Translate the entry number into its three backing block numbers. */
+	nandfs_mdt_trans_blk(mdt, req->entrynum, &dblock, &bblock, &eblock,
+	    &offset);
+
+	error = nandfs_bread(nnode, dblock, NOCRED, 0, &req->bp_desc);
+	if (error) {
+		brelse(req->bp_desc);
+		return (error);
+	}
+
+	error = nandfs_bread(nnode, bblock, NOCRED, 0, &req->bp_bitmap);
+	if (error) {
+		brelse(req->bp_desc);
+		brelse(req->bp_bitmap);
+		return (error);
+	}
+
+	error = nandfs_bread(nnode, eblock, NOCRED, 0, &req->bp_entry);
+	if (error) {
+		brelse(req->bp_desc);
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (error);
+	}
+
+	DPRINTF(ALLOC,
+	    ("%s: desc_buf: %p bitmap_buf %p entry_buf %p offset %x\n",
+	    __func__, req->bp_desc, req->bp_bitmap, req->bp_entry, offset));
+
+	return (0);
+}
+
+/*
+ * Break an entry number down into its group within the descriptor block,
+ * the 32-bit word index inside the group bitmap, and the bit offset
+ * inside that word.
+ */
+static __inline void
+nandfs_calc_idx_entry(struct nandfs_mdt* mdt, uint32_t entrynum,
+    uint64_t *group, uint64_t *bitmap_idx, uint64_t *bitmap_off)
+{
+
+	/* Find group_desc index */
+	entrynum = entrynum %
+	    (mdt->entries_per_group * mdt->groups_per_desc_block);
+	*group = entrynum / mdt->entries_per_group;
+	/* Find bitmap index and bit offset */
+	entrynum = entrynum % mdt->entries_per_group;
+	*bitmap_idx = entrynum / (sizeof(uint32_t) * __CHAR_BIT);
+	*bitmap_off = entrynum % (sizeof(uint32_t) * __CHAR_BIT);
+}
+
+/*
+ * Release entry req->entrynum: bump the group's free-entry counter and
+ * clear the entry's bit in the bitmap (a set bit means allocated), then
+ * dirty the descriptor, bitmap and entry buffers.  Returns 0, or -1 when
+ * the descriptor buffer could not be dirtied (remaining buffers released).
+ */
+int
+nandfs_free_entry(struct nandfs_mdt* mdt, struct nandfs_alloc_request *req)
+{
+	struct nandfs_block_group_desc *descriptors;
+	uint64_t bitmap_idx, bitmap_off;
+	uint64_t group;
+	uint32_t *mask, maskrw;
+
+	nandfs_calc_idx_entry(mdt, req->entrynum, &group, &bitmap_idx,
+	    &bitmap_off);
+
+	DPRINTF(ALLOC, ("nandfs_free_entry: req->entrynum=%jx bitmap_idx=%jx"
+	   " bitmap_off=%jx group=%jx\n", (uintmax_t)req->entrynum,
+	   (uintmax_t)bitmap_idx, (uintmax_t)bitmap_off, (uintmax_t)group));
+
+	/* Update counter of free entries for group */
+	descriptors = (struct nandfs_block_group_desc *) req->bp_desc->b_data;
+	descriptors[group].bg_nfrees++;
+
+	/* Clear bit to indicate that entry is free again */
+	mask = (uint32_t *)req->bp_bitmap->b_data;
+	maskrw = mask[bitmap_idx];
+	KASSERT(maskrw & (1 << bitmap_off), ("freeing unallocated vblock"));
+	maskrw &= ~(1 << bitmap_off);
+	mask[bitmap_idx] = maskrw;
+
+	/* Make descriptor, bitmap and entry buffer dirty */
+	if (nandfs_dirty_buf(req->bp_desc, 0) == 0) {
+		nandfs_dirty_buf(req->bp_bitmap, 1);
+		nandfs_dirty_buf(req->bp_entry, 1);
+	} else {
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Claim entry req->entrynum: decrement the group's free-entry counter
+ * and set the entry's bit in the bitmap (a set bit means allocated),
+ * then dirty the descriptor, bitmap and entry buffers.  Returns 0, or
+ * -1 when the descriptor buffer could not be dirtied.
+ */
+int
+nandfs_alloc_entry(struct nandfs_mdt* mdt, struct nandfs_alloc_request *req)
+{
+	struct nandfs_block_group_desc *descriptors;
+	uint64_t bitmap_idx, bitmap_off;
+	uint64_t group;
+	uint32_t *mask, maskrw;
+
+	nandfs_calc_idx_entry(mdt, req->entrynum, &group, &bitmap_idx,
+	    &bitmap_off);
+
+	DPRINTF(ALLOC, ("nandfs_alloc_entry: req->entrynum=%jx bitmap_idx=%jx"
+	    " bitmap_off=%jx group=%jx\n", (uintmax_t)req->entrynum,
+	    (uintmax_t)bitmap_idx, (uintmax_t)bitmap_off, (uintmax_t)group));
+
+	/* Update counter of free entries for group */
+	descriptors = (struct nandfs_block_group_desc *) req->bp_desc->b_data;
+	descriptors[group].bg_nfrees--;
+
+	/* Set bit to indicate that entry is now allocated */
+	mask = (uint32_t *)req->bp_bitmap->b_data;
+	maskrw = mask[bitmap_idx];
+	maskrw |= 1 << bitmap_off;
+	mask[bitmap_idx] = maskrw;
+
+	/* Make descriptor, bitmap and entry buffer dirty */
+	if (nandfs_dirty_buf(req->bp_desc, 0) == 0) {
+		nandfs_dirty_buf(req->bp_bitmap, 1);
+		nandfs_dirty_buf(req->bp_entry, 1);
+	} else {
+		brelse(req->bp_bitmap);
+		brelse(req->bp_entry);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/* Back out of an allocation request: release every buffer held in 'req'. */
+void
+nandfs_abort_entry(struct nandfs_alloc_request *req)
+{
+
+	brelse(req->bp_desc);
+	brelse(req->bp_bitmap);
+	brelse(req->bp_entry);
+}
+
+/*
+ * Read (or, if 'create' is non-zero and the block lies beyond EOF,
+ * create) the block holding req->entrynum.  On success the held buffer
+ * is stored in req->bp_entry and *entry receives the entry's offset
+ * inside the block.  Returns 0, E2BIG when the block is beyond EOF and
+ * creation was not requested, or an I/O error.
+ */
+int
+nandfs_get_entry_block(struct nandfs_mdt *mdt, struct nandfs_node *node,
+    struct nandfs_alloc_request *req, uint32_t *entry, int create)
+{
+	struct buf *bp;
+	nandfs_lbn_t blocknr;
+	int	error;
+
+	/* Find buffer number for given entry */
+	nandfs_mdt_trans(mdt, req->entrynum, &blocknr, entry);
+	DPRINTF(ALLOC, ("%s: ino %#jx entrynum:%#jx block:%#jx entry:%x\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)req->entrynum,
+	    (uintmax_t)blocknr, *entry));
+
+	/* Read entry block or create if 'create' parameter is not zero */
+	bp = NULL;
+
+	if (blocknr < node->nn_inode.i_blocks)
+		error = nandfs_bread(node, blocknr, NOCRED, 0, &bp);
+	else if (create)
+		error = nandfs_bcreate(node, blocknr, NOCRED, 0, &bp);
+	else
+		error = E2BIG;
+
+	if (error) {
+		DPRINTF(ALLOC, ("%s: ino %#jx block %#jx entry %x error %d\n",
+		    __func__, (uintmax_t)node->nn_ino, (uintmax_t)blocknr,
+		    *entry, error));
+		if (bp)
+			brelse(bp);
+		return (error);
+	}
+
+	/* Only the DAT itself may legitimately lack a virtual block number. */
+	MPASS(nandfs_vblk_get(bp) != 0 || node->nn_ino == NANDFS_DAT_INO);
+
+	req->bp_entry = bp;
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_bmap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_bmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_bmap.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/ktr.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+#include "bmap.h"
+
+/* Thin wrapper: maximum file size supported by the bmap layer. */
+nandfs_lbn_t
+nandfs_get_maxfilesize(struct nandfs_device *fsdev)
+{
+
+	return (get_maxfilesize(fsdev));
+}
+
+/*
+ * Translate logical block 'lblk' of 'node' to its virtual block number.
+ * The GC inode is identity-mapped (lblk == vblk); everything else goes
+ * through the bmap.  Errors are logged and returned.
+ */
+int
+nandfs_bmap_lookup(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t *vblk)
+{
+	int error = 0;
+
+	if (node->nn_ino == NANDFS_GC_INO && lblk >= 0)
+		*vblk = lblk;
+	else
+		error = bmap_lookup(node, lblk, vblk);
+
+	DPRINTF(TRANSLATE, ("%s: error %d ino %#jx lblocknr %#jx -> %#jx\n",
+	    __func__, error, (uintmax_t)node->nn_ino, (uintmax_t)lblk,
+	    (uintmax_t)*vblk));
+
+	if (error)
+		nandfs_error("%s: returned %d", __func__, error);
+
+	return (error);
+}
+
+/*
+ * Bind buffer 'bp' at logical block 'lblk' of 'node' into the bmap.
+ * A fresh virtual block is allocated for every node except the DAT
+ * itself (which uses vblk 0).  On bmap failure the virtual block is
+ * returned to the allocator.
+ */
+int
+nandfs_bmap_insert_block(struct nandfs_node *node, nandfs_lbn_t lblk,
+    struct buf *bp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk;
+	int error;
+
+	fsdev = node->nn_nandfsdev;
+
+	vblk = 0;
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		error = nandfs_vblock_alloc(fsdev, &vblk);
+		if (error)
+			return (error);
+	}
+
+	nandfs_buf_set(bp, NANDFS_VBLK_ASSIGNED);
+	nandfs_vblk_set(bp, vblk);
+
+	error = bmap_insert_block(node, lblk, vblk);
+	if (error) {
+		nandfs_vblock_free(fsdev, vblk);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Dirty the bmap metadata blocks covering bp's logical block; errors are
+ * logged and propagated.
+ */
+int
+nandfs_bmap_dirty_blocks(struct nandfs_node *node, struct buf *bp, int force)
+{
+	int error;
+
+	error = bmap_dirty_meta(node, bp->b_lblkno, force);
+	if (error)
+		nandfs_error("%s: cannot dirty buffer %p\n",
+		    __func__, bp);
+
+	return (error);
+}
+
+/*
+ * Point logical block 'lblk' of 'node' at 'blknr' in the bmap
+ * (bmap_insert_block overwrites an existing mapping).
+ */
+static int
+nandfs_bmap_update_mapping(struct nandfs_node *node, nandfs_lbn_t lblk,
+    nandfs_daddr_t blknr)
+{
+	int error;
+
+	DPRINTF(BMAP,
+	    ("%s: node: %p ino: %#jx lblk: %#jx vblk: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino, (uintmax_t)lblk,
+	    (uintmax_t)blknr));
+
+	error = bmap_insert_block(node, lblk, blknr);
+
+	return (error);
+}
+
+/*
+ * Record 'blknr' as the (virtual) block backing buffer 'bp' and update
+ * the bmap mapping for bp's logical block accordingly.
+ */
+int
+nandfs_bmap_update_block(struct nandfs_node *node, struct buf *bp,
+    nandfs_lbn_t blknr)
+{
+	nandfs_lbn_t lblk;
+	int error;
+
+	lblk = bp->b_lblkno;
+	nandfs_vblk_set(bp, blknr);
+
+	DPRINTF(BMAP, ("%s: node: %p ino: %#jx bp: %p lblk: %#jx blk: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino, bp,
+	    (uintmax_t)lblk, (uintmax_t)blknr));
+
+	error = nandfs_bmap_update_mapping(node, lblk, blknr);
+	if (error) {
+		nandfs_error("%s: cannot update lblk:%jx to blk:%jx for "
+		    "node:%p, error:%d\n", __func__, (uintmax_t)lblk,
+		    (uintmax_t)blknr, node, error);
+		return (error);
+	}
+
+	return (error);
+}
+
+/*
+ * Re-home buffer 'bp' onto a newly allocated virtual block and retire
+ * 'oldblk'.  No-ops for the DAT inode and for buffers that already had a
+ * virtual block assigned this cycle (NANDFS_VBLK_ASSIGNED is consumed).
+ */
+int
+nandfs_bmap_update_dat(struct nandfs_node *node, nandfs_daddr_t oldblk,
+    struct buf *bp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk = 0;
+	int error;
+
+	if (node->nn_ino == NANDFS_DAT_INO)
+		return (0);
+
+	if (nandfs_buf_check(bp, NANDFS_VBLK_ASSIGNED)) {
+		nandfs_buf_clear(bp, NANDFS_VBLK_ASSIGNED);
+		return (0);
+	}
+
+	fsdev = node->nn_nandfsdev;
+
+	/* First alloc new virtual block.... */
+	error = nandfs_vblock_alloc(fsdev, &vblk);
+	if (error)
+		return (error);
+
+	error = nandfs_bmap_update_block(node, bp, vblk);
+	if (error)
+		return (error);
+
+	/* Then we can end up with old one */
+	nandfs_vblock_end(fsdev, oldblk);
+
+	DPRINTF(BMAP,
+	    ("%s: ino %#jx block %#jx: update vblk %#jx to %#jx\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)bp->b_lblkno,
+	    (uintmax_t)oldblk, (uintmax_t)vblk));
+	return (error);
+}
+
+/*
+ * Shrink node's bmap from old last block 'oblk' down to new last block
+ * 'nblk', removing (oblk - nblk) mappings.
+ */
+int
+nandfs_bmap_truncate_mapping(struct nandfs_node *node, nandfs_lbn_t oblk,
+    nandfs_lbn_t nblk)
+{
+	nandfs_lbn_t todo;
+	int error;
+
+	todo = oblk - nblk;
+
+	DPRINTF(BMAP, ("%s: node %p oblk %jx nblk %jx truncate by %jx\n",
+	    __func__, node, oblk, nblk, todo));
+
+	error = bmap_truncate_mapping(node, oblk, todo);
+	if (error)
+		return (error);
+
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_buffer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_buffer.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_buffer.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/buf.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate an empty buffer of 'size' bytes; never returns NULL (panics
+ * instead, see the XXX below).
+ */
+struct buf *
+nandfs_geteblk(int size, int flags)
+{
+	struct buf *bp;
+
+	/*
+	 * XXX
+	 * Right now we can call geteblk with GB_NOWAIT_BD flag, which means
+	 * it can return NULL. But we cannot afford to get NULL, hence this panic.
+	 */
+	bp = geteblk(size, flags);
+	if (bp == NULL)
+		panic("geteblk returned NULL");
+
+	return (bp);
+}
+
+/* Bump the device-wide dirty-buffer count under nd_mutex. */
+void
+nandfs_dirty_bufs_increment(struct nandfs_device *fsdev)
+{
+
+	mtx_lock(&fsdev->nd_mutex);
+	KASSERT(fsdev->nd_dirty_bufs >= 0, ("negative nd_dirty_bufs"));
+	fsdev->nd_dirty_bufs++;
+	mtx_unlock(&fsdev->nd_mutex);
+}
+
+/* Drop the device-wide dirty-buffer count under nd_mutex. */
+void
+nandfs_dirty_bufs_decrement(struct nandfs_device *fsdev)
+{
+
+	mtx_lock(&fsdev->nd_mutex);
+	KASSERT(fsdev->nd_dirty_bufs > 0,
+	    ("decrementing not-positive nd_dirty_bufs"));
+	fsdev->nd_dirty_bufs--;
+	mtx_unlock(&fsdev->nd_mutex);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_cleaner.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_cleaner.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,620 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_cleaner.c 236188 2012-05-28 16:33:58Z marcel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/buf.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+#define	NANDFS_CLEANER_KILL	1
+
+static void nandfs_cleaner(struct nandfs_device *);
+static int nandfs_cleaner_clean_segments(struct nandfs_device *,
+    struct nandfs_vinfo *, uint32_t, struct nandfs_period *, uint32_t,
+    struct nandfs_bdesc *, uint32_t, uint64_t *, uint32_t);
+
+static int
+nandfs_process_bdesc(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint64_t nmembs);
+
+/*
+ * Kick the cleaner thread and block until it signals nd_clean_cv.  With
+ * reason == NANDFS_CLEANER_KILL the cleaner is asked to exit instead of
+ * run another pass.
+ */
+static void
+nandfs_wakeup_wait_cleaner(struct nandfs_device *fsdev, int reason)
+{
+
+	mtx_lock(&fsdev->nd_clean_mtx);
+	if (reason == NANDFS_CLEANER_KILL)
+		fsdev->nd_cleaner_exit = 1;
+	if (fsdev->nd_cleaning == 0) {
+		fsdev->nd_cleaning = 1;
+		wakeup(&fsdev->nd_cleaning);
+	}
+	cv_wait(&fsdev->nd_clean_cv, &fsdev->nd_clean_mtx);
+	mtx_unlock(&fsdev->nd_clean_mtx);
+}
+
+/* Spawn the per-device cleaner kthread.  Returns kthread_add()'s error. */
+int
+nandfs_start_cleaner(struct nandfs_device *fsdev)
+{
+	int error;
+
+	MPASS(fsdev->nd_cleaner == NULL);
+
+	fsdev->nd_cleaner_exit = 0;
+
+	error = kthread_add((void(*)(void *))nandfs_cleaner, fsdev, NULL,
+	    &fsdev->nd_cleaner, 0, 0, "nandfs_cleaner");
+	if (error)
+		printf("nandfs: could not start cleaner: %d\n", error);
+
+	return (error);
+}
+
+/*
+ * Ask the cleaner kthread to exit and wait for its acknowledgement
+ * (via nandfs_wakeup_wait_cleaner), then forget the thread pointer.
+ */
+int
+nandfs_stop_cleaner(struct nandfs_device *fsdev)
+{
+
+	MPASS(fsdev->nd_cleaner != NULL);
+	nandfs_wakeup_wait_cleaner(fsdev, NANDFS_CLEANER_KILL);
+	fsdev->nd_cleaner = NULL;
+
+	DPRINTF(CLEAN, ("cleaner stopped\n"));
+	return (0);
+}
+
+/*
+ * End-of-pass bookkeeping for the cleaner thread: sleep until the next
+ * interval (or an explicit wakeup), signal any waiters, and report
+ * whether the thread has been told to exit (non-zero == exit).
+ */
+static int
+nandfs_cleaner_finished(struct nandfs_device *fsdev)
+{
+	int exit;
+
+	mtx_lock(&fsdev->nd_clean_mtx);
+	fsdev->nd_cleaning = 0;
+	if (!fsdev->nd_cleaner_exit) {
+		DPRINTF(CLEAN, ("%s: sleep\n", __func__));
+		msleep(&fsdev->nd_cleaning, &fsdev->nd_clean_mtx, PRIBIO, "-",
+		    hz * nandfs_cleaner_interval);
+	}
+	exit = fsdev->nd_cleaner_exit;
+	cv_broadcast(&fsdev->nd_clean_cv);
+	mtx_unlock(&fsdev->nd_clean_mtx);
+	if (exit) {
+		DPRINTF(CLEAN, ("%s: no longer active\n", __func__));
+		return (1);
+	}
+
+	return (0);
+}
+
+/* Debug dump of segment-usage records (number, mtime, a/d/e flags, blocks). */
+static void
+print_suinfo(struct nandfs_suinfo *suinfo, int nsegs)
+{
+	int i;
+
+	for (i = 0; i < nsegs; i++) {
+		DPRINTF(CLEAN, ("%jx  %jd  %c%c%c  %10u\n",
+		    suinfo[i].nsi_num, suinfo[i].nsi_lastmod,
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_ACTIVE) ? 'a' : '-'),
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_DIRTY) ? 'd' : '-'),
+		    (suinfo[i].nsi_flags &
+		    (NANDFS_SEGMENT_USAGE_ERROR) ? 'e' : '-'),
+		    suinfo[i].nsi_blocks));
+	}
+}
+
+/*
+ * Decide whether a virtual block is still live: it is alive if its
+ * lifetime reaches the current checkpoint, or if a snapshot checkpoint
+ * (binary-searched in the sorted 'cp' array) falls inside its
+ * [nvi_start, nvi_end] lifetime.  Returns non-zero when alive.
+ */
+static int
+nandfs_cleaner_vblock_is_alive(struct nandfs_device *fsdev,
+    struct nandfs_vinfo *vinfo, struct nandfs_cpinfo *cp, uint32_t ncps)
+{
+	int64_t idx, min, max;
+
+	if (vinfo->nvi_end >= fsdev->nd_last_cno)
+		return (1);
+
+	if (ncps == 0)
+		return (0);
+
+	if (vinfo->nvi_end < cp[0].nci_cno ||
+	    vinfo->nvi_start > cp[ncps - 1].nci_cno)
+		return (0);
+
+	idx = min = 0;
+	max = ncps - 1;
+	while (min <= max) {
+		idx = (min + max) / 2;
+		if (vinfo->nvi_start == cp[idx].nci_cno)
+			return (1);
+		if (vinfo->nvi_start < cp[idx].nci_cno)
+			max = idx - 1;
+		else
+			min = idx + 1;
+	}
+
+	/* Not an exact start match; alive iff the span covers cp[idx]. */
+	return (vinfo->nvi_end >= cp[idx].nci_cno);
+}
+
+/* Set nvi_alive on each vinfo record per nandfs_cleaner_vblock_is_alive(). */
+static void
+nandfs_cleaner_vinfo_mark_alive(struct nandfs_device *fsdev,
+    struct nandfs_vinfo *vinfo, uint32_t nmembs, struct nandfs_cpinfo *cp,
+    uint32_t ncps)
+{
+	uint32_t i;
+
+	for (i = 0; i < nmembs; i++)
+		vinfo[i].nvi_alive =
+		    nandfs_cleaner_vblock_is_alive(fsdev, &vinfo[i], cp, ncps);
+}
+
+/*
+ * A DAT block descriptor is alive when its original block number still
+ * matches its current one (i.e. it has not been relocated).
+ */
+static int
+nandfs_cleaner_bdesc_is_alive(struct nandfs_device *fsdev,
+    struct nandfs_bdesc *bdesc)
+{
+	int alive;
+
+	alive = bdesc->bd_oblocknr == bdesc->bd_blocknr;
+	if (!alive)
+		MPASS(abs(bdesc->bd_oblocknr - bdesc->bd_blocknr) > 2);
+
+	return (alive);
+}
+
+/* Set bd_alive on each bdesc record per nandfs_cleaner_bdesc_is_alive(). */
+static void
+nandfs_cleaner_bdesc_mark_alive(struct nandfs_device *fsdev,
+    struct nandfs_bdesc *bdesc, uint32_t nmembs)
+{
+	uint32_t i;
+
+	for (i = 0; i < nmembs; i++)
+		bdesc[i].bd_alive = nandfs_cleaner_bdesc_is_alive(fsdev,
+		    &bdesc[i]);
+}
+
+/*
+ * Walk the binfo entries of one partial segment starting at 'blk' and
+ * append each one to the appropriate output array: DAT-owned blocks go
+ * into *bdpp (with their on-media block number reconstructed), all other
+ * blocks into *vipp.  Both cursors are advanced in place.
+ */
+static void
+nandfs_cleaner_iterate_psegment(struct nandfs_device *fsdev,
+    struct nandfs_segment_summary *segsum, union nandfs_binfo *binfo,
+    nandfs_daddr_t blk, struct nandfs_vinfo **vipp, struct nandfs_bdesc **bdpp)
+{
+	int i;
+
+	DPRINTF(CLEAN, ("%s nbinfos %x\n", __func__, segsum->ss_nbinfos));
+	for (i = 0; i < segsum->ss_nbinfos; i++) {
+		if (binfo[i].bi_v.bi_ino == NANDFS_DAT_INO) {
+			(*bdpp)->bd_oblocknr = blk + segsum->ss_nblocks -
+			    segsum->ss_nbinfos + i;
+			/*
+			 * XXX Hack
+			 */
+			if (segsum->ss_flags & NANDFS_SS_SR)
+				(*bdpp)->bd_oblocknr--;
+			(*bdpp)->bd_level = binfo[i].bi_dat.bi_level;
+			(*bdpp)->bd_offset = binfo[i].bi_dat.bi_blkoff;
+			(*bdpp)++;
+		} else {
+			(*vipp)->nvi_ino = binfo[i].bi_v.bi_ino;
+			(*vipp)->nvi_vblocknr = binfo[i].bi_v.bi_vblocknr;
+			(*vipp)++;
+		}
+	}
+}
+
+/*
+ * Walk all partial segments of 'segno', collecting their binfo entries
+ * into *vipp / *bdpp via nandfs_cleaner_iterate_psegment().  *select is
+ * set non-zero only when the whole segment was read successfully.
+ */
+static int
+nandfs_cleaner_iterate_segment(struct nandfs_device *fsdev, uint64_t segno,
+    struct nandfs_vinfo **vipp, struct nandfs_bdesc **bdpp, int *select)
+{
+	struct nandfs_segment_summary *segsum;
+	union nandfs_binfo *binfo;
+	struct buf *bp;
+	uint32_t nblocks;
+	nandfs_daddr_t curr, start, end;
+	int error = 0;
+
+	nandfs_get_segment_range(fsdev, segno, &start, &end);
+
+	DPRINTF(CLEAN, ("%s: segno %jx start %jx end %jx\n", __func__, segno,
+	    start, end));
+
+	*select = 0;
+
+	for (curr = start; curr < end; curr += nblocks) {
+		error = nandfs_dev_bread(fsdev, curr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			nandfs_error("%s: couldn't load segment summary of %jx: %d\n",
+			    __func__, segno, error);
+			return (error);
+		}
+
+		segsum = (struct nandfs_segment_summary *)bp->b_data;
+		binfo = (union nandfs_binfo *)(bp->b_data + segsum->ss_bytes);
+
+		if (!nandfs_segsum_valid(segsum)) {
+			brelse(bp);
+			nandfs_error("nandfs: invalid summary of segment %jx\n", segno);
+			/*
+			 * NOTE(review): 'error' is still 0 here, so this
+			 * returns success with *select == 0 (segment is
+			 * skipped, not failed) -- confirm whether EINVAL
+			 * was intended instead.
+			 */
+			return (error);
+		}
+
+		DPRINTF(CLEAN, ("%s: %jx magic %x bytes %x nblocks %x nbinfos "
+		    "%x\n", __func__, segno, segsum->ss_magic, segsum->ss_bytes,
+		    segsum->ss_nblocks, segsum->ss_nbinfos));
+
+		nandfs_cleaner_iterate_psegment(fsdev, segsum, binfo, curr,
+		    vipp, bdpp);
+		nblocks = segsum->ss_nblocks;
+		brelse(bp);
+	}
+
+	if (error == 0)
+		*select = 1;
+
+	return (error);
+}
+
+/*
+ * Pick up to 'nsegs' dirty, non-active, non-error, non-GC segments
+ * starting the scan at *rseg (wrapping to 0 once if nothing is found).
+ * Selected segment numbers are appended through *segpp and *rseg is
+ * advanced past the last one chosen.
+ */
+static int
+nandfs_cleaner_choose_segment(struct nandfs_device *fsdev, uint64_t **segpp,
+    uint64_t nsegs, uint64_t *rseg)
+{
+	struct nandfs_suinfo *suinfo;
+	uint64_t i, ssegs;
+	int error;
+
+	suinfo = malloc(sizeof(*suinfo) * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+
+	if (*rseg >= fsdev->nd_fsdata.f_nsegments)
+		*rseg = 0;
+
+retry:
+	error = nandfs_get_segment_info_filter(fsdev, suinfo, nsegs, *rseg,
+	    &ssegs, NANDFS_SEGMENT_USAGE_DIRTY,
+	    NANDFS_SEGMENT_USAGE_ACTIVE | NANDFS_SEGMENT_USAGE_ERROR |
+	    NANDFS_SEGMENT_USAGE_GC);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+	if (ssegs == 0 && *rseg != 0) {
+		/* Nothing past *rseg; wrap to the start once. */
+		*rseg = 0;
+		goto retry;
+	}
+	if (ssegs > 0) {
+		print_suinfo(suinfo, ssegs);
+
+		for (i = 0; i < ssegs; i++) {
+			(**segpp) = suinfo[i].nsi_num;
+			(*segpp)++;
+		}
+		*rseg = suinfo[i - 1].nsi_num + 1;
+	}
+
+out:
+	free(suinfo, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * One cleaner pass: choose candidate segments, gather the virtual-block
+ * and DAT-block references they contain, mark which of those are still
+ * alive (consulting the snapshot checkpoints), and hand the results to
+ * nandfs_cleaner_clean_segments() under the device write lock.  '*rseg'
+ * carries the scan position between passes.
+ */
+static int
+nandfs_cleaner_body(struct nandfs_device *fsdev, uint64_t *rseg)
+{
+	struct nandfs_vinfo *vinfo, *vip, *vipi;
+	struct nandfs_bdesc *bdesc, *bdp, *bdpi;
+	struct nandfs_cpstat cpstat;
+	struct nandfs_cpinfo *cpinfo = NULL;
+	uint64_t *segnums, *segp;
+	int select, selected;
+	int error = 0;
+	int nsegs;
+	int i;
+
+	nsegs = nandfs_cleaner_segments;
+
+	/* Worst case: every block of every chosen segment has a binfo. */
+	vip = vinfo = malloc(sizeof(*vinfo) *
+	    fsdev->nd_fsdata.f_blocks_per_segment * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+	bdp = bdesc = malloc(sizeof(*bdesc) *
+	    fsdev->nd_fsdata.f_blocks_per_segment * nsegs, M_NANDFSTEMP,
+	    M_ZERO | M_WAITOK);
+	segp = segnums = malloc(sizeof(*segnums) * nsegs, M_NANDFSTEMP,
+	    M_WAITOK);
+
+	error = nandfs_cleaner_choose_segment(fsdev, &segp, nsegs, rseg);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+
+	/* No candidates this pass. */
+	if (segnums == segp)
+		goto out;
+
+	selected = 0;
+	for (i = 0; i < segp - segnums; i++) {
+		error = nandfs_cleaner_iterate_segment(fsdev, segnums[i], &vip,
+		    &bdp, &select);
+		if (error) {
+			/*
+			 * XXX deselect (see below)?
+			 */
+			goto out;
+		}
+		if (!select)
+			segnums[i] = NANDFS_NOSEGMENT;
+		else {
+			error = nandfs_markgc_segment(fsdev, segnums[i]);
+			if (error) {
+				nandfs_error("%s:%d\n", __FILE__, __LINE__);
+				goto out;
+			}
+			selected++;
+		}
+	}
+
+	if (selected == 0) {
+		MPASS(vinfo == vip);
+		MPASS(bdesc == bdp);
+		goto out;
+	}
+
+	error = nandfs_get_cpstat(fsdev->nd_cp_node, &cpstat);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out;
+	}
+
+	if (cpstat.ncp_nss != 0) {
+		cpinfo = malloc(sizeof(struct nandfs_cpinfo) * cpstat.ncp_nss,
+		    M_NANDFSTEMP, M_WAITOK);
+		error = nandfs_get_cpinfo(fsdev->nd_cp_node, 1, NANDFS_SNAPSHOT,
+		    cpinfo, cpstat.ncp_nss, NULL);
+		if (error) {
+			nandfs_error("%s:%d\n", __FILE__, __LINE__);
+			/*
+			 * Fix: the write lock is not held yet; jumping to
+			 * 'out_locked' here released an unheld lock.
+			 */
+			goto out;
+		}
+	}
+
+	NANDFS_WRITELOCK(fsdev);
+	DPRINTF(CLEAN, ("%s: got lock\n", __func__));
+
+	error = nandfs_get_dat_vinfo(fsdev, vinfo, vip - vinfo);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out_locked;
+	}
+
+	nandfs_cleaner_vinfo_mark_alive(fsdev, vinfo, vip - vinfo, cpinfo,
+	    cpstat.ncp_nss);
+
+	error = nandfs_get_dat_bdescs(fsdev, bdesc, bdp - bdesc);
+	if (error) {
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+		goto out_locked;
+	}
+
+	nandfs_cleaner_bdesc_mark_alive(fsdev, bdesc, bdp - bdesc);
+
+	DPRINTF(CLEAN, ("got:\n"));
+	for (vipi = vinfo; vipi < vip; vipi++) {
+		DPRINTF(CLEAN, ("v ino %jx vblocknr %jx start %jx end %jx "
+		    "alive %d\n", vipi->nvi_ino, vipi->nvi_vblocknr,
+		    vipi->nvi_start, vipi->nvi_end, vipi->nvi_alive));
+	}
+	for (bdpi = bdesc; bdpi < bdp; bdpi++) {
+		DPRINTF(CLEAN, ("b oblocknr %jx blocknr %jx offset %jx "
+		    "alive %d\n", bdpi->bd_oblocknr, bdpi->bd_blocknr,
+		    bdpi->bd_offset, bdpi->bd_alive));
+	}
+	DPRINTF(CLEAN, ("end list\n"));
+
+	error = nandfs_cleaner_clean_segments(fsdev, vinfo, vip - vinfo, NULL,
+	    0, bdesc, bdp - bdesc, segnums, segp - segnums);
+	if (error)
+		nandfs_error("%s:%d\n", __FILE__, __LINE__);
+
+out_locked:
+	NANDFS_WRITEUNLOCK(fsdev);
+out:
+	free(cpinfo, M_NANDFSTEMP);
+	free(segnums, M_NANDFSTEMP);
+	free(bdesc, M_NANDFSTEMP);
+	free(vinfo, M_NANDFSTEMP);
+
+	return (error);
+}
+
+/*
+ * Main loop of the garbage-collector kthread.  Runs cleaner passes until
+ * nandfs_cleaner_finished() reports shutdown, then exits the thread.
+ * 'checked_seg' carries the segment-scan position between passes.
+ */
+static void
+nandfs_cleaner(struct nandfs_device *fsdev)
+{
+	uint64_t checked_seg = 0;
+	int error;
+
+	while (!nandfs_cleaner_finished(fsdev)) {
+		/*
+		 * Skip the pass when cleaning is administratively disabled
+		 * or the system is shutting down.  NOTE(review): this relies
+		 * on nandfs_cleaner_finished() sleeping/yielding between
+		 * iterations, otherwise the loop would busy-spin -- confirm.
+		 */
+		if (!nandfs_cleaner_enable || rebooting)
+			continue;
+
+		DPRINTF(CLEAN, ("%s: run started\n", __func__));
+
+		fsdev->nd_cleaning = 1;
+
+		error = nandfs_cleaner_body(fsdev, &checked_seg);
+
+		DPRINTF(CLEAN, ("%s: run finished error %d\n", __func__,
+		    error));
+	}
+
+	DPRINTF(CLEAN, ("%s: exiting\n", __func__));
+	kthread_exit();
+}
+
+/*
+ * Final phase of a cleaner pass: route still-live blocks through the GC
+ * node, drop dead checkpoints and virtual blocks, and queue the reclaimed
+ * segments for reuse.
+ *
+ *   vinfo/nvinfo     - virtual block info, nvi_alive already computed
+ *   pd/npd           - checkpoint ranges to delete (may be NULL/0)
+ *   bdesc/nbdesc     - DAT-internal block descriptors
+ *   segments/nsegs   - segment numbers being reclaimed
+ */
+static int
+nandfs_cleaner_clean_segments(struct nandfs_device *nffsdev,
+    struct nandfs_vinfo *vinfo, uint32_t nvinfo,
+    struct nandfs_period *pd, uint32_t npd,
+    struct nandfs_bdesc *bdesc, uint32_t nbdesc,
+    uint64_t *segments, uint32_t nsegs)
+{
+	struct nandfs_node *gc;
+	struct buf *bp;
+	uint32_t i;
+	int error = 0;
+
+	gc = nffsdev->nd_gc_node;
+
+	DPRINTF(CLEAN, ("%s: enter\n", __func__));
+
+	/*
+	 * Read every live virtual block through the GC node, tag the buffer
+	 * with its vblock number and dirty it -- presumably so the segment
+	 * constructor relocates it into a fresh segment (confirm).
+	 */
+	VOP_LOCK(NTOV(gc), LK_EXCLUSIVE);
+	for (i = 0; i < nvinfo; i++) {
+		if (!vinfo[i].nvi_alive)
+			continue;
+		DPRINTF(CLEAN, ("%s: read vblknr:%#jx blk:%#jx\n",
+		    __func__, (uintmax_t)vinfo[i].nvi_vblocknr,
+		    (uintmax_t)vinfo[i].nvi_blocknr));
+		error = nandfs_bread(nffsdev->nd_gc_node, vinfo[i].nvi_blocknr,
+		    NULL, 0, &bp);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			VOP_UNLOCK(NTOV(gc), 0);
+			goto out;
+		}
+		nandfs_vblk_set(bp, vinfo[i].nvi_vblocknr);
+		nandfs_buf_set(bp, NANDFS_VBLK_ASSIGNED);
+		nandfs_dirty_buf(bp, 1);
+	}
+	VOP_UNLOCK(NTOV(gc), 0);
+
+	/* Delete checkpoints */
+	for (i = 0; i < npd; i++) {
+		DPRINTF(CLEAN, ("delete checkpoint: %jx\n",
+		    (uintmax_t)pd[i].p_start));
+		error = nandfs_delete_cp(nffsdev->nd_cp_node, pd[i].p_start,
+		    pd[i].p_end);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			goto out;
+		}
+	}
+
+	/* Update vblocks */
+	for (i = 0; i < nvinfo; i++) {
+		if (vinfo[i].nvi_alive)
+			continue;
+		DPRINTF(CLEAN, ("freeing vblknr: %jx\n", vinfo[i].nvi_vblocknr));
+		error = nandfs_vblock_free(nffsdev, vinfo[i].nvi_vblocknr);
+		if (error) {
+			nandfs_error("%s:%d", __FILE__, __LINE__);
+			goto out;
+		}
+	}
+
+	error = nandfs_process_bdesc(nffsdev, bdesc, nbdesc);
+	if (error) {
+		nandfs_error("%s:%d", __FILE__, __LINE__);
+		goto out;
+	}
+
+	/* Add segments to clean */
+	if (nffsdev->nd_free_count) {
+		/* M_WAITOK allocations cannot fail, so overwriting the
+		 * pointer with realloc()'s result is safe here. */
+		nffsdev->nd_free_base = realloc(nffsdev->nd_free_base,
+		    (nffsdev->nd_free_count + nsegs) * sizeof(uint64_t),
+		    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+		memcpy(&nffsdev->nd_free_base[nffsdev->nd_free_count], segments,
+		    nsegs * sizeof(uint64_t));
+		nffsdev->nd_free_count += nsegs;
+	} else {
+		nffsdev->nd_free_base = malloc(nsegs * sizeof(uint64_t),
+		    M_NANDFSTEMP, M_WAITOK|M_ZERO);
+		memcpy(nffsdev->nd_free_base, segments,
+		    nsegs * sizeof(uint64_t));
+		nffsdev->nd_free_count = nsegs;
+	}
+
+out:
+
+	DPRINTF(CLEAN, ("%s: exit error %d\n", __func__, error));
+
+	return (error);
+}
+
+/*
+ * Dirty the DAT-file blocks listed in 'bd' so the next segment write
+ * relocates them: indirect blocks (bd_level != 0) go through the meta-data
+ * buffer path, leaf blocks through the regular one.  Only entries with
+ * bd_alive set are processed.  Returns 0 or the read error.
+ */
+static int
+nandfs_process_bdesc(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint64_t nmembs)
+{
+	struct nandfs_node *dat_node;
+	struct buf *bp;
+	uint64_t i;
+	int error;
+
+	dat_node = nffsdev->nd_dat_node;
+
+	VOP_LOCK(NTOV(dat_node), LK_EXCLUSIVE);
+
+	for (i = 0; i < nmembs; i++) {
+		if (!bd[i].bd_alive)
+			continue;
+		DPRINTF(CLEAN, ("%s: idx %jx offset %jx\n",
+		    __func__, i, bd[i].bd_offset));
+		if (bd[i].bd_level) {
+			error = nandfs_bread_meta(dat_node, bd[i].bd_offset,
+			    NULL, 0, &bp);
+			if (error) {
+				nandfs_error("%s: cannot read dat node "
+				    "level:%d\n", __func__, bd[i].bd_level);
+				/* NOTE(review): assumes nandfs_bread_meta
+				 * returns a buffer even on error -- confirm */
+				brelse(bp);
+				VOP_UNLOCK(NTOV(dat_node), 0);
+				return (error);
+			}
+			nandfs_dirty_buf_meta(bp, 1);
+			nandfs_bmap_dirty_blocks(VTON(bp->b_vp), bp, 1);
+		} else {
+			error = nandfs_bread(dat_node, bd[i].bd_offset, NULL,
+			    0, &bp);
+			if (error) {
+				nandfs_error("%s: cannot read dat node\n",
+				    __func__);
+				brelse(bp);
+				VOP_UNLOCK(NTOV(dat_node), 0);
+				return (error);
+			}
+			nandfs_dirty_buf(bp, 1);
+		}
+		DPRINTF(CLEAN, ("%s: bp: %p\n", __func__, bp));
+	}
+
+	VOP_UNLOCK(NTOV(dat_node), 0);
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_cpfile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_cpfile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,776 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_cpfile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+
+/* Size in bytes of one on-media checkpoint entry for this device. */
+static int
+nandfs_checkpoint_size(struct nandfs_device *fsdev)
+{
+	int entry_size = fsdev->nd_fsdata.f_checkpoint_size;
+
+	return (entry_size);
+}
+
+/*
+ * Translate checkpoint number 'cn' (1-based) into the cpfile logical block
+ * number '*blk' and the byte offset '*offset' of its checkpoint entry
+ * within that block.  Always returns 0.
+ */
+static int
+nandfs_checkpoint_blk_offset(struct nandfs_device *fsdev, uint64_t cn,
+    uint64_t *blk, uint64_t *offset)
+{
+	uint64_t off;
+	uint16_t cp_size, cp_per_blk;
+
+	/* Typo fix in the panic message: "checkpoing" -> "checkpoint". */
+	KASSERT((cn), ("checkpoint cannot be zero"));
+
+	cp_size = fsdev->nd_fsdata.f_checkpoint_size;
+	cp_per_blk = fsdev->nd_blocksize / cp_size;
+	/* Entries start after the cpfile header, rounded up to entry size. */
+	off = roundup(sizeof(struct nandfs_cpfile_header), cp_size) / cp_size;
+	off += (cn - 1);
+
+	*blk = off / cp_per_blk;
+	*offset = (off % cp_per_blk) * cp_size;
+
+	return (0);
+}
+
+/*
+ * Number of whole checkpoint entries that fit in the cpfile block at and
+ * after byte 'offset'.  'cn' and 'blk' are unused; kept for signature
+ * symmetry with nandfs_checkpoint_blk_offset().
+ */
+static int
+nandfs_checkpoint_blk_remaining(struct nandfs_device *fsdev, uint64_t cn,
+    uint64_t blk, uint64_t offset)
+{
+	uint16_t entry_size, left;
+
+	entry_size = fsdev->nd_fsdata.f_checkpoint_size;
+	left = (fsdev->nd_blocksize - offset) / entry_size;
+
+	return (left);
+}
+
+/*
+ * Prepare the cpfile for checkpoint 'cn': ensure the header block and the
+ * block that will hold the checkpoint entry exist and are marked dirty.
+ * Only the current checkpoint (nd_last_cno) or its direct successor is
+ * accepted.  Returns 0 on success, -1 on any failure (module convention).
+ */
+int
+nandfs_get_checkpoint(struct nandfs_device *fsdev, struct nandfs_node *cp_node,
+    uint64_t cn)
+{
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	if (cn != fsdev->nd_last_cno && cn != (fsdev->nd_last_cno + 1)) {
+		return (-1);
+	}
+
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (-1);
+	}
+
+	/* NOTE(review): no brelse on this path -- presumably
+	 * nandfs_dirty_buf() disposes of bp on failure; confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (-1);
+
+
+	nandfs_checkpoint_blk_offset(fsdev, cn, &blk, &offset);
+
+	if (blk != 0) {
+		/* Entry lives outside the header block: read it if it is
+		 * already allocated, otherwise create it. */
+		if (blk < cp_node->nn_inode.i_blocks)
+			error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			if (bp)
+				brelse(bp);
+			return (-1);
+		}
+
+		nandfs_dirty_buf(bp, 1);
+	}
+
+	DPRINTF(CPFILE, ("%s: cn:%#jx entry block:%#jx offset:%#jx\n",
+	    __func__, (uintmax_t)cn, (uintmax_t)blk, (uintmax_t)offset));
+
+	return (0);
+}
+
+/*
+ * Write the checkpoint entry for 'cn' into the cpfile and bump the header
+ * checkpoint count.  Only the current checkpoint (nd_last_cno) or its
+ * direct successor may be set.  Returns 0, -1 for an invalid 'cn', or the
+ * read error.
+ */
+int
+nandfs_set_checkpoint(struct nandfs_device *fsdev, struct nandfs_node *cp_node,
+    uint64_t cn, struct nandfs_inode *ifile_inode, uint64_t nblocks)
+{
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	if (cn != fsdev->nd_last_cno && cn != (fsdev->nd_last_cno + 1)) {
+		/* Typo fix: "chekpoint" -> "checkpoint". */
+		nandfs_error("%s: trying to set invalid checkpoint %jx - %jx\n",
+		    __func__, cn, fsdev->nd_last_cno);
+		return (-1);
+	}
+
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	cnh->ch_ncheckpoints++;
+	/* NOTE(review): the header change is not marked dirty here --
+	 * presumably the segment constructor flushes cpfile buffers;
+	 * confirm the increment cannot be lost. */
+
+	nandfs_checkpoint_blk_offset(fsdev, cn, &blk, &offset);
+
+	if (blk != 0) {
+		/* Entry lives in a different block; swap buffers. */
+		brelse(bp);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+	}
+
+	cnp = (struct nandfs_checkpoint *)((uint8_t *)bp->b_data + offset);
+	cnp->cp_flags = 0;
+	cnp->cp_checkpoints_count = 1;
+	memset(&cnp->cp_snapshot_list, 0, sizeof(struct nandfs_snapshot_list));
+	cnp->cp_cno = cn;
+	cnp->cp_create = fsdev->nd_ts.tv_sec;
+	cnp->cp_nblk_inc = nblocks;
+	cnp->cp_blocks_count = 0;
+	memcpy(&cnp->cp_ifile_inode, ifile_inode, sizeof(cnp->cp_ifile_inode));
+
+	DPRINTF(CPFILE, ("%s: cn:%#jx ctime:%#jx nblk:%#jx\n",
+	    __func__, (uintmax_t)cn, (uintmax_t)cnp->cp_create,
+	    (uintmax_t)nblocks));
+
+	brelse(bp);
+	return (0);
+}
+
+/* Report whether any mount of this device already uses checkpoint 'cno'. */
+static int
+nandfs_cp_mounted(struct nandfs_device *nandfsdev, uint64_t cno)
+{
+	struct nandfsmount *mp;
+
+	mtx_lock(&nandfsdev->nd_mutex);
+	/* No double-mounting of the same checkpoint */
+	STAILQ_FOREACH(mp, &nandfsdev->nd_mounts, nm_next_mount) {
+		if (mp->nm_mount_args.cpno == cno) {
+			mtx_unlock(&nandfsdev->nd_mutex);
+			return (1);
+		}
+	}
+	mtx_unlock(&nandfsdev->nd_mutex);
+
+	return (0);
+}
+
+/*
+ * Insert checkpoint 'cno' into the on-media snapshot list and set its
+ * snapshot flag.  The list is doubly linked, sorted by checkpoint number,
+ * threaded through the cpfile entries with its head in the cpfile header.
+ * Returns 0, ENOENT (invalid checkpoint), EINVAL (already a snapshot) or
+ * an I/O error.
+ */
+static int
+nandfs_cp_set_snapshot(struct nandfs_node *cp_node, uint64_t cno)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_snapshot_list *list;
+	struct buf *bp;
+	uint64_t blk, prev_blk, offset;
+	uint64_t curr, prev;
+	int error;
+
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get snapshot data */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	if (cnp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		brelse(bp);
+		return (ENOENT);
+	}
+	if ((cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT)) {
+		brelse(bp);
+		return (EINVAL);
+	}
+
+	brelse(bp);
+	/* Get list from header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	list = &cnh->ch_snapshot_list;
+	prev = list->ssl_prev;
+	brelse(bp);
+	prev_blk = ~(0);
+	curr = 0;
+	/*
+	 * Walk backwards from the newest snapshot until the predecessor
+	 * (first element with number <= cno) is found; 'curr' tracks its
+	 * successor, 0 meaning the list head in the header.
+	 */
+	while (prev > cno) {
+		curr = prev;
+		nandfs_checkpoint_blk_offset(fsdev, prev, &prev_blk, &offset);
+		error = nandfs_bread(cp_node, prev_blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+		prev = list->ssl_prev;
+		brelse(bp);
+	}
+
+	/* Re-read the successor element to patch its back pointer. */
+	if (curr == 0) {
+		/* NOTE(review): bread return value unchecked here -- confirm
+		 * the header block read cannot fail at this point. */
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	} else {
+		nandfs_checkpoint_blk_offset(fsdev, curr, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	}
+
+	list->ssl_prev = cno;
+	/* NOTE(review): on failure bp is not brelse'd here -- presumably
+	 * nandfs_dirty_buf() disposes of it; confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+
+	/* Update snapshot for cno */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	list = &cnp->cp_snapshot_list;
+	list->ssl_prev = prev;
+	list->ssl_next = curr;
+	cnp->cp_flags |= NANDFS_CHECKPOINT_SNAPSHOT;
+	nandfs_dirty_buf(bp, 1);
+
+	if (prev == 0) {
+		/* NOTE(review): bread return value unchecked -- see above. */
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	} else {
+		/* Update snapshot list for prev */
+		nandfs_checkpoint_blk_offset(fsdev, prev, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	}
+	list->ssl_next = cno;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	cnh->ch_nsnapshots++;
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Remove checkpoint 'cno' from the on-media snapshot list and clear its
+ * snapshot flag.  The list is doubly linked, threaded through the cpfile
+ * entries with its head in the cpfile header.  Returns 0, ENOENT (invalid
+ * checkpoint), EINVAL (not a snapshot) or an I/O error.
+ */
+static int
+nandfs_cp_clr_snapshot(struct nandfs_node *cp_node, uint64_t cno)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_snapshot_list *list;
+	struct buf *bp;
+	uint64_t blk, offset, snapshot_cnt;
+	uint64_t next, prev;
+	int error;
+
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get snapshot data */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	if (cnp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		brelse(bp);
+		return (ENOENT);
+	}
+	if (!(cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT)) {
+		brelse(bp);
+		return (EINVAL);
+	}
+
+	list = &cnp->cp_snapshot_list;
+	next = list->ssl_next;
+	prev = list->ssl_prev;
+	brelse(bp);
+
+	/* Get previous snapshot (prev == 0 means the head in the header) */
+	if (prev != 0) {
+		nandfs_checkpoint_blk_offset(fsdev, prev, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	} else {
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	}
+
+	/* Unlink 'cno': fix the forward pointer of the previous element. */
+	list->ssl_next = next;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	/* Get next snapshot */
+	if (next != 0) {
+		nandfs_checkpoint_blk_offset(fsdev, next, &blk, &offset);
+		error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		list = &cnp->cp_snapshot_list;
+	} else {
+		nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		list = &cnh->ch_snapshot_list;
+	}
+	/* ... and the back pointer of the next element. */
+	list->ssl_prev = prev;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update snapshot list for cno */
+	nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+	error = nandfs_bread(cp_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+	list = &cnp->cp_snapshot_list;
+	list->ssl_prev = 0;
+	list->ssl_next = 0;
+	/*
+	 * Clear only the snapshot bit.  The original used logical negation
+	 * ('&= !FLAG'), i.e. '&= 0', which wiped every flag in cp_flags
+	 * including NANDFS_CHECKPOINT_INVALID.
+	 */
+	cnp->cp_flags &= ~NANDFS_CHECKPOINT_SNAPSHOT;
+	nandfs_dirty_buf(bp, 1);
+
+	/* Update header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	snapshot_cnt = cnh->ch_nsnapshots;
+	snapshot_cnt--;
+	cnh->ch_nsnapshots = snapshot_cnt;
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Switch checkpoint 'ncpm->ncpm_cno' between plain-checkpoint and snapshot
+ * mode, under the cpfile vnode lock.
+ */
+int
+nandfs_chng_cpmode(struct nandfs_node *node, struct nandfs_cpmode *ncpm)
+{
+	uint64_t cno;
+	int error;
+
+	cno = ncpm->ncpm_cno;
+
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	switch (ncpm->ncpm_mode) {
+	case NANDFS_CHECKPOINT:
+		/* Refuse to demote a snapshot that is mounted somewhere. */
+		if (nandfs_cp_mounted(node->nn_nandfsdev, cno))
+			error = EBUSY;
+		else
+			error = nandfs_cp_clr_snapshot(node, cno);
+		break;
+	case NANDFS_SNAPSHOT:
+		error = nandfs_cp_set_snapshot(node, cno);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (error);
+}
+
+/* Convert an on-media checkpoint entry into the user-visible cpinfo form. */
+static void
+nandfs_cpinfo_fill(struct nandfs_checkpoint *cnp, struct nandfs_cpinfo *nci)
+{
+
+	nci->nci_cno = cnp->cp_cno;
+	nci->nci_create = cnp->cp_create;
+	nci->nci_nblk_inc = cnp->cp_nblk_inc;
+	nci->nci_blocks_count = cnp->cp_blocks_count;
+	nci->nci_next = cnp->cp_snapshot_list.ssl_next;
+	nci->nci_flags = cnp->cp_flags;
+	nci->nci_pad = 0;
+	DPRINTF(CPFILE, ("%s: cn:%#jx ctime:%#jx\n",
+	    __func__, (uintmax_t)cnp->cp_cno,
+	    (uintmax_t)cnp->cp_create));
+}
+
+/*
+ * Copy descriptors of consecutive checkpoints starting at 'cno' into 'nci':
+ * at most 'mnmembs' entries and never past the filesystem's last checkpoint.
+ * '*nmembs' receives the number filled in; 'nmembs' must not be NULL (it is
+ * dereferenced unconditionally below).
+ */
+static int
+nandfs_get_cpinfo_cp(struct nandfs_node *node, uint64_t cno,
+    struct nandfs_cpinfo *nci, uint32_t mnmembs, uint32_t *nmembs)
+{
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint64_t blk, offset, last_cno, i;
+	uint16_t remaining;
+	int error;
+#ifdef INVARIANTS
+	uint64_t testblk, testoffset;
+#endif
+
+	if (cno == 0) {
+		return (ENOENT);
+	}
+
+	if (mnmembs < 1) {
+		return (EINVAL);
+	}
+
+	fsdev = node->nn_nandfsdev;
+	last_cno = fsdev->nd_last_cno;
+	DPRINTF(CPFILE, ("%s: cno:%#jx mnmembs: %#jx last:%#jx\n", __func__,
+	    (uintmax_t)cno, (uintmax_t)mnmembs,
+	    (uintmax_t)fsdev->nd_last_cno));
+
+	/*
+	 * do {
+	 * 	get block
+	 * 	read checkpoints until we hit last checkpoint, end of block or
+	 * 	requested number
+	 * } while (last read checkpoint <= last checkpoint on fs &&
+	 * 		read checkpoints < request number);
+	 */
+	*nmembs = i = 0;
+	do {
+		nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+		remaining = nandfs_checkpoint_blk_remaining(fsdev, cno,
+		    blk, offset);
+		error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		/* Consume entries until the block, the request or the
+		 * filesystem runs out of checkpoints. */
+		while (cno <= last_cno && i < mnmembs && remaining) {
+#ifdef INVARIANTS
+			nandfs_checkpoint_blk_offset(fsdev, cno, &testblk,
+			    &testoffset);
+			KASSERT(testblk == blk, ("testblk != blk"));
+			KASSERT(testoffset == offset, ("testoffset != offset"));
+#endif
+			DPRINTF(CPFILE, ("%s: cno %#jx\n", __func__,
+			    (uintmax_t)cno));
+
+			nandfs_cpinfo_fill((struct nandfs_checkpoint *)
+			    (bp->b_data + offset), nci);
+			offset += nandfs_checkpoint_size(fsdev);
+			i++;
+			nci++;
+			cno++;
+			(*nmembs)++;
+			remaining--;
+		}
+		brelse(bp);
+	} while (cno <= last_cno && i < mnmembs);
+
+	return (0);
+}
+
+/*
+ * Walk the snapshot list starting at checkpoint 'cno' (cno == 1 means
+ * "start from the first snapshot recorded in the cpfile header") and copy
+ * up to 'mnmembs' snapshot descriptors into 'nci'.  '*nmembs', when not
+ * NULL, receives the number of entries filled in.
+ */
+static int
+nandfs_get_cpinfo_sp(struct nandfs_node *node, uint64_t cno,
+    struct nandfs_cpinfo *nci, uint32_t mnmembs, uint32_t *nmembs)
+{
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_cpfile_header *cnh;
+	struct nandfs_device *fsdev;
+	struct buf *bp = NULL;
+	uint64_t curr = 0;
+	uint64_t blk, offset, curr_cno;
+	uint32_t flag;
+	uint32_t i;	/* unsigned: avoids signed/unsigned compare below */
+	int error;
+
+	if (cno == 0 || cno == ~(0))
+		return (ENOENT);
+
+	fsdev = node->nn_nandfsdev;
+	curr_cno = cno;
+
+	if (nmembs)
+		*nmembs = 0;
+	if (curr_cno == 1) {
+		/* Get list from header */
+		error = nandfs_bread(node, 0, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		cnh = (struct nandfs_cpfile_header *) bp->b_data;
+		curr_cno = cnh->ch_snapshot_list.ssl_next;
+		brelse(bp);
+		bp = NULL;
+
+		/* No snapshots */
+		if (curr_cno == 0)
+			return (0);
+	}
+
+	for (i = 0; i < mnmembs; i++, nci++) {
+		nandfs_checkpoint_blk_offset(fsdev, curr_cno, &blk, &offset);
+		if (i == 0 || curr != blk) {
+			/* Crossed into a different cpfile block. */
+			if (bp)
+				brelse(bp);
+			error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+			if (error) {
+				brelse(bp);
+				return (ENOENT);
+			}
+			curr = blk;
+		}
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		flag = cnp->cp_flags;
+		if (!(flag & NANDFS_CHECKPOINT_SNAPSHOT) ||
+		    (flag & NANDFS_CHECKPOINT_INVALID))
+			break;
+
+		/* Use the shared helper instead of open-coding the copy. */
+		nandfs_cpinfo_fill(cnp, nci);
+		if (nmembs)
+			(*nmembs)++;
+
+		curr_cno = nci->nci_next;
+		if (!curr_cno)
+			break;
+	}
+
+	/*
+	 * bp is still NULL when mnmembs == 0 (loop never ran); the original
+	 * called brelse(NULL) here, which dereferences a NULL buf.
+	 */
+	if (bp != NULL)
+		brelse(bp);
+
+	return (0);
+}
+
+/*
+ * Fetch checkpoint (NANDFS_CHECKPOINT) or snapshot (NANDFS_SNAPSHOT)
+ * descriptors starting at 'cno' into 'nci', under the cpfile vnode lock.
+ */
+int
+nandfs_get_cpinfo(struct nandfs_node *node, uint64_t cno, uint16_t flags,
+    struct nandfs_cpinfo *nci, uint32_t nmembs, uint32_t *nnmembs)
+{
+	int error;
+
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	if (flags == NANDFS_CHECKPOINT)
+		error = nandfs_get_cpinfo_cp(node, cno, nci, nmembs, nnmembs);
+	else if (flags == NANDFS_SNAPSHOT)
+		error = nandfs_get_cpinfo_sp(node, cno, nci, nmembs, nnmembs);
+	else
+		error = EINVAL;
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (error);
+}
+
+/*
+ * GET_CPINFO ioctl backend: bounce the user buffer through a kernel array,
+ * query the cpfile and copy the results back out.
+ */
+int
+nandfs_get_cpinfo_ioctl(struct nandfs_node *node, struct nandfs_argv *nargv)
+{
+	struct nandfs_cpinfo *kbuf;
+	uint32_t filled = 0;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_CPINFO_MAX)
+		return (EINVAL);
+
+	kbuf = malloc(sizeof(struct nandfs_cpinfo) * nargv->nv_nmembs,
+	    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+
+	error = nandfs_get_cpinfo(node, nargv->nv_index, nargv->nv_flags,
+	    kbuf, nargv->nv_nmembs, &filled);
+	if (error == 0) {
+		nargv->nv_nmembs = filled;
+		error = copyout(kbuf, (void *)((uintptr_t)nargv->nv_base),
+		    sizeof(struct nandfs_cpinfo) * filled);
+	}
+
+	free(kbuf, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Invalidate checkpoints in the range [start, end].  A snapshot terminates
+ * the scan (returns 0 without touching it); checkpoint number 0 is
+ * skipped.  Runs under the cpfile vnode lock.
+ */
+int
+nandfs_delete_cp(struct nandfs_node *node, uint64_t start, uint64_t end)
+{
+	struct nandfs_checkpoint *cnp;
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint64_t cno = start, blk, offset;
+	int error;
+
+	DPRINTF(CPFILE, ("%s: delete cno %jx-%jx\n", __func__, start, end));
+	VOP_LOCK(NTOV(node), LK_EXCLUSIVE);
+	fsdev = node->nn_nandfsdev;
+	for (cno = start; cno <= end; cno++) {
+		if (!cno)
+			continue;
+
+		nandfs_checkpoint_blk_offset(fsdev, cno, &blk, &offset);
+		error = nandfs_bread(node, blk, NOCRED, 0, &bp);
+		if (error) {
+			VOP_UNLOCK(NTOV(node), 0);
+			brelse(bp);
+			return (error);
+		}
+
+		cnp = (struct nandfs_checkpoint *)(bp->b_data + offset);
+		if (cnp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) {
+			brelse(bp);
+			VOP_UNLOCK(NTOV(node), 0);
+			return (0);
+		}
+
+		cnp->cp_flags |= NANDFS_CHECKPOINT_INVALID;
+
+		error = nandfs_dirty_buf(bp, 0);
+		if (error) {
+			/*
+			 * Don't leak the vnode lock: the original returned
+			 * here with NTOV(node) still locked.  bp is assumed
+			 * to be disposed of by nandfs_dirty_buf() on failure
+			 * (as on the other dirty_buf error paths here).
+			 */
+			VOP_UNLOCK(NTOV(node), 0);
+			return (error);
+		}
+	}
+	VOP_UNLOCK(NTOV(node), 0);
+
+	return (0);
+}
+
+/* Promote the most recent checkpoint to a snapshot; '*cno' gets its number. */
+int
+nandfs_make_snap(struct nandfs_device *fsdev, uint64_t *cno)
+{
+	struct nandfs_cpmode cpm;
+
+	cpm.ncpm_cno = fsdev->nd_last_cno;
+	cpm.ncpm_mode = NANDFS_SNAPSHOT;
+	*cno = cpm.ncpm_cno;
+
+	return (nandfs_chng_cpmode(fsdev->nd_cp_node, &cpm));
+}
+
+/* Demote snapshot 'cno' back to a regular checkpoint. */
+int
+nandfs_delete_snap(struct nandfs_device *fsdev, uint64_t cno)
+{
+	struct nandfs_cpmode cpm;
+
+	cpm.ncpm_mode = NANDFS_CHECKPOINT;
+	cpm.ncpm_cno = cno;
+
+	return (nandfs_chng_cpmode(fsdev->nd_cp_node, &cpm));
+}
+
+/*
+ * Report checkpoint statistics: the last checkpoint number plus the
+ * checkpoint and snapshot counts from the cpfile header.
+ * (Definition split onto two lines for consistency with the KNF style
+ * used by every other function in this file.)
+ */
+int
+nandfs_get_cpstat(struct nandfs_node *cp_node, struct nandfs_cpstat *ncp)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_cpfile_header *cnh;
+	struct buf *bp;
+	int error;
+
+	VOP_LOCK(NTOV(cp_node), LK_EXCLUSIVE);
+	fsdev = cp_node->nn_nandfsdev;
+
+	/* Get header */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (error);
+	}
+	cnh = (struct nandfs_cpfile_header *) bp->b_data;
+	ncp->ncp_cno = fsdev->nd_last_cno;
+	ncp->ncp_ncps = cnh->ch_ncheckpoints;
+	ncp->ncp_nss = cnh->ch_nsnapshots;
+	DPRINTF(CPFILE, ("%s: cno:%#jx ncps:%#jx nss:%#jx\n",
+	    __func__, ncp->ncp_cno, ncp->ncp_ncps, ncp->ncp_nss));
+	brelse(bp);
+	VOP_UNLOCK(NTOV(cp_node), 0);
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_dat.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_dat.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,344 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_dat.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate a fresh virtual block number from the DAT file.  The new
+ * entry's lifetime starts at the upcoming checkpoint (nd_last_cno + 1),
+ * its end is left open (UINTMAX_MAX) and no physical block is bound yet.
+ * On success '*vblock' holds the allocated number.  Takes the DAT vnode
+ * lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_alloc(struct nandfs_device *nandfsdev, nandfs_daddr_t *vblock)
+{
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *dat_entry;
+	uint64_t start;
+	uint32_t entry;
+	int locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+	start = nandfsdev->nd_last_cno + 1;
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	req.entrynum = 0;
+
+	/* Alloc vblock number */
+	error = nandfs_find_free_entry(mdt, dat, &req);
+	if (error) {
+		nandfs_error("%s: cannot find free vblk entry\n",
+		    __func__);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Read/create buffer */
+	error = nandfs_get_entry_block(mdt, dat, &req, &entry, 1);
+	if (error) {
+		nandfs_error("%s: cannot get free vblk entry\n",
+		    __func__);
+		nandfs_abort_entry(&req);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Fill out vblock data */
+	dat_entry = (struct nandfs_dat_entry *) req.bp_entry->b_data;
+	dat_entry[entry].de_start = start;
+	dat_entry[entry].de_end = UINTMAX_MAX;
+	dat_entry[entry].de_blocknr = 0;
+
+	/* Commit allocation */
+	error = nandfs_alloc_entry(mdt, &req);
+	if (error) {
+		nandfs_error("%s: cannot get free vblk entry\n",
+		    __func__);
+		if (!locked)
+			VOP_UNLOCK(NTOV(dat), 0);
+		return (error);
+	}
+
+	/* Return allocated vblock */
+	*vblock = req.entrynum;
+	DPRINTF(DAT, ("%s: allocated vblock %#jx\n",
+	    __func__, (uintmax_t)*vblock));
+
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+	return (error);
+}
+
+/*
+ * Bind virtual block 'vblock' to physical block 'block' in its DAT entry.
+ * Takes the DAT vnode lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_assign(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock,
+    nandfs_lbn_t block)
+{
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *dat_entry;
+	uint32_t entry;
+	int locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	req.entrynum = vblock;
+
+	error = nandfs_get_entry_block(mdt, dat, &req, &entry, 0);
+	if (!error) {
+		dat_entry = (struct nandfs_dat_entry *) req.bp_entry->b_data;
+		dat_entry[entry].de_blocknr = block;
+
+		/* Debug-message typo fixed: "assing" -> "assign". */
+		DPRINTF(DAT, ("%s: assign vblock %jx->%jx\n",
+		    __func__, (uintmax_t)vblock, (uintmax_t)block));
+
+		/*
+		 * It is mostly called from syncer() so
+		 * we want to force making buf dirty
+		 */
+		error = nandfs_dirty_buf(req.bp_entry, 1);
+	}
+
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * Close the lifetime of virtual block 'vblock' by storing the last
+ * checkpoint number in its DAT entry's de_end field.  Takes the DAT vnode
+ * lock unless the caller already holds it.
+ */
+int
+nandfs_vblock_end(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *entries;
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	uint64_t last_cno;
+	uint32_t idx;
+	int was_locked, error;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+	last_cno = nandfsdev->nd_last_cno;
+
+	was_locked = NANDFS_VOP_ISLOCKED(NTOV(dat));
+	if (!was_locked)
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+
+	req.entrynum = vblock;
+	error = nandfs_get_entry_block(mdt, dat, &req, &idx, 0);
+	if (error == 0) {
+		entries = (struct nandfs_dat_entry *)req.bp_entry->b_data;
+		entries[idx].de_end = last_cno;
+		DPRINTF(DAT, ("%s: end vblock %#jx at checkpoint %#jx\n",
+		    __func__, (uintmax_t)vblock, (uintmax_t)last_cno));
+
+		/*
+		 * It is mostly called from syncer() so
+		 * we want to force making buf dirty
+		 */
+		error = nandfs_dirty_buf(req.bp_entry, 1);
+	}
+
+	if (!was_locked)
+		VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/* Release the DAT entry of virtual block 'vblock'. */
+int
+nandfs_vblock_free(struct nandfs_device *nandfsdev, nandfs_daddr_t vblock)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_node *dat;
+	int error;
+
+	dat = nandfsdev->nd_dat_node;
+	req.entrynum = vblock;
+
+	VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	error = nandfs_find_entry(&nandfsdev->nd_dat_mdt, dat, &req);
+	if (error == 0) {
+		DPRINTF(DAT, ("%s: vblk %#jx\n", __func__, (uintmax_t)vblock));
+		nandfs_free_entry(&nandfsdev->nd_dat_mdt, &req);
+	}
+	VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * GET_VINFO ioctl backend: copy the vinfo request array in, resolve the
+ * virtual block mappings and copy the results back to userland.
+ */
+int
+nandfs_get_dat_vinfo_ioctl(struct nandfs_device *nandfsdev, struct nandfs_argv *nargv)
+{
+	struct nandfs_vinfo *kbuf;
+	void *ubase;
+	size_t bytes;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_VINFO_MAX)
+		return (EINVAL);
+
+	ubase = (void *)(uintptr_t)nargv->nv_base;
+	bytes = sizeof(struct nandfs_vinfo) * nargv->nv_nmembs;
+	kbuf = malloc(bytes, M_NANDFSTEMP, M_WAITOK|M_ZERO);
+
+	error = copyin(ubase, kbuf, bytes);
+	if (error == 0)
+		error = nandfs_get_dat_vinfo(nandfsdev, kbuf, nargv->nv_nmembs);
+	if (error == 0)
+		error = copyout(kbuf, ubase, bytes);
+
+	free(kbuf, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * For each entry in 'vinfo' look up its DAT entry and fill in the block's
+ * checkpoint lifetime ([start, end]) and current physical block number.
+ */
+int
+nandfs_get_dat_vinfo(struct nandfs_device *nandfsdev, struct nandfs_vinfo *vinfo,
+    uint32_t nmembs)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_dat_entry *entries;
+	struct nandfs_node *dat;
+	struct nandfs_mdt *mdt;
+	uint32_t n, idx;
+	int error = 0;
+
+	dat = nandfsdev->nd_dat_node;
+	mdt = &nandfsdev->nd_dat_mdt;
+
+	DPRINTF(DAT, ("%s: nmembs %#x\n", __func__, nmembs));
+
+	VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+	for (n = 0; n < nmembs; n++) {
+		req.entrynum = vinfo[n].nvi_vblocknr;
+
+		error = nandfs_get_entry_block(mdt, dat, &req, &idx, 0);
+		if (error)
+			break;
+
+		entries = (struct nandfs_dat_entry *)req.bp_entry->b_data;
+		vinfo[n].nvi_start = entries[idx].de_start;
+		vinfo[n].nvi_end = entries[idx].de_end;
+		vinfo[n].nvi_blocknr = entries[idx].de_blocknr;
+
+		DPRINTF(DAT, ("%s: vinfo: %jx[%jx-%jx]->%jx\n",
+		    __func__, vinfo[n].nvi_vblocknr, vinfo[n].nvi_start,
+		    vinfo[n].nvi_end, vinfo[n].nvi_blocknr));
+
+		brelse(req.bp_entry);
+	}
+	VOP_UNLOCK(NTOV(dat), 0);
+
+	return (error);
+}
+
+/*
+ * GET_BDESCS ioctl backend: copy the block-descriptor array in, resolve
+ * the current physical locations and copy the results back out.
+ */
+int
+nandfs_get_dat_bdescs_ioctl(struct nandfs_device *nffsdev,
+    struct nandfs_argv *nargv)
+{
+	struct nandfs_bdesc *bd;
+	size_t size;
+	int error;
+
+	/*
+	 * nv_nmembs comes straight from userland; unlike the vinfo/cpinfo
+	 * ioctls it was previously unbounded, letting a caller overflow the
+	 * size computation or force an arbitrarily large kernel allocation.
+	 * Reuse the vinfo bound here; a dedicated NANDFS_BDESC_MAX would be
+	 * cleaner if one is added to the headers.
+	 */
+	if (nargv->nv_nmembs > NANDFS_VINFO_MAX)
+		return (EINVAL);
+
+	size = nargv->nv_nmembs * sizeof(struct nandfs_bdesc);
+	bd = malloc(size, M_NANDFSTEMP, M_WAITOK);
+	error = copyin((void *)(uintptr_t)nargv->nv_base, bd, size);
+	if (error) {
+		free(bd, M_NANDFSTEMP);
+		return (error);
+	}
+
+	error = nandfs_get_dat_bdescs(nffsdev, bd, nargv->nv_nmembs);
+
+	if (error == 0)
+		error =	copyout(bd, (void *)(uintptr_t)nargv->nv_base, size);
+
+	free(bd, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Resolve the current physical block number of each DAT-file block listed
+ * in 'bd' via the DAT node's bmap.
+ */
+int
+nandfs_get_dat_bdescs(struct nandfs_device *nffsdev, struct nandfs_bdesc *bd,
+    uint32_t nmembs)
+{
+	struct nandfs_node *dat_node;
+	uint64_t blocknr;
+	uint32_t n;
+	int error = 0;
+
+	dat_node = nffsdev->nd_dat_node;
+	VOP_LOCK(NTOV(dat_node), LK_EXCLUSIVE);
+
+	for (n = 0; n < nmembs; n++) {
+		DPRINTF(CLEAN,
+		    ("%s: bd ino:%#jx oblk:%#jx blocknr:%#jx off:%#jx\n",
+		    __func__,  (uintmax_t)bd[n].bd_ino,
+		    (uintmax_t)bd[n].bd_oblocknr, (uintmax_t)bd[n].bd_blocknr,
+		    (uintmax_t)bd[n].bd_offset));
+
+		error = nandfs_bmap_lookup(dat_node, bd[n].bd_offset, &blocknr);
+		if (error)
+			break;
+		bd[n].bd_blocknr = blocknr;
+	}
+
+	VOP_UNLOCK(NTOV(dat_node), 0);
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_dir.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_dir.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,314 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_dir.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/kernel.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+/*
+ * Append a directory entry (ino, nameptr[0..namelen-1], type) to
+ * directory dvp.  The entry is packed into the slack of the last
+ * existing directory block when it fits; otherwise a fresh block is
+ * allocated and the old block's slack is folded into i_size.  The new
+ * entry always claims the remainder of its block (rec_len =
+ * blocksize - off), i.e. the last entry owns the block tail.  On
+ * success the directory's i_size is grown and the vnode pager resized.
+ * Returns 0 or an error from the buffer-cache helpers.
+ */
+int
+nandfs_add_dirent(struct vnode *dvp, uint64_t ino, char *nameptr, long namelen,
+    uint8_t type)
+{
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_dir_entry *dirent, *pdirent;
+	uint32_t blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	uint64_t filesize = dir_node->nn_inode.i_size;
+	uint64_t inode_blks = dir_node->nn_inode.i_blocks;
+	uint32_t off, rest;
+	uint8_t *pos;
+	struct buf *bp;
+	int error;
+
+	pdirent = NULL;
+	bp = NULL;
+	if (inode_blks) {
+		error = nandfs_bread(dir_node, inode_blks - 1, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+
+		/* Walk to the final entry of the last block; its rec_len
+		 * covers the tail, so that is where the slack lives. */
+		pos = bp->b_data;
+		off = 0;
+		while (off < blocksize) {
+			pdirent = (struct nandfs_dir_entry *) (pos + off);
+			if (!pdirent->rec_len) {
+				pdirent = NULL;
+				break;
+			}
+			off += pdirent->rec_len;
+		}
+
+		/* Slack after the last entry's real payload, or the whole
+		 * block when it is empty. */
+		if (pdirent)
+			rest = pdirent->rec_len -
+			    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		else
+			rest = blocksize;
+
+		if (rest < NANDFS_DIR_REC_LEN(namelen)) {
+			/* Do not update pdirent as new block is created */
+			pdirent = NULL;
+			brelse(bp);
+			/* Set to NULL to create new */
+			bp = NULL;
+			/* Unused tail of the old block still counts into
+			 * the directory size. */
+			filesize += rest;
+		}
+	}
+
+	/* If no bp found create new */
+	if (!bp) {
+		error = nandfs_bcreate(dir_node, inode_blks, NOCRED, 0, &bp);
+		if (error)
+			return (error);
+		off = 0;
+		pos = bp->b_data;
+	}
+
+	/* Modify pdirent if exists */
+	if (pdirent) {
+		DPRINTF(LOOKUP, ("modify pdirent %p\n", pdirent));
+		/* Shrink the old last entry to its real length; the new
+		 * entry starts right after it. */
+		off -= pdirent->rec_len;
+		pdirent->rec_len =
+		    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		off += pdirent->rec_len;
+	}
+
+	/* Create new dirent */
+	dirent = (struct nandfs_dir_entry *) (pos + off);
+	dirent->rec_len = blocksize - off;
+	dirent->inode = ino;
+	dirent->name_len = namelen;
+	/* Zero the padded name area before copying so on-media bytes are
+	 * deterministic. */
+	memset(dirent->name, 0, NANDFS_DIR_NAME_LEN(namelen));
+	memcpy(dirent->name, nameptr, namelen);
+	dirent->file_type = type;
+
+	filesize += NANDFS_DIR_REC_LEN(dirent->name_len);
+
+	DPRINTF(LOOKUP, ("create dir_entry '%.*s' at %p with size %x "
+	    "new filesize: %jx\n",
+	    (int)namelen, dirent->name, dirent, dirent->rec_len,
+	    (uintmax_t)filesize));
+
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes bp -- confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	dir_node->nn_inode.i_size = filesize;
+	dir_node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	vnode_pager_setsize(dvp, filesize);
+
+	return (0);
+}
+
+/*
+ * Remove a directory entry from directory dvp.  With a non-NULL 'node'
+ * the entry for node->nn_ino recorded at node->nn_diroff is removed;
+ * otherwise the whiteout entry (NANDFS_WHT_INO) recorded in the
+ * directory node itself is removed.  Removal folds the victim's record
+ * length into the preceding live entry and clears its inode number; if
+ * the victim was the directory's final entry, i_size is trimmed back.
+ *
+ * Returns 0 on success, ENOENT when no matching entry exists at the
+ * recorded offset, or an error from the buffer-cache helpers.
+ */
+int
+nandfs_remove_dirent(struct vnode *dvp, struct nandfs_node *node,
+    struct componentname *cnp)
+{
+	struct nandfs_node *dir_node;
+	struct nandfs_dir_entry *dirent, *pdirent;
+	struct buf *bp;
+	uint64_t filesize, blocknr, ino, offset;
+	uint32_t blocksize, limit, off;
+	uint16_t newsize;
+	uint8_t *pos;
+	int error, found;
+
+	dir_node = VTON(dvp);
+	filesize = dir_node->nn_inode.i_size;
+	if (!filesize)
+		return (0);
+
+	if (node) {
+		offset = node->nn_diroff;
+		ino = node->nn_ino;
+	} else {
+		offset = dir_node->nn_diroff;
+		ino = NANDFS_WHT_INO;
+	}
+
+	dirent = pdirent = NULL;
+	blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	blocknr = offset / blocksize;
+
+	DPRINTF(LOOKUP, ("rm direntry dvp %p node %p ino %#jx at off %#jx\n",
+	    dvp, node, (uintmax_t)ino, (uintmax_t)offset));
+
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/*
+	 * Walk the block up to the victim's offset, remembering the last
+	 * preceding live entry so its rec_len can absorb the victim.
+	 */
+	pos = bp->b_data;
+	off = 0;
+	found = 0;
+	limit = offset % blocksize;
+	pdirent = (struct nandfs_dir_entry *) bp->b_data;
+	while (off <= limit) {
+		dirent = (struct nandfs_dir_entry *) (pos + off);
+
+		if ((off == limit) &&
+		    (dirent->inode == ino)) {
+			found = 1;
+			break;
+		}
+		/*
+		 * Guard against corrupted media: a zero rec_len would
+		 * otherwise spin this loop forever.
+		 */
+		if (dirent->rec_len == 0)
+			break;
+		if (dirent->inode != 0)
+			pdirent = dirent;
+		off += dirent->rec_len;
+	}
+
+	if (!found) {
+		nandfs_error("cannot find entry to remove");
+		brelse(bp);
+		/*
+		 * Bug fix: this used to return the stale 'error' value,
+		 * which is always 0 here, silently reporting success.
+		 */
+		return (ENOENT);
+	}
+	DPRINTF(LOOKUP,
+	    ("rm dirent ino %#jx at %#x with size %#x\n",
+	    (uintmax_t)dirent->inode, off, dirent->rec_len));
+
+	/* Fold the victim's record into the preceding live entry. */
+	newsize = (uintptr_t)dirent - (uintptr_t)pdirent;
+	newsize += dirent->rec_len;
+	pdirent->rec_len = newsize;
+	dirent->inode = 0;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	dir_node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	/* If last one modify filesize */
+	if ((offset + NANDFS_DIR_REC_LEN(dirent->name_len)) == filesize) {
+		filesize = blocknr * blocksize +
+		    ((uintptr_t)pdirent - (uintptr_t)pos) +
+		    NANDFS_DIR_REC_LEN(pdirent->name_len);
+		dir_node->nn_inode.i_size = filesize;
+	}
+
+	return (0);
+}
+
+/*
+ * Rewrite the inode number of the first directory entry in block 0 of
+ * dvp to 'newparent'.  nandfs_init_dir() creates ".." as the first
+ * entry, so this effectively re-parents the directory.  Returns 0 or
+ * an error from the buffer-cache helpers.
+ */
+int
+nandfs_update_parent_dir(struct vnode *dvp, uint64_t newparent)
+{
+	struct nandfs_dir_entry *dirent;
+	struct nandfs_node *dir_node;
+	struct buf *bp;
+	int error;
+
+	dir_node = VTON(dvp);
+	error = nandfs_bread(dir_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+	dirent = (struct nandfs_dir_entry *)bp->b_data;
+	dirent->inode = newparent;
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes bp -- confirm. */
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Retarget an existing directory entry: the entry for tnode (inside
+ * directory dvp, at byte offset tnode->nn_diroff) has its inode number
+ * replaced by fnode's.  The entry's name and length are untouched.
+ * Returns 0 (also for an empty directory) or an error from the
+ * buffer-cache helpers.
+ */
+int
+nandfs_update_dirent(struct vnode *dvp, struct nandfs_node *fnode,
+    struct nandfs_node *tnode)
+{
+	struct nandfs_node *dir_node;
+	struct nandfs_dir_entry *dirent;
+	struct buf *bp;
+	uint64_t file_size, blocknr;
+	uint32_t blocksize, off;
+	uint8_t *pos;
+	int error;
+
+	dir_node = VTON(dvp);
+	file_size = dir_node->nn_inode.i_size;
+	if (!file_size)
+		return (0);
+
+	DPRINTF(LOOKUP,
+	    ("chg direntry dvp %p ino %#jx  to in %#jx at off %#jx\n",
+	    dvp, (uintmax_t)tnode->nn_ino, (uintmax_t)fnode->nn_ino,
+	    (uintmax_t)tnode->nn_diroff));
+
+	/* nn_diroff is a byte offset within the directory file. */
+	blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	blocknr = tnode->nn_diroff / blocksize;
+	off = tnode->nn_diroff % blocksize;
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	pos = bp->b_data;
+	dirent = (struct nandfs_dir_entry *) (pos + off);
+	/* The recorded offset must still name tnode's entry. */
+	KASSERT((dirent->inode == tnode->nn_ino),
+	    ("direntry mismatch"));
+
+	dirent->inode = fnode->nn_ino;
+	error = nandfs_dirty_buf(bp, 0);
+	if (error)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Create the initial ".." and "." entries of a freshly-made directory.
+ * 'ino' is the directory's own inode number, 'parent_ino' its parent's.
+ * ".." is created first so it occupies the first slot (relied upon by
+ * nandfs_update_parent_dir()).  Returns 0 on success, -1 on failure.
+ */
+int
+nandfs_init_dir(struct vnode *dvp, uint64_t ino, uint64_t parent_ino)
+{
+
+	if (nandfs_add_dirent(dvp, parent_ino, "..", 2, DT_DIR) ||
+	    nandfs_add_dirent(dvp, ino, ".", 1, DT_DIR)) {
+		/*
+		 * Fix: %jd with a bare uint64_t argument is a varargs type
+		 * mismatch (undefined behavior); use %ju with explicit
+		 * uintmax_t casts.
+		 */
+		nandfs_error("%s: cannot initialize dir ino:%ju(pino:%ju)\n",
+		    __func__, (uintmax_t)ino, (uintmax_t)parent_ino);
+		return (-1);
+	}
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_fs.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_fs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,565 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Original definitions written by Koji Sato <koji at osrg.net>
+ *                    and Ryusuke Konishi <ryusuke at osrg.net>
+ * From: NetBSD: nandfs_fs.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_fs.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _NANDFS_FS_H
+#define _NANDFS_FS_H
+
+#include <sys/uuid.h>
+
+#define	MNINDIR(fsdev)	((fsdev)->nd_blocksize / sizeof(nandfs_daddr_t))
+
+/*
+ * Inode structure. There are a few dedicated inode numbers that are
+ * defined here first.
+ */
+#define	NANDFS_WHT_INO		1	/* Whiteout ino			*/
+#define	NANDFS_ROOT_INO		2	/* Root file inode		*/
+#define	NANDFS_DAT_INO		3	/* DAT file			*/
+#define	NANDFS_CPFILE_INO	4	/* checkpoint file		*/
+#define	NANDFS_SUFILE_INO	5	/* segment usage file		*/
+#define	NANDFS_IFILE_INO	6	/* ifile			*/
+#define	NANDFS_GC_INO		7	/* Cleanerd node		*/
+#define	NANDFS_ATIME_INO	8	/* Atime file (reserved)	*/
+#define	NANDFS_XATTR_INO	9	/* Xattribute file (reserved)	*/
+#define	NANDFS_SKETCH_INO	10	/* Sketch file (obsolete)	*/
+#define	NANDFS_USER_INO		11	/* First user's file inode number */
+
+#define	NANDFS_SYS_NODE(ino) \
+	(((ino) >= NANDFS_DAT_INO) && ((ino) <= NANDFS_GC_INO))
+
+#define	NDADDR		12		/* Direct addresses in inode. */
+#define	NIADDR		3		/* Indirect addresses in inode. */
+
+typedef	int64_t		nandfs_daddr_t;
+typedef	int64_t		nandfs_lbn_t;
+
+struct nandfs_inode {
+	uint64_t	i_blocks;	/* 0: size in device blocks		*/
+	uint64_t	i_size;		/* 8: size in bytes			*/
+	uint64_t	i_ctime;	/* 16: creation time in seconds		*/
+	uint64_t	i_mtime;	/* 24: modification time in seconds part*/
+	uint32_t	i_ctime_nsec;	/* 32: creation time nanoseconds part	*/
+	uint32_t	i_mtime_nsec;	/* 36: modification time in nanoseconds	*/
+	uint32_t	i_uid;		/* 40: user id				*/
+	uint32_t	i_gid;		/* 44: group id				*/
+	uint16_t	i_mode;		/* 48: file mode			*/
+	uint16_t	i_links_count;	/* 50: number of references to the inode*/
+	uint32_t	i_flags;	/* 52: NANDFS_*_FL flags		*/
+	nandfs_daddr_t	i_special;	/* 56: special				*/
+	nandfs_daddr_t	i_db[NDADDR];	/* 64: Direct disk blocks.		*/
+	nandfs_daddr_t	i_ib[NIADDR];	/* 160: Indirect disk blocks.		*/
+	uint64_t	i_xattr;	/* 184: reserved for extended attributes*/
+	uint32_t	i_generation;	/* 192: file generation for NFS		*/
+	uint32_t	i_pad[15];	/* 196: make it 64 bits aligned		*/
+};
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_inode) == 256);
+#endif
+
+/*
+ * Each checkpoint/snapshot has a super root.
+ *
+ * The super root holds the inodes of the three system files: `dat', `cp' and
+ * 'su' files. All other FS state is defined by those.
+ *
+ * It is CRC checksum'ed and time stamped.
+ */
+
+struct nandfs_super_root {
+	uint32_t	sr_sum;		/* check-sum				*/
+	uint16_t	sr_bytes;	/* byte count of this structure		*/
+	uint16_t	sr_flags;	/* reserved for flags			*/
+	uint64_t	sr_nongc_ctime;	/* timestamp, not for cleaner(?)	*/
+	struct nandfs_inode sr_dat;	/* DAT, virt->phys translation inode	*/
+	struct nandfs_inode sr_cpfile;	/* CP, checkpoints inode		*/
+	struct nandfs_inode sr_sufile;	/* SU, segment usage inode		*/
+};
+
+/*
+ * Byte offset of the i'th system-file inode (DAT, cpfile, sufile)
+ * inside a super root, given the on-media inode size.  Use offsetof()
+ * (already relied upon by NANDFS_FSDATA_CRC_BYTES below) instead of the
+ * null-pointer-dereference idiom: the old form cast a pointer to
+ * uint32_t, which is undefined behavior and truncation-prone on LP64.
+ */
+#define	NANDFS_SR_MDT_OFFSET(inode_size, i)			\
+	(offsetof(struct nandfs_super_root, sr_dat) +		\
+	(inode_size) * (i))
+
+#define	NANDFS_SR_DAT_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 0)
+#define	NANDFS_SR_CPFILE_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 1)
+#define	NANDFS_SR_SUFILE_OFFSET(inode_size)	NANDFS_SR_MDT_OFFSET(inode_size, 2)
+#define	NANDFS_SR_BYTES			(sizeof(struct nandfs_super_root))
+
+/*
+ * The superblock describes the basic structure and mount history. It also
+ * records some sizes of structures found on the disc for sanity checks.
+ *
+ * The superblock is stored at two places: NANDFS_SB_OFFSET_BYTES and
+ * NANDFS_SB2_OFFSET_BYTES.
+ */
+
+/* File system states stored on media in superblock's sbp->s_state */
+#define	NANDFS_VALID_FS		0x0001	/* cleanly unmounted and all is ok  */
+#define	NANDFS_ERROR_FS		0x0002	/* there were errors detected, fsck */
+#define	NANDFS_RESIZE_FS	0x0004	/* resize required, XXX unknown flag*/
+#define	NANDFS_MOUNT_STATE_BITS	"\20\1VALID_FS\2ERROR_FS\3RESIZE_FS"
+
+/*
+ * Brief description of control structures:
+ *
+ * NANDFS_NFSAREAS first blocks contain fsdata and some amount of super blocks.
+ * Simple round-robin policy is used in order to choose which block will
+ * contain new super block.
+ *
+ * Simple case with 2 blocks:
+ * 1: fsdata sblock1 [sblock3 [sblock5 ..]]
+ * 2: fsdata sblock2 [sblock4 [sblock6 ..]]
+ */
+struct nandfs_fsdata {
+	uint16_t	f_magic;
+	uint16_t	f_bytes;
+
+	uint32_t	f_sum;		/* checksum of fsdata		*/
+	uint32_t	f_rev_level;	/* major disk format revision	*/
+
+	uint64_t	f_ctime;	/* creation time (execution time
+					   of newfs)			*/
+	/* Block size represented as: blocksize = 1 << (f_log_block_size + 10)	*/
+	uint32_t	f_log_block_size;
+
+	uint16_t	f_inode_size;		/* size of an inode		*/
+	uint16_t	f_dat_entry_size;	/* size of a dat entry		*/
+	uint16_t	f_checkpoint_size;	/* size of a checkpoint		*/
+	uint16_t	f_segment_usage_size;	/* size of a segment usage	*/
+
+	uint16_t	f_sbbytes;		/* byte count of CRC calculation
+						   for super blocks. s_reserved
+						   is excluded!			*/
+
+	uint16_t	f_errors;		/* behaviour on detecting errors	*/
+
+	uint32_t	f_erasesize;
+	uint64_t	f_nsegments;		/* number of segm. in filesystem	*/
+	nandfs_daddr_t	f_first_data_block;	/* 1st seg disk block number		*/
+	uint32_t	f_blocks_per_segment;	/* number of blocks per segment		*/
+	uint32_t	f_r_segments_percentage;	/* reserved segments percentage		*/
+
+	struct uuid	f_uuid;			/* 128-bit uuid for volume		*/
+	char		f_volume_name[16];	/* volume name				*/
+	uint32_t	f_pad[104];
+} __packed;
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_fsdata) == 512);
+#endif
+
+struct nandfs_super_block {
+	uint16_t	s_magic;		/* magic value for identification */
+
+	uint32_t	s_sum;			/* check sum of super block       */
+
+	uint64_t	s_last_cno;		/* last checkpoint number         */
+	uint64_t	s_last_pseg;		/* addr part. segm. written last  */
+	uint64_t	s_last_seq;		/* seq.number of seg written last */
+	uint64_t	s_free_blocks_count;	/* free blocks count              */
+
+	uint64_t	s_mtime;		/* mount time                     */
+	uint64_t	s_wtime;		/* write time                     */
+	uint16_t	s_state;		/* file system state              */
+
+	char		s_last_mounted[64];	/* directory where last mounted   */
+
+	uint32_t	s_c_interval;		/* commit interval of segment     */
+	uint32_t	s_c_block_max;		/* threshold of data amount for
+						   the segment construction */
+	uint32_t	s_reserved[32];		/* padding to end of the block    */
+} __packed;
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_super_block) == 256);
+#endif
+
+#define	NANDFS_FSDATA_MAGIC	0xf8da
+#define	NANDFS_SUPER_MAGIC	0x8008
+
+#define	NANDFS_NFSAREAS		4
+#define	NANDFS_DATA_OFFSET_BYTES(esize)	(NANDFS_NFSAREAS * (esize))
+
+#define	NANDFS_SBLOCK_OFFSET_BYTES (sizeof(struct nandfs_fsdata))
+
+#define	NANDFS_DEF_BLOCKSIZE	4096
+#define	NANDFS_MIN_BLOCKSIZE	512
+
+#define	NANDFS_DEF_ERASESIZE	(2 << 16)
+
+#define	NANDFS_MIN_SEGSIZE	NANDFS_DEF_ERASESIZE
+
+#define	NANDFS_CURRENT_REV	9	/* current major revision */
+
+#define	NANDFS_FSDATA_CRC_BYTES offsetof(struct nandfs_fsdata, f_pad)
+/* Bytes count of super_block for CRC-calculation */
+#define	NANDFS_SB_BYTES  offsetof(struct nandfs_super_block, s_reserved)
+
+/* Maximal count of links to a file */
+#define	NANDFS_LINK_MAX		32000
+
+/*
+ * Structure of a directory entry.
+ *
+ * Note that they can't span blocks; the rec_len fills out.
+ */
+
+#define	NANDFS_NAME_LEN 255
+struct nandfs_dir_entry {
+	uint64_t	inode;			/* inode number */
+	uint16_t	rec_len;		/* directory entry length */
+	uint8_t		name_len;		/* name length */
+	uint8_t		file_type;
+	char		name[NANDFS_NAME_LEN];	/* file name */
+	char		pad;
+};
+
+/*
+ * NANDFS_DIR_PAD defines the directory entries boundaries
+ *
+ * NOTE: It must be a multiple of 8
+ */
+#define	NANDFS_DIR_PAD			8
+#define	NANDFS_DIR_ROUND		(NANDFS_DIR_PAD - 1)
+#define	NANDFS_DIR_NAME_OFFSET		(offsetof(struct nandfs_dir_entry, name))
+#define	NANDFS_DIR_REC_LEN(name_len)					\
+	(((name_len) + NANDFS_DIR_NAME_OFFSET + NANDFS_DIR_ROUND)	\
+	& ~NANDFS_DIR_ROUND)
+#define	NANDFS_DIR_NAME_LEN(name_len)	\
+	(NANDFS_DIR_REC_LEN(name_len) - NANDFS_DIR_NAME_OFFSET)
+
+/*
+ * NiLFS/NANDFS divides the disk into fixed-length segments. Each segment is
+ * filled with one or more partial segments of variable lengths.
+ *
+ * Each partial segment has a segment summary header followed by updates of
+ * files and optionally a super root.
+ */
+
+/*
+ * Virtual to physical block translation information. For data blocks it maps
+ * logical block number bi_blkoff to virtual block nr bi_vblocknr. For non
+ * datablocks it is the virtual block number assigned to an indirect block
+ * and has no bi_blkoff. The physical block number is the next
+ * available data block in the partial segment after all the binfo's.
+ */
+struct nandfs_binfo_v {
+	uint64_t	bi_ino;		/* file's inode			     */
+	uint64_t	bi_vblocknr;	/* assigned virtual block number     */
+	uint64_t	bi_blkoff;	/* for file's logical block number   */
+};
+
+/*
+ * DAT allocation. For data blocks just the logical block number that maps on
+ * the next available data block in the partial segment after the binfo's.
+ */
+struct nandfs_binfo_dat {
+	uint64_t	bi_ino;
+	uint64_t	bi_blkoff;	/* DAT file's logical block number */
+	uint8_t		bi_level;	/* whether this is meta block */
+	uint8_t		bi_pad[7];
+};
+
+#ifdef _KERNEL
+CTASSERT(sizeof(struct nandfs_binfo_v) == sizeof(struct nandfs_binfo_dat));
+#endif
+
+/* Convenience union for both types of binfo's */
+union nandfs_binfo {
+	struct nandfs_binfo_v bi_v;
+	struct nandfs_binfo_dat bi_dat;
+};
+
+/* Indirect buffers path */
+struct nandfs_indir {
+	nandfs_daddr_t	in_lbn;
+	int		in_off;
+};
+
+/* The (partial) segment summary */
+struct nandfs_segment_summary {
+	uint32_t	ss_datasum;	/* CRC of complete data block        */
+	uint32_t	ss_sumsum;	/* CRC of segment summary only       */
+	uint32_t	ss_magic;	/* magic to identify segment summary */
+	uint16_t	ss_bytes;	/* size of segment summary structure */
+	uint16_t	ss_flags;	/* NANDFS_SS_* flags                  */
+	uint64_t	ss_seq;		/* sequence number of this segm. sum */
+	uint64_t	ss_create;	/* creation timestamp in seconds     */
+	uint64_t	ss_next;	/* blocknumber of next segment       */
+	uint32_t	ss_nblocks;	/* number of blocks used by summary  */
+	uint32_t	ss_nbinfos;	/* number of binfo structures	     */
+	uint32_t	ss_sumbytes;	/* total size of segment summary     */
+	uint32_t	ss_pad;
+	/* stream of binfo structures */
+};
+
+#define	NANDFS_SEGSUM_MAGIC	0x8e680011	/* segment summary magic number */
+
+/* Segment summary flags */
+#define	NANDFS_SS_LOGBGN	0x0001	/* begins a logical segment */
+#define	NANDFS_SS_LOGEND	0x0002	/* ends a logical segment */
+#define	NANDFS_SS_SR		0x0004	/* has super root */
+#define	NANDFS_SS_SYNDT		0x0008	/* includes data only updates */
+#define	NANDFS_SS_GC		0x0010	/* segment written for cleaner operation */
+#define	NANDFS_SS_FLAG_BITS	"\20\1LOGBGN\2LOGEND\3SR\4SYNDT\5GC"
+
+/* Segment summary constraints */
+#define	NANDFS_SEG_MIN_BLOCKS	16	/* minimum number of blocks in a
+					   full segment */
+#define	NANDFS_PSEG_MIN_BLOCKS	2	/* minimum number of blocks in a
+					   partial segment */
+#define	NANDFS_MIN_NRSVSEGS	8	/* minimum number of reserved
+					   segments */
+
+/*
+ * Structure of DAT/inode file.
+ *
+ * A DAT file is divided into groups. The maximum number of groups is the
+ * number of block group descriptors that fit into one block; this descriptor
+ * only gives the number of free entries in the associated group.
+ *
+ * Each group has a block sized bitmap indicating if an entry is taken or
+ * empty. Each bit stands for a DAT entry.
+ *
+ * The inode file has exactly the same format only the entries are inode
+ * entries.
+ */
+
+struct nandfs_block_group_desc {
+	uint32_t	bg_nfrees;	/* num. free entries in block group  */
+};
+
+/* DAT entry in a super root's DAT file */
+struct nandfs_dat_entry {
+	uint64_t	de_blocknr;	/* block number                      */
+	uint64_t	de_start;	/* valid from checkpoint             */
+	uint64_t	de_end;		/* valid till checkpoint             */
+	uint64_t	de_rsv;		/* reserved for future use           */
+};
+
+/*
+ * Structure of CP file.
+ *
+ * A snapshot is just a checkpoint only it's protected against removal by the
+ * cleaner. The snapshots are kept on a double linked list of checkpoints.
+ */
+struct nandfs_snapshot_list {
+	uint64_t	ssl_next;	/* checkpoint nr. forward */
+	uint64_t	ssl_prev;	/* checkpoint nr. back    */
+};
+
+/* Checkpoint entry structure */
+struct nandfs_checkpoint {
+	uint32_t	cp_flags;		/* NANDFS_CHECKPOINT_* flags          */
+	uint32_t	cp_checkpoints_count;	/* ZERO, not used anymore?           */
+	struct nandfs_snapshot_list cp_snapshot_list; /* list of snapshots   */
+	uint64_t	cp_cno;			/* checkpoint number                 */
+	uint64_t	cp_create;		/* creation timestamp                */
+	uint64_t	cp_nblk_inc;		/* number of blocks incremented      */
+	uint64_t	cp_blocks_count;	/* reserved (might be deleted)       */
+	struct nandfs_inode cp_ifile_inode;	/* inode file inode          */
+};
+
+/* Checkpoint flags */
+#define	NANDFS_CHECKPOINT_SNAPSHOT	1
+#define	NANDFS_CHECKPOINT_INVALID	2
+#define	NANDFS_CHECKPOINT_SKETCH	4
+#define	NANDFS_CHECKPOINT_MINOR		8
+#define	NANDFS_CHECKPOINT_BITS		"\20\1SNAPSHOT\2INVALID\3SKETCH\4MINOR"
+
+/* Header of the checkpoint file */
+struct nandfs_cpfile_header {
+	uint64_t	ch_ncheckpoints;	/* number of checkpoints             */
+	uint64_t	ch_nsnapshots;	/* number of snapshots               */
+	struct nandfs_snapshot_list ch_snapshot_list;	/* snapshot list     */
+};
+
+#define	NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET		\
+	((sizeof(struct nandfs_cpfile_header) +		\
+	sizeof(struct nandfs_checkpoint) - 1) /		\
+	sizeof(struct nandfs_checkpoint))
+
+
+#define NANDFS_NOSEGMENT        0xffffffff
+
+/*
+ * Structure of SU file.
+ *
+ * The segment usage file sums up how each of the segments are used. They are
+ * indexed by their segment number.
+ */
+
+/* Segment usage entry */
+struct nandfs_segment_usage {
+	uint64_t	su_lastmod;	/* last modified timestamp           */
+	uint32_t	su_nblocks;	/* number of blocks in segment       */
+	uint32_t	su_flags;	/* NANDFS_SEGMENT_USAGE_* flags       */
+};
+
+/* Segment usage flag */
+#define	NANDFS_SEGMENT_USAGE_ACTIVE	1
+#define	NANDFS_SEGMENT_USAGE_DIRTY	2
+#define	NANDFS_SEGMENT_USAGE_ERROR	4
+#define	NANDFS_SEGMENT_USAGE_GC		8
+#define	NANDFS_SEGMENT_USAGE_BITS	"\20\1ACTIVE\2DIRTY\3ERROR"
+
+/* Header of the segment usage file */
+struct nandfs_sufile_header {
+	uint64_t	sh_ncleansegs;	/* number of segments marked clean   */
+	uint64_t	sh_ndirtysegs;	/* number of segments marked dirty   */
+	uint64_t	sh_last_alloc;	/* last allocated segment number     */
+};
+
+#define	NANDFS_SUFILE_FIRST_SEGMENT_USAGE_OFFSET	\
+	((sizeof(struct nandfs_sufile_header) +		\
+	sizeof(struct nandfs_segment_usage) - 1) /	\
+	sizeof(struct nandfs_segment_usage))
+
+struct nandfs_seg_stat {
+	uint64_t	nss_nsegs;
+	uint64_t	nss_ncleansegs;
+	uint64_t	nss_ndirtysegs;
+	uint64_t	nss_ctime;
+	uint64_t	nss_nongc_ctime;
+	uint64_t	nss_prot_seq;
+};
+
+enum {
+	NANDFS_CHECKPOINT,
+	NANDFS_SNAPSHOT
+};
+
+#define	NANDFS_CPINFO_MAX		512
+
+struct nandfs_cpinfo {
+	uint32_t	nci_flags;
+	uint32_t	nci_pad;
+	uint64_t	nci_cno;
+	uint64_t	nci_create;
+	uint64_t	nci_nblk_inc;
+	uint64_t	nci_blocks_count;
+	uint64_t	nci_next;
+};
+
+#define	NANDFS_SEGMENTS_MAX	512
+
+struct nandfs_suinfo {
+	uint64_t	nsi_num;
+	uint64_t	nsi_lastmod;
+	uint32_t	nsi_blocks;
+	uint32_t	nsi_flags;
+};
+
+#define	NANDFS_VINFO_MAX	512
+
+struct nandfs_vinfo {
+	uint64_t	nvi_ino;
+	uint64_t	nvi_vblocknr;
+	uint64_t	nvi_start;
+	uint64_t	nvi_end;
+	uint64_t	nvi_blocknr;
+	int		nvi_alive;
+};
+
+struct nandfs_cpmode {
+	uint64_t	ncpm_cno;
+	uint32_t	ncpm_mode;
+	uint32_t	ncpm_pad;
+};
+
+struct nandfs_argv {
+	uint64_t	nv_base;
+	uint32_t	nv_nmembs;
+	uint16_t	nv_size;
+	uint16_t	nv_flags;
+	uint64_t	nv_index;
+};
+
+struct nandfs_cpstat {
+	uint64_t	ncp_cno;
+	uint64_t	ncp_ncps;
+	uint64_t	ncp_nss;
+};
+
+struct nandfs_period {
+	uint64_t	p_start;
+	uint64_t	p_end;
+};
+
+struct nandfs_vdesc {
+	uint64_t	vd_ino;
+	uint64_t	vd_cno;
+	uint64_t	vd_vblocknr;
+	struct nandfs_period	vd_period;
+	uint64_t	vd_blocknr;
+	uint64_t	vd_offset;
+	uint32_t	vd_flags;
+	uint32_t	vd_pad;
+};
+
+struct nandfs_bdesc {
+	uint64_t	bd_ino;
+	uint64_t	bd_oblocknr;
+	uint64_t	bd_blocknr;
+	uint64_t	bd_offset;
+	uint32_t	bd_level;
+	uint32_t	bd_alive;
+};
+
+#ifndef _KERNEL
+#ifndef	MNAMELEN
+#define	MNAMELEN	88
+#endif
+#endif
+
+struct nandfs_fsinfo {
+	struct nandfs_fsdata		fs_fsdata;
+	struct nandfs_super_block	fs_super;
+	char				fs_dev[MNAMELEN];
+};
+
+#define	NANDFS_MAX_MOUNTS	65535
+
+#define	NANDFS_IOCTL_GET_SUSTAT		_IOR('N', 100, struct nandfs_seg_stat)
+#define	NANDFS_IOCTL_CHANGE_CPMODE	_IOWR('N', 101, struct nandfs_cpmode)
+#define	NANDFS_IOCTL_GET_CPINFO		_IOWR('N', 102, struct nandfs_argv)
+#define	NANDFS_IOCTL_DELETE_CP		_IOWR('N', 103, uint64_t[2])
+#define	NANDFS_IOCTL_GET_CPSTAT		_IOR('N', 104, struct nandfs_cpstat)
+#define	NANDFS_IOCTL_GET_SUINFO		_IOWR('N', 105, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_VINFO		_IOWR('N', 106, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_BDESCS		_IOWR('N', 107, struct nandfs_argv)
+#define	NANDFS_IOCTL_GET_FSINFO		_IOR('N', 108, struct nandfs_fsinfo)
+#define	NANDFS_IOCTL_MAKE_SNAP		_IOWR('N', 109, uint64_t)
+#define	NANDFS_IOCTL_DELETE_SNAP	_IOWR('N', 110, uint64_t)
+#define	NANDFS_IOCTL_SYNC		_IOWR('N', 111, uint64_t)
+
+#endif /* _NANDFS_FS_H */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_ifile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_ifile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,213 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_ifile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+/*
+ * Allocate a new inode in the ifile and instantiate a nandfs_node for
+ * it.  'mode' seeds the new on-media inode via nandfs_inode_init().
+ * nd_last_ino + 1 is used as the allocation hint, and nd_last_ino is
+ * advanced to the entry actually allocated.  The ifile vnode is held
+ * exclusively locked across the allocation.  On success *node points
+ * at the new in-core node.  Returns 0 or an error from the helpers.
+ */
+int
+nandfs_node_create(struct nandfsmount *nmp, struct nandfs_node **node,
+    uint16_t mode)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_device *nandfsdev;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct nandfs_inode *inode;
+	struct vnode *vp;
+	uint32_t entry;
+	int error = 0;
+
+	nandfsdev = nmp->nm_nandfsdev;
+	mdt = &nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+	/* Allocate new inode in ifile */
+	req.entrynum = nandfsdev->nd_last_ino + 1;
+	error = nandfs_find_free_entry(mdt, ifile, &req);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	error = nandfs_get_entry_block(mdt, ifile, &req, &entry, 1);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	/* Inode initialization */
+	inode = ((struct nandfs_inode *) req.bp_entry->b_data) + entry;
+	nandfs_inode_init(inode, mode);
+
+	/* Presumably finalizes the reservation made by
+	 * nandfs_find_free_entry() and disposes of req.bp_entry --
+	 * NOTE(review): confirm ownership in nandfs_alloc_entry(). */
+	error = nandfs_alloc_entry(mdt, &req);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(vp, 0);
+
+	nandfsdev->nd_last_ino = req.entrynum;
+	error = nandfs_get_node(nmp, req.entrynum, node);
+	/* NOTE(review): if nandfs_get_node() failed, this DPRINTF still
+	 * dereferences *node (debug builds only) -- confirm safety. */
+	DPRINTF(IFILE, ("%s: node: %p ino: %#jx\n",
+	    __func__, node, (uintmax_t)((*node)->nn_ino)));
+
+	return (error);
+}
+
+/*
+ * Destroy the ifile bookkeeping of 'node': locate its allocation entry
+ * in the ifile, wipe the on-media inode, and free the entry so the
+ * inode number can be reused.  The ifile vnode is held exclusively
+ * locked for the duration.  Returns 0 or an error from the helpers.
+ */
+int
+nandfs_node_destroy(struct nandfs_node *node)
+{
+	struct nandfs_alloc_request req;
+	struct nandfsmount *nmp;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct vnode *vp;
+	int error = 0;
+
+	nmp = node->nn_nmp;
+	req.entrynum = node->nn_ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	DPRINTF(IFILE, ("%s: destroy node: %p ino: %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino));
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+
+	error = nandfs_find_entry(mdt, ifile, &req);
+	if (error) {
+		/* %jx expects uintmax_t: cast nn_ino explicitly. */
+		nandfs_error("%s: finding entry error:%d node %p(%jx)",
+		    __func__, error, node, (uintmax_t)node->nn_ino);
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	nandfs_inode_destroy(&node->nn_inode);
+
+	error = nandfs_free_entry(mdt, &req);
+	if (error) {
+		/* Typo fix: "freing" -> "freeing". */
+		nandfs_error("%s: freeing entry error:%d node %p(%jx)",
+		    __func__, error, node, (uintmax_t)node->nn_ino);
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(vp, 0);
+	DPRINTF(IFILE, ("%s: freed node %p ino %#jx\n",
+	    __func__, node, (uintmax_t)node->nn_ino));
+	return (error);
+}
+
+/*
+ * Flush the in-core inode of 'node' back into its entry slot in the
+ * ifile and mark the entry block dirty.  The ifile vnode must already
+ * be locked by the caller (asserted below).  Returns 0 or an error
+ * from the ifile metadata helpers.
+ */
+int
+nandfs_node_update(struct nandfs_node *node)
+{
+	struct nandfs_alloc_request req;
+	struct nandfsmount *nmp;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct nandfs_inode *inode;
+	uint32_t index;
+	int error = 0;
+
+	nmp = node->nn_nmp;
+	ifile = nmp->nm_ifile_node;
+	ASSERT_VOP_LOCKED(NTOV(ifile), __func__);
+
+	req.entrynum = node->nn_ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+
+	DPRINTF(IFILE, ("%s: node:%p ino:%#jx\n",
+	    __func__, &node->nn_inode, (uintmax_t)node->nn_ino));
+
+	error = nandfs_get_entry_block(mdt, ifile, &req, &index, 0);
+	if (error) {
+		/* NOTE(review): plain printf here, unlike the
+		 * nandfs_error() used by sibling functions. */
+		printf("nandfs_get_entry_block returned with ERROR=%d\n",
+		    error);
+		return (error);
+	}
+
+	inode = ((struct nandfs_inode *) req.bp_entry->b_data) + index;
+	memcpy(inode, &node->nn_inode, sizeof(*inode));
+	/* NOTE(review): no brelse on this path; presumably
+	 * nandfs_dirty_buf() consumes req.bp_entry -- confirm. */
+	error = nandfs_dirty_buf(req.bp_entry, 0);
+
+	return (error);
+}
+
+/*
+ * Fetch the ifile entry block that holds inode 'ino'.  On success
+ * *inode points at the nandfs_inode inside the buffer's data, and *bp
+ * is returned held: the caller is responsible for releasing *bp, and
+ * *inode is only valid until it does.  Returns 0 or an error from the
+ * ifile metadata helpers.
+ */
+int
+nandfs_get_node_entry(struct nandfsmount *nmp, struct nandfs_inode **inode,
+    uint64_t ino, struct buf **bp)
+{
+	struct nandfs_alloc_request req;
+	struct nandfs_mdt *mdt;
+	struct nandfs_node *ifile;
+	struct vnode *vp;
+	uint32_t index;
+	int error = 0;
+
+	req.entrynum = ino;
+	mdt = &nmp->nm_nandfsdev->nd_ifile_mdt;
+	ifile = nmp->nm_ifile_node;
+	vp = NTOV(ifile);
+
+	VOP_LOCK(vp, LK_EXCLUSIVE);
+	error = nandfs_get_entry_block(mdt, ifile, &req, &index, 0);
+	if (error) {
+		VOP_UNLOCK(vp, 0);
+		return (error);
+	}
+
+	*inode = ((struct nandfs_inode *) req.bp_entry->b_data) + index;
+	*bp = req.bp_entry;
+	VOP_UNLOCK(vp, 0);
+	return (0);
+}
+
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_mount.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_mount.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *          This product includes software developed for the
+ *          NetBSD Project.  See http://www.NetBSD.org/ for
+ *          information about NetBSD.
+ * 4. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_mount.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_mount.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_MOUNT_H_
+#define _FS_NANDFS_NANDFS_MOUNT_H_
+
+/*
+ * Arguments to mount a NANDFS filesystem.
+ */
+
+struct nandfs_args {
+	char		*fspec;		/* mount specifier                   */
+	int64_t		cpno;		/* checkpoint number                 */
+};
+
+#endif /* !_FS_NANDFS_NANDFS_MOUNT_H_ */
+
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_segment.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_segment.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1329 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_segment.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/libkern.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+static int
+nandfs_new_segment(struct nandfs_device *fsdev)
+{
+	int error = 0;
+	uint64_t new;
+
+	error = nandfs_alloc_segment(fsdev, &new);
+	if (!error) {
+		fsdev->nd_seg_num = fsdev->nd_next_seg_num;
+		fsdev->nd_next_seg_num = new;
+	}
+	DPRINTF(SYNC, ("%s: new segment %jx next %jx error %d\n",
+	    __func__, (uintmax_t)fsdev->nd_seg_num, (uintmax_t)new, error));
+	if (error)
+		nandfs_error("%s: cannot create segment error %d\n",
+		    __func__, error);
+
+	return (error);
+}
+
+static int
+create_segment(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *seg;
+	struct nandfs_device *fsdev;
+	struct nandfs_segment *prev;
+	struct buf *bp;
+	uint64_t start_block, curr;
+	uint32_t blks_per_seg, nblocks;
+	int error;
+
+	fsdev = seginfo->fsdev;
+	prev = seginfo->curseg;
+	blks_per_seg = fsdev->nd_fsdata.f_blocks_per_segment;
+	nblocks = fsdev->nd_last_segsum.ss_nblocks;
+
+	if (!prev) {
+		vfs_timestamp(&fsdev->nd_ts);
+		/* Touch current segment */
+		error = nandfs_touch_segment(fsdev, fsdev->nd_seg_num);
+		if (error) {
+			nandfs_error("%s: cannot preallocate segment %jx\n",
+			    __func__, fsdev->nd_seg_num);
+			return (error);
+		}
+		error = nandfs_touch_segment(fsdev, 0);
+		if (error) {
+			nandfs_error("%s: cannot dirty block with segment 0\n",
+			    __func__);
+			return (error);
+		}
+		start_block = fsdev->nd_last_pseg + (uint64_t)nblocks;
+		/*
+		 * XXX Hack
+		 */
+		if (blks_per_seg - (start_block % blks_per_seg) - 1 == 0)
+			start_block++;
+		curr = nandfs_get_segnum_of_block(fsdev, start_block);
+		/* Allocate new segment if last one is full */
+		if (fsdev->nd_seg_num != curr) {
+			error = nandfs_new_segment(fsdev);
+			if (error) {
+				nandfs_error("%s: cannot create new segment\n",
+				    __func__);
+				return (error);
+			}
+			/*
+			 * XXX Hack
+			 */
+			nandfs_get_segment_range(fsdev, fsdev->nd_seg_num, &start_block, NULL);
+		}
+	} else {
+		nandfs_get_segment_range(fsdev, fsdev->nd_next_seg_num,
+		    &start_block, NULL);
+
+		/* Touch current segment and allocate and touch new one */
+		error = nandfs_new_segment(fsdev);
+		if (error) {
+			nandfs_error("%s: cannot create next segment\n",
+			    __func__);
+			return (error);
+		}
+
+		/* Reiterate in case new buf is dirty */
+		seginfo->reiterate = 1;
+	}
+
+	/* Allocate and initialize nandfs_segment structure */
+	seg = malloc(sizeof(*seg), M_DEVBUF, M_WAITOK|M_ZERO);
+	TAILQ_INIT(&seg->segsum);
+	TAILQ_INIT(&seg->data);
+	seg->fsdev = fsdev;
+	seg->start_block = start_block;
+	seg->num_blocks = blks_per_seg - (start_block % blks_per_seg) - 1;
+	seg->seg_num = fsdev->nd_seg_num;
+	seg->seg_next = fsdev->nd_next_seg_num;
+	seg->segsum_blocks = 1;
+	seg->bytes_left = fsdev->nd_blocksize -
+	    sizeof(struct nandfs_segment_summary);
+	seg->segsum_bytes = sizeof(struct nandfs_segment_summary);
+
+	/* Allocate buffer for segment summary */
+	bp = getblk(fsdev->nd_devvp, nandfs_block_to_dblock(fsdev,
+	    seg->start_block), fsdev->nd_blocksize, 0, 0, 0);
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	/* Add buffer to segment */
+	TAILQ_INSERT_TAIL(&seg->segsum, bp, b_cluster.cluster_entry);
+	seg->current_off = bp->b_data + sizeof(struct nandfs_segment_summary);
+
+	DPRINTF(SYNC, ("%s: seg %p : initial settings: start %#jx size :%#x\n",
+	    __func__, seg, (uintmax_t)seg->start_block, seg->num_blocks));
+	DPRINTF(SYNC, ("%s: seg->seg_num %#jx cno %#jx next %#jx\n", __func__,
+	    (uintmax_t)seg->seg_num, (uintmax_t)(fsdev->nd_last_cno + 1),
+	    (uintmax_t)seg->seg_next));
+
+	if (!prev)
+		LIST_INSERT_HEAD(&seginfo->seg_list, seg, seg_link);
+	else
+		LIST_INSERT_AFTER(prev, seg, seg_link);
+
+	seginfo->curseg = seg;
+
+	return (0);
+}
+
+static int
+delete_segment(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *seg, *tseg;
+	struct buf *bp, *tbp;
+
+	LIST_FOREACH_SAFE(seg, &seginfo->seg_list, seg_link, tseg) {
+		TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry,
+		    tbp) {
+			TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+			bp->b_flags &= ~B_MANAGED;
+			brelse(bp);
+		};
+
+		LIST_REMOVE(seg, seg_link);
+		free(seg, M_DEVBUF);
+	}
+
+	return (0);
+}
+
+static int
+create_seginfo(struct nandfs_device *fsdev, struct nandfs_seginfo **seginfo)
+{
+	struct nandfs_seginfo *info;
+
+	info = malloc(sizeof(*info), M_DEVBUF, M_WAITOK);
+
+	LIST_INIT(&info->seg_list);
+	info->fsdev = fsdev;
+	info->curseg = NULL;
+	info->blocks = 0;
+	*seginfo = info;
+	fsdev->nd_seginfo = info;
+	return (0);
+}
+
+static int
+delete_seginfo(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_device *nffsdev;
+
+	nffsdev = seginfo->fsdev;
+	delete_segment(seginfo);
+	nffsdev->nd_seginfo = NULL;
+	free(seginfo, M_DEVBUF);
+
+	return (0);
+}
+
+static int
+nandfs_create_superroot_block(struct nandfs_seginfo *seginfo,
+    struct buf **newbp)
+{
+	struct buf *bp;
+	int error;
+
+	bp = nandfs_geteblk(seginfo->fsdev->nd_blocksize, GB_NOWAIT_BD);
+
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+		error = create_segment(seginfo);
+		if (error) {
+			brelse(bp);
+			nandfs_error("%s: no segment for superroot\n",
+			    __func__);
+			return (error);
+		}
+	}
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->data, bp, b_cluster.cluster_entry);
+
+	seginfo->curseg->nblocks++;
+	seginfo->curseg->num_blocks--;
+	seginfo->blocks++;
+
+	*newbp = bp;
+	return (0);
+}
+
+static int
+nandfs_add_superroot(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_super_root *sr;
+	struct buf *bp = NULL;
+	uint64_t crc_skip;
+	uint32_t crc_calc;
+	int error;
+
+	fsdev = seginfo->fsdev;
+
+	error = nandfs_create_superroot_block(seginfo, &bp);
+	if (error) {
+		nandfs_error("%s: cannot add superroot\n", __func__);
+		return (error);
+	}
+
+	sr = (struct nandfs_super_root *)bp->b_data;
+	/* Save superroot CRC */
+	sr->sr_bytes = NANDFS_SR_BYTES;
+	sr->sr_flags = 0;
+	sr->sr_nongc_ctime = 0;
+
+	memcpy(&sr->sr_dat, &fsdev->nd_dat_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+	memcpy(&sr->sr_cpfile, &fsdev->nd_cp_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+	memcpy(&sr->sr_sufile, &fsdev->nd_su_node->nn_inode,
+	    sizeof(struct nandfs_inode));
+
+	crc_skip = sizeof(sr->sr_sum);
+	crc_calc = crc32((uint8_t *)sr + crc_skip, NANDFS_SR_BYTES - crc_skip);
+
+	sr->sr_sum = crc_calc;
+
+	bp->b_flags |= B_MANAGED;
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+
+	bp->b_flags &= ~B_INVAL;
+	nandfs_dirty_bufs_increment(fsdev);
+	DPRINTF(SYNC, ("%s: bp:%p\n", __func__, bp));
+
+	return (0);
+}
+
+static int
+nandfs_add_segsum_block(struct nandfs_seginfo *seginfo, struct buf **newbp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t blk;
+	struct buf *bp;
+	int error;
+
+	if (!(seginfo->curseg) || seginfo->curseg->num_blocks <= 1) {
+		error = create_segment(seginfo);
+		if (error) {
+			nandfs_error("%s: error:%d when creating segment\n",
+			    __func__, error);
+			return (error);
+		}
+		*newbp = TAILQ_FIRST(&seginfo->curseg->segsum);
+		return (0);
+	}
+
+	fsdev = seginfo->fsdev;
+	blk = nandfs_block_to_dblock(fsdev, seginfo->curseg->start_block +
+	    seginfo->curseg->segsum_blocks);
+
+	bp = getblk(fsdev->nd_devvp, blk, fsdev->nd_blocksize, 0, 0, 0);
+
+	bzero(bp->b_data, seginfo->fsdev->nd_blocksize);
+	bp->b_bufobj = &seginfo->fsdev->nd_devvp->v_bufobj;
+	bp->b_flags |= B_MANAGED;
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->segsum, bp,
+	    b_cluster.cluster_entry);
+	seginfo->curseg->num_blocks--;
+
+	seginfo->curseg->segsum_blocks++;
+	seginfo->curseg->bytes_left = seginfo->fsdev->nd_blocksize;
+	seginfo->curseg->current_off = bp->b_data;
+	seginfo->blocks++;
+
+	*newbp = bp;
+
+	DPRINTF(SYNC, ("%s: bp %p\n", __func__, bp));
+
+	return (0);
+}
+
+static int
+nandfs_add_blocks(struct nandfs_seginfo *seginfo, struct nandfs_node *node,
+    struct buf *bp)
+{
+	union nandfs_binfo *binfo;
+	struct buf *seg_bp;
+	int error;
+
+	if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+		error = create_segment(seginfo);
+		if (error) {
+			nandfs_error("%s: error:%d when creating segment\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+
+	if (seginfo->curseg->bytes_left < sizeof(union nandfs_binfo)) {
+		error = nandfs_add_segsum_block(seginfo, &seg_bp);
+		if (error) {
+			nandfs_error("%s: error:%d when adding segsum\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+	binfo = (union nandfs_binfo *)seginfo->curseg->current_off;
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		binfo->bi_v.bi_blkoff = bp->b_lblkno;
+		binfo->bi_v.bi_ino = node->nn_ino;
+	} else {
+		binfo->bi_dat.bi_blkoff = bp->b_lblkno;
+		binfo->bi_dat.bi_ino = node->nn_ino;
+		if (NANDFS_IS_INDIRECT(bp))
+			binfo->bi_dat.bi_level = 1;
+		else
+			binfo->bi_dat.bi_level = 0;
+	}
+	binfo++;
+
+	seginfo->curseg->bytes_left -= sizeof(union nandfs_binfo);
+	seginfo->curseg->segsum_bytes += sizeof(union nandfs_binfo);
+	seginfo->curseg->current_off = (char *)binfo;
+
+	TAILQ_INSERT_TAIL(&seginfo->curseg->data, bp, b_cluster.cluster_entry);
+
+	seginfo->curseg->nbinfos++;
+	seginfo->curseg->nblocks++;
+	seginfo->curseg->num_blocks--;
+	seginfo->blocks++;
+
+	DPRINTF(SYNC, ("%s: bp (%p) number %x (left %x)\n",
+	    __func__, bp, seginfo->curseg->nblocks,
+	    seginfo->curseg->num_blocks));
+	return (0);
+}
+
+static int
+nandfs_iterate_dirty_buf(struct vnode *vp, struct nandfs_seginfo *seginfo,
+    uint8_t hold)
+{
+	struct buf *bp, *tbd;
+	struct bufobj *bo;
+	struct nandfs_node *node;
+	int error;
+
+	node = VTON(vp);
+	bo = &vp->v_bufobj;
+
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/* Iterate dirty data bufs */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, tbd) {
+		DPRINTF(SYNC, ("%s: vp (%p): bp (%p) with lblkno %jx ino %jx "
+		    "add buf\n", __func__, vp, bp, bp->b_lblkno, node->nn_ino));
+
+		if (!(NANDFS_ISGATHERED(bp))) {
+			error = nandfs_bmap_update_dat(node,
+			    nandfs_vblk_get(bp), bp);
+			if (error)
+				return (error);
+			NANDFS_GATHER(bp);
+			nandfs_add_blocks(seginfo, node, bp);
+		}
+	}
+
+	return (0);
+}
+
+static int
+nandfs_iterate_system_vnode(struct nandfs_node *node,
+    struct nandfs_seginfo *seginfo)
+{
+	struct vnode *vp;
+	int nblocks;
+	uint8_t hold = 0;
+
+	if (node->nn_ino != NANDFS_IFILE_INO)
+		hold = 1;
+
+	vp = NTOV(node);
+
+	nblocks = vp->v_bufobj.bo_dirty.bv_cnt;
+	DPRINTF(SYNC, ("%s: vp (%p): nblocks %x ino %jx\n",
+	    __func__, vp, nblocks, node->nn_ino));
+
+	if (nblocks)
+		nandfs_iterate_dirty_buf(vp, seginfo, hold);
+
+	return (0);
+}
+
+static int
+nandfs_iterate_dirty_vnodes(struct mount *mp, struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_node *nandfs_node;
+	struct vnode *vp, *mvp;
+	struct thread *td;
+	int error, lockreq, update;
+
+	td = curthread;
+	lockreq = LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY;
+
+	MNT_ILOCK(mp);
+
+	MNT_VNODE_FOREACH(vp, mp, mvp) {
+		update = 0;
+
+		if (mp->mnt_syncer == vp)
+			continue;
+		if (VOP_ISLOCKED(vp))
+			continue;
+
+		VI_LOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (vp->v_iflag & VI_DOOMED) {
+			VI_UNLOCK(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		if ((error = vget(vp, lockreq, td)) != 0) {
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		if (vp->v_iflag & VI_DOOMED) {
+			vput(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+
+		nandfs_node = VTON(vp);
+		if (nandfs_node->nn_flags & IN_MODIFIED) {
+			nandfs_node->nn_flags &= ~(IN_MODIFIED);
+			update = 1;
+		}
+
+		if (vp->v_bufobj.bo_dirty.bv_cnt) {
+			error = nandfs_iterate_dirty_buf(vp, seginfo, 0);
+			if (error) {
+				nandfs_error("%s: cannot iterate vnode:%p "
+				    "err:%d\n", __func__, vp, error);
+				vput(vp);
+				return (error);
+			}
+			update = 1;
+		} else
+			vput(vp);
+
+		if (update)
+			nandfs_node_update(nandfs_node);
+
+		MNT_ILOCK(mp);
+	}
+
+	MNT_IUNLOCK(mp);
+
+	return (0);
+}
+
+static int
+nandfs_update_phys_block(struct nandfs_device *fsdev, struct buf *bp,
+    uint64_t phys_blknr, union nandfs_binfo *binfo)
+{
+	struct nandfs_node *node, *dat;
+	struct vnode *vp;
+	uint64_t new_blknr;
+	int error;
+
+	vp = bp->b_vp;
+	node = VTON(vp);
+	new_blknr = nandfs_vblk_get(bp);
+	dat = fsdev->nd_dat_node;
+
+	DPRINTF(BMAP, ("%s: ino %#jx lblk %#jx: vblk %#jx -> %#jx\n",
+	    __func__, (uintmax_t)node->nn_ino, (uintmax_t)bp->b_lblkno,
+	    (uintmax_t)new_blknr, (uintmax_t)phys_blknr));
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		KASSERT((new_blknr != 0), ("vblk for bp %p is 0", bp));
+
+		nandfs_vblock_assign(fsdev, new_blknr, phys_blknr);
+		binfo->bi_v.bi_vblocknr = new_blknr;
+		binfo->bi_v.bi_blkoff = bp->b_lblkno;
+		binfo->bi_v.bi_ino = node->nn_ino;
+	} else {
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = nandfs_bmap_update_block(node, bp, phys_blknr);
+		if (error) {
+			nandfs_error("%s: error updating block:%jx for bp:%p\n",
+			    __func__, (uintmax_t)phys_blknr, bp);
+			VOP_UNLOCK(NTOV(dat), 0);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+		binfo->bi_dat.bi_blkoff = bp->b_lblkno;
+		binfo->bi_dat.bi_ino = node->nn_ino;
+		if (NANDFS_IS_INDIRECT(bp))
+			binfo->bi_dat.bi_level = 1;
+		else
+			binfo->bi_dat.bi_level = 0;
+	}
+
+	return (0);
+}
+
+#define	NBINFO(off) ((off) + sizeof(union nandfs_binfo))
+static int
+nandfs_segment_assign_pblk(struct nandfs_segment *nfsseg)
+{
+	struct nandfs_device *fsdev;
+	union nandfs_binfo *binfo;
+	struct buf *bp, *seg_bp;
+	uint64_t blocknr;
+	uint32_t curr_off, blocksize;
+	int error;
+
+	fsdev = nfsseg->fsdev;
+	blocksize = fsdev->nd_blocksize;
+
+	blocknr = nfsseg->start_block + nfsseg->segsum_blocks;
+	seg_bp = TAILQ_FIRST(&nfsseg->segsum);
+	DPRINTF(SYNC, ("%s: seg:%p segsum bp:%p data:%p\n",
+	    __func__, nfsseg, seg_bp, seg_bp->b_data));
+
+	binfo = (union nandfs_binfo *)(seg_bp->b_data +
+	    sizeof(struct nandfs_segment_summary));
+	curr_off = sizeof(struct nandfs_segment_summary);
+
+	TAILQ_FOREACH(bp, &nfsseg->data, b_cluster.cluster_entry) {
+		KASSERT((bp->b_vp), ("bp %p has no vp", bp));
+
+		DPRINTF(BMAP, ("\n\n%s: assign buf %p for ino %#jx next %p\n",
+		    __func__, bp, (uintmax_t)VTON(bp->b_vp)->nn_ino,
+		    TAILQ_NEXT(bp, b_cluster.cluster_entry)));
+
+		if (NBINFO(curr_off) > blocksize) {
+			seg_bp = TAILQ_NEXT(seg_bp, b_cluster.cluster_entry);
+			binfo = (union nandfs_binfo *)seg_bp->b_data;
+			curr_off = 0;
+			DPRINTF(SYNC, ("%s: next segsum %p data %p\n",
+			    __func__, seg_bp, seg_bp->b_data));
+		}
+
+		error = nandfs_update_phys_block(fsdev, bp, blocknr, binfo);
+		if (error) {
+			nandfs_error("%s: err:%d when updating phys block:%jx"
+			    " for bp:%p and binfo:%p\n", __func__, error,
+			    (uintmax_t)blocknr, bp, binfo);
+			return (error);
+		}
+		binfo++;
+		curr_off = NBINFO(curr_off);
+
+		blocknr++;
+	}
+
+	return (0);
+}
+
+static int
+nandfs_seginfo_assign_pblk(struct nandfs_seginfo *seginfo)
+{
+	struct nandfs_segment *nfsseg;
+	int error = 0;
+
+	LIST_FOREACH(nfsseg, &seginfo->seg_list, seg_link) {
+		error = nandfs_segment_assign_pblk(nfsseg);
+		if (error)
+			break;
+	}
+
+	return (error);
+}
+
+static struct nandfs_segment_summary *
+nandfs_fill_segsum(struct nandfs_segment *seg, int has_sr)
+{
+	struct nandfs_segment_summary *ss;
+	struct nandfs_device *fsdev;
+	struct buf *bp;
+	uint32_t rest, segsum_size, blocksize, crc_calc;
+	uint16_t flags;
+	uint8_t *crc_area, crc_skip;
+
+	DPRINTF(SYNC, ("%s: seg %#jx nblocks %#x sumbytes %#x\n",
+	    __func__, (uintmax_t) seg->seg_num,
+	    seg->nblocks + seg->segsum_blocks,
+	    seg->segsum_bytes));
+
+	fsdev = seg->fsdev;
+
+	flags = NANDFS_SS_LOGBGN | NANDFS_SS_LOGEND;
+	if (has_sr)
+		flags |= NANDFS_SS_SR;
+
+	bp = TAILQ_FIRST(&seg->segsum);
+	ss = (struct nandfs_segment_summary *) bp->b_data;
+	ss->ss_magic = NANDFS_SEGSUM_MAGIC;
+	ss->ss_bytes = sizeof(struct nandfs_segment_summary);
+	ss->ss_flags = flags;
+	ss->ss_seq = ++(fsdev->nd_seg_sequence);
+	ss->ss_create = fsdev->nd_ts.tv_sec;
+	nandfs_get_segment_range(fsdev, seg->seg_next, &ss->ss_next, NULL);
+	ss->ss_nblocks = seg->nblocks + seg->segsum_blocks;
+	ss->ss_nbinfos = seg->nbinfos;
+	ss->ss_sumbytes = seg->segsum_bytes;
+
+	crc_skip = sizeof(ss->ss_datasum) + sizeof(ss->ss_sumsum);
+	blocksize = seg->fsdev->nd_blocksize;
+
+	segsum_size = seg->segsum_bytes - crc_skip;
+	rest = min(seg->segsum_bytes, blocksize) - crc_skip;
+	crc_area = (uint8_t *)ss + crc_skip;
+	crc_calc = ~0U;
+	while (segsum_size > 0) {
+		crc_calc = crc32_raw(crc_area, rest, crc_calc);
+		segsum_size -= rest;
+		if (!segsum_size)
+			break;
+		bp = TAILQ_NEXT(bp, b_cluster.cluster_entry);
+		crc_area = (uint8_t *)bp->b_data;
+		rest = segsum_size <= blocksize ? segsum_size : blocksize;
+	}
+	ss->ss_sumsum = crc_calc ^ ~0U;
+
+	return (ss);
+
+}
+
+static int
+nandfs_save_buf(struct buf *bp, uint64_t blocknr, struct nandfs_device *fsdev)
+{
+	struct bufobj *bo;
+	int error;
+
+	bo = &fsdev->nd_devvp->v_bufobj;
+
+	bp->b_blkno = nandfs_block_to_dblock(fsdev, blocknr);
+	bp->b_iooffset = dbtob(bp->b_blkno);
+
+	KASSERT(bp->b_bufobj != NULL, ("no bufobj for %p", bp));
+	if (bp->b_bufobj != bo) {
+		BO_LOCK(bp->b_bufobj);
+		BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
+		    BO_MTX(bp->b_bufobj));
+		KASSERT(BUF_ISLOCKED(bp), ("Problem with locking buffer"));
+	}
+
+	DPRINTF(SYNC, ("%s: buf: %p offset %#jx blk %#jx size %#x\n",
+	    __func__, bp, (uintmax_t)bp->b_offset, (uintmax_t)blocknr,
+	    fsdev->nd_blocksize));
+
+	NANDFS_UNGATHER(bp);
+	nandfs_buf_clear(bp, 0xffffffff);
+	bp->b_flags &= ~(B_ASYNC|B_INVAL|B_MANAGED);
+	error = bwrite(bp);
+	if (error) {
+		nandfs_error("%s: error:%d when writing buffer:%p\n",
+		    __func__, error, bp);
+		return (error);
+	}
+	return (error);
+}
+
+static void
+nandfs_clean_buf(struct nandfs_device *fsdev, struct buf *bp)
+{
+
+	DPRINTF(SYNC, ("%s: buf: %p\n", __func__, bp));
+
+	NANDFS_UNGATHER(bp);
+	nandfs_buf_clear(bp, 0xffffffff);
+	bp->b_flags &= ~(B_ASYNC|B_INVAL|B_MANAGED);
+	nandfs_undirty_buf_fsdev(fsdev, bp);
+}
+
+static void
+nandfs_clean_segblocks(struct nandfs_segment *seg, uint8_t unlock)
+{
+	struct nandfs_device *fsdev = seg->fsdev;
+	struct nandfs_segment *next_seg;
+	struct buf *bp, *tbp, *next_bp;
+	struct vnode *vp, *next_vp;
+
+	VOP_LOCK(fsdev->nd_devvp, LK_EXCLUSIVE);
+	TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+		nandfs_clean_buf(fsdev, bp);
+	};
+
+	TAILQ_FOREACH_SAFE(bp, &seg->data, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->data, bp, b_cluster.cluster_entry);
+
+		/*
+		 * If bp is not super-root and vnode is not currently
+		 * locked lock it.
+		 */
+		vp = bp->b_vp;
+		next_vp = NULL;
+		next_bp = TAILQ_NEXT(bp,  b_cluster.cluster_entry);
+		if (!next_bp) {
+			next_seg = LIST_NEXT(seg, seg_link);
+			if (next_seg)
+				next_bp = TAILQ_FIRST(&next_seg->data);
+		}
+
+		if (next_bp)
+			next_vp = next_bp->b_vp;
+
+		nandfs_clean_buf(fsdev, bp);
+
+		if (unlock && vp != NULL && next_vp != vp &&
+		    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+			vput(vp);
+
+		nandfs_dirty_bufs_decrement(fsdev);
+	}
+
+	VOP_UNLOCK(fsdev->nd_devvp, 0);
+}
+
+static int
+nandfs_save_segblocks(struct nandfs_segment *seg, uint8_t unlock)
+{
+	struct nandfs_device *fsdev = seg->fsdev;
+	struct nandfs_segment *next_seg;
+	struct buf *bp, *tbp, *next_bp;
+	struct vnode *vp, *next_vp;
+	uint64_t blocknr;
+	uint32_t i = 0;
+	int error = 0;
+
+	VOP_LOCK(fsdev->nd_devvp, LK_EXCLUSIVE);
+	TAILQ_FOREACH_SAFE(bp, &seg->segsum, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->segsum, bp, b_cluster.cluster_entry);
+		blocknr = seg->start_block + i;
+		error = nandfs_save_buf(bp, blocknr, fsdev);
+		if (error) {
+			nandfs_error("%s: error saving buf: %p blocknr:%jx\n",
+			    __func__, bp, (uintmax_t)blocknr);
+			goto out;
+		}
+		i++;
+	};
+
+	i = 0;
+	TAILQ_FOREACH_SAFE(bp, &seg->data, b_cluster.cluster_entry, tbp) {
+		TAILQ_REMOVE(&seg->data, bp, b_cluster.cluster_entry);
+
+		blocknr = seg->start_block + seg->segsum_blocks + i;
+		/*
+		 * If bp is not super-root and vnode is not currently
+		 * locked lock it.
+		 */
+		vp = bp->b_vp;
+		next_vp = NULL;
+		next_bp = TAILQ_NEXT(bp,  b_cluster.cluster_entry);
+		if (!next_bp) {
+			next_seg = LIST_NEXT(seg, seg_link);
+			if (next_seg)
+				next_bp = TAILQ_FIRST(&next_seg->data);
+		}
+
+		if (next_bp)
+			next_vp = next_bp->b_vp;
+
+		error = nandfs_save_buf(bp, blocknr, fsdev);
+		if (error) {
+			nandfs_error("%s: error saving buf: %p blknr: %jx\n",
+			    __func__, bp, (uintmax_t)blocknr);
+			if (unlock && vp != NULL && next_vp != vp &&
+			    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+				vput(vp);
+			goto out;
+		}
+
+		if (unlock && vp != NULL && next_vp != vp &&
+		    !NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+			vput(vp);
+
+		i++;
+		nandfs_dirty_bufs_decrement(fsdev);
+	}
+out:
+	if (error) {
+		nandfs_clean_segblocks(seg, unlock);
+		VOP_UNLOCK(fsdev->nd_devvp, 0);
+		return (error);
+	}
+
+	VOP_UNLOCK(fsdev->nd_devvp, 0);
+	return (error);
+}
+
+
+static void
+clean_seginfo(struct nandfs_seginfo *seginfo, uint8_t unlock)
+{
+	struct nandfs_segment *seg;
+
+	DPRINTF(SYNC, ("%s: seginfo %p\n", __func__, seginfo));
+
+	LIST_FOREACH(seg, &seginfo->seg_list, seg_link) {
+		nandfs_clean_segblocks(seg, unlock);
+	}
+}
+
+static int
+save_seginfo(struct nandfs_seginfo *seginfo, uint8_t unlock)
+{
+	struct nandfs_segment *seg;
+	struct nandfs_device *fsdev;
+	struct nandfs_segment_summary *ss;
+	int error = 0;
+
+	fsdev = seginfo->fsdev;
+
+	DPRINTF(SYNC, ("%s: seginfo %p\n", __func__, seginfo));
+
+	LIST_FOREACH(seg, &seginfo->seg_list, seg_link) {
+		if (LIST_NEXT(seg, seg_link)) {
+			nandfs_fill_segsum(seg, 0);
+			error = nandfs_save_segblocks(seg, unlock);
+			if (error) {
+				nandfs_error("%s: error:%d saving seg:%p\n",
+				    __func__, error, seg);
+				goto out;
+			}
+		} else {
+			ss = nandfs_fill_segsum(seg, 1);
+			fsdev->nd_last_segsum = *ss;
+			error = nandfs_save_segblocks(seg, unlock);
+			if (error) {
+				nandfs_error("%s: error:%d saving seg:%p\n",
+				    __func__, error, seg);
+				goto out;
+			}
+			fsdev->nd_last_cno++;
+			fsdev->nd_last_pseg = seg->start_block;
+		}
+	}
+out:
+	if (error)
+		clean_seginfo(seginfo, unlock);
+	return (error);
+}
+
+static void
+nandfs_invalidate_bufs(struct nandfs_device *fsdev, uint64_t segno)
+{
+	uint64_t start, end;
+	struct buf *bp, *tbd;
+	struct bufobj *bo;
+
+	nandfs_get_segment_range(fsdev, segno, &start, &end);
+
+	bo = &NTOV(fsdev->nd_gc_node)->v_bufobj;
+
+	BO_LOCK(bo);
+restart_locked_gc:
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, tbd) {
+		if (!(bp->b_lblkno >= start && bp->b_lblkno <= end))
+			continue;
+
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
+			goto restart_locked_gc;
+
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		BO_UNLOCK(bo);
+		brelse(bp);
+		BO_LOCK(bo);
+	}
+	BO_UNLOCK(bo);
+}
+
+/* Process segments marks to free by cleaner */
+static void
+nandfs_process_segments(struct nandfs_device *fsdev)
+{
+	uint64_t saved_segment;
+	int i;
+
+	if (fsdev->nd_free_base) {
+		saved_segment = nandfs_get_segnum_of_block(fsdev,
+		    fsdev->nd_super.s_last_pseg);
+		for (i = 0; i < fsdev->nd_free_count; i++) {
+			if (fsdev->nd_free_base[i] == NANDFS_NOSEGMENT)
+				continue;
+			/* Update superblock if clearing the segment it points to */
+			if (fsdev->nd_free_base[i] == saved_segment) {
+				nandfs_write_superblock(fsdev);
+				saved_segment = nandfs_get_segnum_of_block(
+				    fsdev, fsdev->nd_super.s_last_pseg);
+			}
+			nandfs_invalidate_bufs(fsdev, fsdev->nd_free_base[i]);
+			nandfs_clear_segment(fsdev, fsdev->nd_free_base[i]);
+		}
+
+		free(fsdev->nd_free_base, M_NANDFSTEMP);
+		fsdev->nd_free_base = NULL;
+		fsdev->nd_free_count = 0;
+	}
+}
+
+/* Collect and write dirty buffers */
+int
+nandfs_sync_file(struct vnode *vp)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_node *nandfs_node;
+	struct nandfsmount *nmp;
+	struct nandfs_node *dat, *su, *ifile, *cp;
+	struct nandfs_seginfo *seginfo = NULL;
+	struct nandfs_segment *seg;
+	int update, error;
+	int cno_changed;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	DPRINTF(SYNC, ("%s: START\n", __func__));
+
+	error = 0;
+	nmp = VFSTONANDFS(vp->v_mount);
+	fsdev = nmp->nm_nandfsdev;
+
+	dat = fsdev->nd_dat_node;
+	su = fsdev->nd_su_node;
+	cp = fsdev->nd_cp_node;
+	ifile = nmp->nm_ifile_node;
+
+	NANDFS_WRITEASSERT(fsdev);
+	if (lockmgr(&fsdev->nd_seg_const, LK_UPGRADE, NULL) != 0) {
+		DPRINTF(SYNC, ("%s: lost shared lock\n", __func__));
+		if (lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL) != 0)
+			panic("couldn't lock exclusive");
+	}
+	DPRINTF(SYNC, ("%s: got lock\n", __func__));
+
+	VOP_LOCK(NTOV(su), LK_EXCLUSIVE);
+	create_seginfo(fsdev, &seginfo);
+
+	update = 0;
+
+	nandfs_node = VTON(vp);
+	if (nandfs_node->nn_flags & IN_MODIFIED) {
+		nandfs_node->nn_flags &= ~(IN_MODIFIED);
+		update = 1;
+	}
+
+	if (vp->v_bufobj.bo_dirty.bv_cnt) {
+		error = nandfs_iterate_dirty_buf(vp, seginfo, 0);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d iterating dirty bufs vp:%p",
+			    __func__, error, vp);
+			return (error);
+		}
+		update = 1;
+	}
+
+	if (update) {
+		VOP_LOCK(NTOV(ifile), LK_EXCLUSIVE);
+		error = nandfs_node_update(nandfs_node);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(ifile), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d updating vp:%p",
+			    __func__, error, vp);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(ifile), 0);
+	}
+
+	cno_changed = 0;
+	if (seginfo->blocks) {
+		VOP_LOCK(NTOV(cp), LK_EXCLUSIVE);
+		cno_changed = 1;
+		/* Create new checkpoint */
+		error = nandfs_get_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(cp), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d getting cp:%jx",
+			    __func__, error, fsdev->nd_last_cno + 1);
+			return (error);
+		}
+
+		/* Reiterate all blocks and assign physical block number */
+		nandfs_seginfo_assign_pblk(seginfo);
+
+		/* Fill checkpoint data */
+		error = nandfs_set_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1,
+		    &ifile->nn_inode, seginfo->blocks);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(cp), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d setting cp:%jx",
+			    __func__, error, fsdev->nd_last_cno + 1);
+			return (error);
+		}
+
+		VOP_UNLOCK(NTOV(cp), 0);
+		LIST_FOREACH(seg, &seginfo->seg_list, seg_link)
+			nandfs_update_segment(fsdev, seg->seg_num,
+			    seg->nblocks + seg->segsum_blocks);
+
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = save_seginfo(seginfo, 0);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			VOP_UNLOCK(NTOV(dat), 0);
+			VOP_UNLOCK(NTOV(su), 0);
+			lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+			nandfs_error("%s: err:%d updating seg",
+			    __func__, error);
+			return (error);
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+	}
+
+	VOP_UNLOCK(NTOV(su), 0);
+
+	delete_seginfo(seginfo);
+	lockmgr(&fsdev->nd_seg_const, LK_DOWNGRADE, NULL);
+
+	if (cno_changed && !error) {
+		if (nandfs_cps_between_sblocks != 0 &&
+		    fsdev->nd_last_cno % nandfs_cps_between_sblocks == 0)
+			nandfs_write_superblock(fsdev);
+	}
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	DPRINTF(SYNC, ("%s: END error %d\n", __func__, error));
+	return (error);
+}
+
+int
+nandfs_segment_constructor(struct nandfsmount *nmp, int flags)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_seginfo *seginfo = NULL;
+	struct nandfs_segment *seg;
+	struct nandfs_node *dat, *su, *ifile, *cp, *gc;
+	int cno_changed, error;
+
+	DPRINTF(SYNC, ("%s: START\n", __func__));
+	fsdev = nmp->nm_nandfsdev;
+
+	lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL);
+	DPRINTF(SYNC, ("%s: got lock\n", __func__));
+again:
+	create_seginfo(fsdev, &seginfo);
+
+	dat = fsdev->nd_dat_node;
+	su = fsdev->nd_su_node;
+	cp = fsdev->nd_cp_node;
+	gc = fsdev->nd_gc_node;
+	ifile = nmp->nm_ifile_node;
+
+	VOP_LOCK(NTOV(su), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(ifile), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(gc), LK_EXCLUSIVE);
+	VOP_LOCK(NTOV(cp), LK_EXCLUSIVE);
+
+	nandfs_iterate_system_vnode(gc, seginfo);
+	nandfs_iterate_dirty_vnodes(nmp->nm_vfs_mountp, seginfo);
+	nandfs_iterate_system_vnode(ifile, seginfo);
+	nandfs_iterate_system_vnode(su, seginfo);
+
+	cno_changed = 0;
+	if (seginfo->blocks || flags) {
+		cno_changed = 1;
+		/* Create new checkpoint */
+		error = nandfs_get_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			goto error_locks;
+		}
+
+		/* Collect blocks from system files */
+		nandfs_iterate_system_vnode(cp, seginfo);
+		nandfs_iterate_system_vnode(su, seginfo);
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		nandfs_iterate_system_vnode(dat, seginfo);
+		VOP_UNLOCK(NTOV(dat), 0);
+reiterate:
+		seginfo->reiterate = 0;
+		nandfs_iterate_system_vnode(su, seginfo);
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		nandfs_iterate_system_vnode(dat, seginfo);
+		VOP_UNLOCK(NTOV(dat), 0);
+		if (seginfo->reiterate)
+			goto reiterate;
+		if (!(seginfo->curseg) || !seginfo->curseg->num_blocks) {
+			error = create_segment(seginfo);
+			if (error) {
+				clean_seginfo(seginfo, 0);
+				delete_seginfo(seginfo);
+				goto error_locks;
+			}
+			goto reiterate;
+		}
+
+		/* Reiterate all blocks and assign physical block number */
+		nandfs_seginfo_assign_pblk(seginfo);
+
+		/* Fill superroot */
+		error = nandfs_add_superroot(seginfo);
+		if (error) {
+			clean_seginfo(seginfo, 0);
+			delete_seginfo(seginfo);
+			goto error_locks;
+		}
+		KASSERT(!(seginfo->reiterate), ("reiteration after superroot"));
+
+		/* Fill checkpoint data */
+		nandfs_set_checkpoint(fsdev, cp, fsdev->nd_last_cno + 1,
+		    &ifile->nn_inode, seginfo->blocks);
+
+		LIST_FOREACH(seg, &seginfo->seg_list, seg_link)
+			nandfs_update_segment(fsdev, seg->seg_num,
+			    seg->nblocks + seg->segsum_blocks);
+
+		VOP_LOCK(NTOV(dat), LK_EXCLUSIVE);
+		error = save_seginfo(seginfo, 1);
+		if (error) {
+			clean_seginfo(seginfo, 1);
+			delete_seginfo(seginfo);
+			goto error_dat;
+		}
+		VOP_UNLOCK(NTOV(dat), 0);
+	}
+
+	VOP_UNLOCK(NTOV(cp), 0);
+	VOP_UNLOCK(NTOV(gc), 0);
+	VOP_UNLOCK(NTOV(ifile), 0);
+
+	nandfs_process_segments(fsdev);
+
+	VOP_UNLOCK(NTOV(su), 0);
+
+	delete_seginfo(seginfo);
+
+	/*
+	 * XXX: a hack, will go away soon
+	 */
+	if ((NTOV(dat)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(cp)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(gc)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(ifile)->v_bufobj.bo_dirty.bv_cnt != 0 ||
+	    NTOV(su)->v_bufobj.bo_dirty.bv_cnt != 0) &&
+	    (flags & NANDFS_UMOUNT)) {
+		DPRINTF(SYNC, ("%s: RERUN\n", __func__));
+		goto again;
+	}
+
+	MPASS(fsdev->nd_free_base == NULL);
+
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	if (cno_changed) {
+		if ((nandfs_cps_between_sblocks != 0 &&
+		    fsdev->nd_last_cno % nandfs_cps_between_sblocks == 0) ||
+		    flags & NANDFS_UMOUNT)
+			nandfs_write_superblock(fsdev);
+	}
+
+	DPRINTF(SYNC, ("%s: END\n", __func__));
+	return (0);
+error_dat:
+	VOP_UNLOCK(NTOV(dat), 0);
+error_locks:
+	VOP_UNLOCK(NTOV(cp), 0);
+	VOP_UNLOCK(NTOV(gc), 0);
+	VOP_UNLOCK(NTOV(ifile), 0);
+	VOP_UNLOCK(NTOV(su), 0);
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	return (error);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given NANDFS mount point.
+ */
+DB_SHOW_COMMAND(nandfs, db_show_nandfs)
+{
+	struct mount *mp;
+	struct nandfs_device *nffsdev;
+	struct nandfs_segment *seg;
+	struct nandfsmount *nmp;
+	struct buf *bp;
+	struct vnode *vp;
+
+	if (!have_addr) {
+		db_printf("\nUsage: show nandfs <mount_addr>\n");
+		return;
+	}
+
+	mp = (struct mount *)addr;
+	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
+	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
+
+
+	nmp = (struct nandfsmount *)(mp->mnt_data);
+	nffsdev = nmp->nm_nandfsdev;
+	db_printf("dev vnode:%p\n", nffsdev->nd_devvp);
+	db_printf("blocksize:%jx last cno:%jx last pseg:%jx seg num:%jx\n",
+	    (uintmax_t)nffsdev->nd_blocksize, (uintmax_t)nffsdev->nd_last_cno,
+	    (uintmax_t)nffsdev->nd_last_pseg, (uintmax_t)nffsdev->nd_seg_num);
+	db_printf("system nodes: dat:%p cp:%p su:%p ifile:%p gc:%p\n",
+	    nffsdev->nd_dat_node, nffsdev->nd_cp_node, nffsdev->nd_su_node,
+	    nmp->nm_ifile_node, nffsdev->nd_gc_node);
+
+	if (nffsdev->nd_seginfo != NULL) {
+		LIST_FOREACH(seg, &nffsdev->nd_seginfo->seg_list, seg_link) {
+			db_printf("seg: %p\n", seg);
+			TAILQ_FOREACH(bp, &seg->segsum,
+			    b_cluster.cluster_entry)
+				db_printf("segbp %p\n", bp);
+			TAILQ_FOREACH(bp, &seg->data,
+			    b_cluster.cluster_entry) {
+				vp = bp->b_vp;
+				db_printf("bp:%p bp->b_vp:%p ino:%jx\n", bp, vp,
+				    (uintmax_t)(vp ? VTON(vp)->nn_ino : 0));
+			}
+		}
+	}
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_subr.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1120 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.c,v 1.4 2009/07/29 17:06:57 reinoud
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_subr.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/libkern.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include <machine/_inttypes.h>
+#include "nandfs_mount.h"
+#include "nandfs.h"
+#include "nandfs_subr.h"
+
+MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount");
+MALLOC_DEFINE(M_NANDFSTEMP, "nandfs_tmt", "NANDFS tmp");
+
+uma_zone_t nandfs_node_zone;
+
+void nandfs_bdflush(struct bufobj *bo, struct buf *bp);
+int nandfs_bufsync(struct bufobj *bo, int waitfor);
+
+struct buf_ops buf_ops_nandfs = {
+	.bop_name	=	"buf_ops_nandfs",
+	.bop_write	=	bufwrite,
+	.bop_strategy	=	bufstrategy,
+	.bop_sync	=	nandfs_bufsync,
+	.bop_bdflush	=	nandfs_bdflush,
+};
+
+int
+nandfs_bufsync(struct bufobj *bo, int waitfor)
+{
+	struct vnode *vp;
+	int error = 0;
+
+	vp = bo->__bo_vnode;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	error = nandfs_sync_file(vp);
+	if (error)
+		nandfs_warning("%s: cannot flush buffers err:%d\n",
+		    __func__, error);
+
+	return (error);
+}
+
+void
+nandfs_bdflush(bo, bp)
+	struct bufobj *bo;
+	struct buf *bp;
+{
+	struct vnode *vp;
+	int error;
+
+	if (bo->bo_dirty.bv_cnt <= ((dirtybufthresh * 8) / 10))
+		return;
+
+	vp = bp->b_vp;
+	if (NANDFS_SYS_NODE(VTON(vp)->nn_ino))
+		return;
+
+	if (NANDFS_IS_INDIRECT(bp))
+		return;
+
+	error = nandfs_sync_file(vp);
+	if (error)
+		nandfs_warning("%s: cannot flush buffers err:%d\n",
+		    __func__, error);
+}
+
+int
+nandfs_init(struct vfsconf *vfsp)
+{
+
+	nandfs_node_zone = uma_zcreate("nandfs node zone",
+	    sizeof(struct nandfs_node), NULL, NULL, NULL, NULL, 0, 0);
+
+	return (0);
+}
+
+int
+nandfs_uninit(struct vfsconf *vfsp)
+{
+
+	uma_zdestroy(nandfs_node_zone);
+	return (0);
+}
+
+/* Basic calculators */
+uint64_t
+nandfs_get_segnum_of_block(struct nandfs_device *nandfsdev,
+    nandfs_daddr_t blocknr)
+{
+	uint64_t segnum, blks_per_seg;
+
+	MPASS(blocknr >= nandfsdev->nd_fsdata.f_first_data_block);
+
+	blks_per_seg = nandfsdev->nd_fsdata.f_blocks_per_segment;
+
+	segnum = blocknr / blks_per_seg;
+	segnum -= nandfsdev->nd_fsdata.f_first_data_block / blks_per_seg;
+
+	DPRINTF(SYNC, ("%s: returning blocknr %jx -> segnum %jx\n", __func__,
+	    blocknr, segnum));
+
+	return (segnum);
+}
+
+void
+nandfs_get_segment_range(struct nandfs_device *nandfsdev, uint64_t segnum,
+    uint64_t *seg_start, uint64_t *seg_end)
+{
+	uint64_t blks_per_seg;
+
+	blks_per_seg = nandfsdev->nd_fsdata.f_blocks_per_segment;
+	*seg_start = nandfsdev->nd_fsdata.f_first_data_block +
+	    blks_per_seg * segnum;
+	if (seg_end != NULL)
+		*seg_end = *seg_start + blks_per_seg -1;
+}
+
+void nandfs_calc_mdt_consts(struct nandfs_device *nandfsdev,
+    struct nandfs_mdt *mdt, int entry_size)
+{
+	uint32_t blocksize = nandfsdev->nd_blocksize;
+
+	mdt->entries_per_group = blocksize * 8;
+	mdt->entries_per_block = blocksize / entry_size;
+
+	mdt->blocks_per_group =
+	    (mdt->entries_per_group -1) / mdt->entries_per_block + 1 + 1;
+	mdt->groups_per_desc_block =
+	    blocksize / sizeof(struct nandfs_block_group_desc);
+	mdt->blocks_per_desc_block =
+	    mdt->groups_per_desc_block * mdt->blocks_per_group + 1;
+}
+
+int
+nandfs_dev_bread(struct nandfs_device *nandfsdev, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: read from block %jx vp %p\n", __func__,
+	    blocknr * blk2dev, nandfsdev->nd_devvp));
+	error = bread(nandfsdev->nd_devvp, blocknr * blk2dev,
+	    nandfsdev->nd_blocksize, NOCRED, bpp);
+	if (error)
+		nandfs_error("%s: cannot read from device - blk:%jx\n",
+		    __func__, blocknr);
+	return (error);
+}
+
+/* Read on a node */
+int
+nandfs_bread(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	nandfs_daddr_t vblk;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	error = bread(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    cred, bpp);
+
+	KASSERT(error == 0, ("%s: vp:%p lbn:%#jx err:%d\n", __func__,
+	    NTOV(node), blocknr, error));
+
+	if (!nandfs_vblk_get(*bpp) &&
+	    ((*bpp)->b_flags & B_CACHE) && node->nn_ino != NANDFS_DAT_INO) {
+		nandfs_bmap_lookup(node, blocknr, &vblk);
+		nandfs_vblk_set(*bpp, vblk);
+	}
+	return (error);
+}
+
+int
+nandfs_bread_meta(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	nandfs_daddr_t vblk;
+	int error;
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	error = bread(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    cred, bpp);
+
+	KASSERT(error == 0, ("%s: vp:%p lbn:%#jx err:%d\n", __func__,
+	    NTOV(node), blocknr, error));
+
+	if (!nandfs_vblk_get(*bpp) &&
+	    ((*bpp)->b_flags & B_CACHE) && node->nn_ino != NANDFS_DAT_INO) {
+		nandfs_bmap_lookup(node, blocknr, &vblk);
+		nandfs_vblk_set(*bpp, vblk);
+	}
+
+	return (error);
+}
+
+int
+nandfs_bdestroy(struct nandfs_node *node, nandfs_daddr_t vblk)
+{
+	int error;
+
+	if (!NANDFS_SYS_NODE(node->nn_ino))
+		NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	error = nandfs_vblock_end(node->nn_nandfsdev, vblk);
+	if (error) {
+		nandfs_error("%s: ending vblk: %jx failed\n",
+		    __func__, (uintmax_t)vblk);
+		return (error);
+	}
+	node->nn_inode.i_blocks--;
+
+	return (0);
+}
+
+int
+nandfs_bcreate(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	int error;
+
+	ASSERT_VOP_LOCKED(NTOV(node), __func__);
+	if (!NANDFS_SYS_NODE(node->nn_ino))
+		NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	*bpp = getblk(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    0, 0, 0);
+
+	KASSERT((*bpp), ("%s: vp:%p lbn:%#jx\n", __func__,
+	    NTOV(node), blocknr));
+
+	if (*bpp) {
+		vfs_bio_clrbuf(*bpp);
+		(*bpp)->b_blkno = ~(0); /* To avoid VOP_BMAP in bdwrite */
+		error = nandfs_bmap_insert_block(node, blocknr, *bpp);
+		if (error) {
+			nandfs_warning("%s: failed bmap insert node:%p"
+			    " blk:%jx\n", __func__, node, blocknr);
+			brelse(*bpp);
+			return (error);
+		}
+		node->nn_inode.i_blocks++;
+
+		return (0);
+	}
+
+	return (-1);
+}
+
+int
+nandfs_bcreate_meta(struct nandfs_node *node, nandfs_lbn_t blocknr,
+    struct ucred *cred, int flags, struct buf **bpp)
+{
+	struct nandfs_device *fsdev;
+	nandfs_daddr_t vblk;
+	int error;
+
+	ASSERT_VOP_LOCKED(NTOV(node), __func__);
+	NANDFS_WRITEASSERT(node->nn_nandfsdev);
+
+	DPRINTF(BLOCK, ("%s: vp:%p lbn:%#jx\n", __func__, NTOV(node),
+	    blocknr));
+
+	fsdev = node->nn_nandfsdev;
+
+	*bpp = getblk(NTOV(node), blocknr, node->nn_nandfsdev->nd_blocksize,
+	    0, 0, 0);
+
+	KASSERT((*bpp), ("%s: vp:%p lbn:%#jx\n", __func__,
+	    NTOV(node), blocknr));
+
+	memset((*bpp)->b_data, 0, fsdev->nd_blocksize);
+
+	vfs_bio_clrbuf(*bpp);
+	(*bpp)->b_blkno = ~(0); /* To avoid VOP_BMAP in bdwrite */
+
+	nandfs_buf_set(*bpp, NANDFS_VBLK_ASSIGNED);
+
+	if (node->nn_ino != NANDFS_DAT_INO) {
+		error = nandfs_vblock_alloc(fsdev, &vblk);
+		if (error) {
+			nandfs_buf_clear(*bpp, NANDFS_VBLK_ASSIGNED);
+			brelse(*bpp);
+			return (error);
+		}
+	} else
+		vblk = fsdev->nd_fakevblk++;
+
+	nandfs_vblk_set(*bpp, vblk);
+
+	nandfs_bmap_insert_block(node, blocknr, *bpp);
+	return (0);
+}
+
+/* Translate index to a file block number and an entry */
+void
+nandfs_mdt_trans(struct nandfs_mdt *mdt, uint64_t index,
+    nandfs_lbn_t *blocknr, uint32_t *entry_in_block)
+{
+	uint64_t blknr;
+	uint64_t group, group_offset, blocknr_in_group;
+	uint64_t desc_block, desc_offset;
+
+	/* Calculate our offset in the file */
+	group = index / mdt->entries_per_group;
+	group_offset = index % mdt->entries_per_group;
+	desc_block = group / mdt->groups_per_desc_block;
+	desc_offset = group % mdt->groups_per_desc_block;
+	blocknr_in_group = group_offset / mdt->entries_per_block;
+
+	/* To descgroup offset */
+	blknr = 1 + desc_block * mdt->blocks_per_desc_block;
+
+	/* To group offset */
+	blknr += desc_offset * mdt->blocks_per_group;
+
+	/* To actual file block */
+	blknr += 1 + blocknr_in_group;
+
+	*blocknr = blknr;
+	*entry_in_block = group_offset % mdt->entries_per_block;
+}
+
+void
+nandfs_mdt_trans_blk(struct nandfs_mdt *mdt, uint64_t index,
+    uint64_t *desc, uint64_t *bitmap, nandfs_lbn_t *blocknr,
+    uint32_t *entry_in_block)
+{
+	uint64_t blknr;
+	uint64_t group, group_offset, blocknr_in_group;
+	uint64_t desc_block, desc_offset;
+
+	/* Calculate our offset in the file */
+	group = index / mdt->entries_per_group;
+	group_offset = index % mdt->entries_per_group;
+	desc_block = group / mdt->groups_per_desc_block;
+	desc_offset = group % mdt->groups_per_desc_block;
+	blocknr_in_group = group_offset / mdt->entries_per_block;
+
+	/* To descgroup offset */
+	*desc = desc_block * mdt->blocks_per_desc_block;
+	blknr = 1 + desc_block * mdt->blocks_per_desc_block;
+
+	/* To group offset */
+	blknr += desc_offset * mdt->blocks_per_group;
+	*bitmap = blknr;
+
+	/* To actual file block */
+	blknr += 1 + blocknr_in_group;
+
+	*blocknr = blknr;
+	*entry_in_block = group_offset % mdt->entries_per_block;
+
+	DPRINTF(ALLOC,
+	    ("%s: desc_buf: %jx bitmap_buf: %jx entry_buf: %jx entry: %x\n",
+	    __func__, (uintmax_t)*desc, (uintmax_t)*bitmap,
+	    (uintmax_t)*blocknr, *entry_in_block));
+}
+
+int
+nandfs_vtop(struct nandfs_node *node, nandfs_daddr_t vblocknr,
+    nandfs_daddr_t *pblocknr)
+{
+	struct nandfs_node *dat_node;
+	struct nandfs_dat_entry *entry;
+	struct buf *bp;
+	nandfs_lbn_t ldatblknr;
+	uint32_t entry_in_block;
+	int locked, error;
+
+	if (node->nn_ino == NANDFS_DAT_INO || node->nn_ino == NANDFS_GC_INO) {
+		*pblocknr = vblocknr;
+		return (0);
+	}
+
+	/* only translate valid vblocknrs */
+	if (vblocknr == 0)
+		return (0);
+
+	dat_node = node->nn_nandfsdev->nd_dat_node;
+	nandfs_mdt_trans(&node->nn_nandfsdev->nd_dat_mdt, vblocknr, &ldatblknr,
+	    &entry_in_block);
+
+	locked = NANDFS_VOP_ISLOCKED(NTOV(dat_node));
+	if (!locked)
+		VOP_LOCK(NTOV(dat_node), LK_SHARED);
+	error = nandfs_bread(dat_node, ldatblknr, NOCRED, 0, &bp);
+	if (error) {
+		DPRINTF(TRANSLATE, ("vtop: can't read in DAT block %#jx!\n",
+		    (uintmax_t)ldatblknr));
+		brelse(bp);
+		VOP_UNLOCK(NTOV(dat_node), 0);
+		return (error);
+	}
+
+	/* Get our translation */
+	entry = ((struct nandfs_dat_entry *) bp->b_data) + entry_in_block;
+	DPRINTF(TRANSLATE, ("\tentry %p data %p entry_in_block %x\n",
+	    entry, bp->b_data, entry_in_block));
+	DPRINTF(TRANSLATE, ("\tvblk %#jx -> %#jx for cp [%#jx-%#jx]\n",
+	    (uintmax_t)vblocknr, (uintmax_t)entry->de_blocknr,
+	    (uintmax_t)entry->de_start, (uintmax_t)entry->de_end));
+
+	*pblocknr = entry->de_blocknr;
+	brelse(bp);
+	if (!locked)
+		VOP_UNLOCK(NTOV(dat_node), 0);
+
+	MPASS(*pblocknr >= node->nn_nandfsdev->nd_fsdata.f_first_data_block ||
+	    *pblocknr == 0);
+
+	return (0);
+}
+
+int
+nandfs_segsum_valid(struct nandfs_segment_summary *segsum)
+{
+
+	return (segsum->ss_magic == NANDFS_SEGSUM_MAGIC);
+}
+
+int
+nandfs_load_segsum(struct nandfs_device *fsdev, nandfs_daddr_t blocknr,
+    struct nandfs_segment_summary *segsum)
+{
+	struct buf *bp;
+	int error;
+
+	DPRINTF(VOLUMES, ("nandfs: try segsum at block %jx\n",
+	    (uintmax_t)blocknr));
+
+	error = nandfs_dev_bread(fsdev, blocknr, NOCRED, 0, &bp);
+	if (error)
+		return (error);
+
+	memcpy(segsum, bp->b_data, sizeof(struct nandfs_segment_summary));
+	brelse(bp);
+
+	if (!nandfs_segsum_valid(segsum)) {
+		DPRINTF(VOLUMES, ("%s: bad magic pseg:%jx\n", __func__,
+		    blocknr));
+		return (EINVAL);
+	}
+
+	return (error);
+}
+
+static int
+nandfs_load_super_root(struct nandfs_device *nandfsdev,
+    struct nandfs_segment_summary *segsum, uint64_t pseg)
+{
+	struct nandfs_super_root super_root;
+	struct buf *bp;
+	uint64_t blocknr;
+	uint32_t super_root_crc, comp_crc;
+	int off, error;
+
+	/* Check if there is a superroot */
+	if ((segsum->ss_flags & NANDFS_SS_SR) == 0) {
+		DPRINTF(VOLUMES, ("%s: no super root in pseg:%jx\n", __func__,
+		    pseg));
+		return (ENOENT);
+	}
+
+	/* Get our super root, located at the end of the pseg */
+	blocknr = pseg + segsum->ss_nblocks - 1;
+	DPRINTF(VOLUMES, ("%s: try at %#jx\n", __func__, (uintmax_t)blocknr));
+
+	error = nandfs_dev_bread(nandfsdev, blocknr, NOCRED, 0, &bp);
+	if (error)
+		return (error);
+
+	memcpy(&super_root, bp->b_data, sizeof(struct nandfs_super_root));
+	brelse(bp);
+
+	/* Check super root CRC */
+	super_root_crc = super_root.sr_sum;
+	off = sizeof(super_root.sr_sum);
+	comp_crc = crc32((uint8_t *)&super_root + off,
+	    NANDFS_SR_BYTES - off);
+
+	if (super_root_crc != comp_crc) {
+		DPRINTF(VOLUMES, ("%s: invalid crc:%#x [expect:%#x]\n",
+		    __func__, super_root_crc, comp_crc));
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_super_root = super_root;
+	DPRINTF(VOLUMES, ("%s: got valid superroot\n", __func__));
+
+	return (0);
+}
+
+/*
+ * Search for the last super root recorded.
+ */
+int
+nandfs_search_super_root(struct nandfs_device *nandfsdev)
+{
+	struct nandfs_super_block *super;
+	struct nandfs_segment_summary segsum;
+	uint64_t seg_start, seg_end, cno, seq, create, pseg;
+	uint64_t segnum;
+	int error, found;
+
+	error = found = 0;
+
+	/* Search for last super root */
+	pseg = nandfsdev->nd_super.s_last_pseg;
+	segnum = nandfs_get_segnum_of_block(nandfsdev, pseg);
+
+	cno = nandfsdev->nd_super.s_last_cno;
+	create = seq = 0;
+	DPRINTF(VOLUMES, ("%s: start in pseg %#jx\n", __func__,
+	    (uintmax_t)pseg));
+
+	for (;;) {
+		error = nandfs_load_segsum(nandfsdev, pseg, &segsum);
+		if (error)
+			break;
+
+		if (segsum.ss_seq < seq || segsum.ss_create < create)
+			break;
+
+		/* Try to load super root */
+		if (segsum.ss_flags & NANDFS_SS_SR) {
+			error = nandfs_load_super_root(nandfsdev, &segsum, pseg);
+			if (error)
+				break;	/* confused */
+			found = 1;
+
+			super = &nandfsdev->nd_super;
+			nandfsdev->nd_last_segsum = segsum;
+			super->s_last_pseg = pseg;
+			super->s_last_cno = cno++;
+			super->s_last_seq = segsum.ss_seq;
+			super->s_state = NANDFS_VALID_FS;
+			seq = segsum.ss_seq;
+			create = segsum.ss_create;
+		} else {
+			seq = segsum.ss_seq;
+			create = segsum.ss_create;
+		}
+
+		/* Calculate next partial segment location */
+		pseg += segsum.ss_nblocks;
+		DPRINTF(VOLUMES, ("%s: next partial seg is %jx\n", __func__,
+		    (uintmax_t)pseg));
+
+		/* Did we reach the end of the segment? if so, go to the next */
+		nandfs_get_segment_range(nandfsdev, segnum, &seg_start,
+		    &seg_end);
+		if (pseg >= seg_end) {
+			pseg = segsum.ss_next;
+			DPRINTF(VOLUMES,
+			    (" partial seg oor next is %jx[%jx - %jx]\n",
+			    (uintmax_t)pseg, (uintmax_t)seg_start,
+			    (uintmax_t)seg_end));
+		}
+		segnum = nandfs_get_segnum_of_block(nandfsdev, pseg);
+	}
+
+	if (error && !found)
+		return (error);
+
+	return (0);
+}
+
+int
+nandfs_get_node_raw(struct nandfs_device *nandfsdev, struct nandfsmount *nmp,
+    uint64_t ino, struct nandfs_inode *inode, struct nandfs_node **nodep)
+{
+	struct nandfs_node *node;
+	struct vnode *nvp;
+	struct mount *mp;
+	int error;
+
+	*nodep = NULL;
+
+	/* Associate with mountpoint if present */
+	if (nmp) {
+		mp = nmp->nm_vfs_mountp;
+		error = getnewvnode("nandfs", mp, &nandfs_vnodeops, &nvp);
+		if (error) {
+			return (error);
+		}
+	} else {
+		mp = NULL;
+		error = getnewvnode("snandfs", mp, &nandfs_system_vnodeops,
+		    &nvp);
+		if (error) {
+			return (error);
+		}
+	}
+
+	if (mp)
+		NANDFS_WRITELOCK(nandfsdev);
+
+	DPRINTF(IFILE, ("%s: ino: %#jx -> vp: %p\n",
+	    __func__, (uintmax_t)ino, nvp));
+	/* Lock node */
+	lockmgr(nvp->v_vnlock, LK_EXCLUSIVE, NULL);
+
+	if (mp) {
+		error = insmntque(nvp, mp);
+		if (error != 0) {
+			*nodep = NULL;
+			return (error);
+		}
+	}
+
+	node = uma_zalloc(nandfs_node_zone, M_WAITOK | M_ZERO);
+
+	/* Crosslink */
+	node->nn_vnode = nvp;
+	nvp->v_bufobj.bo_ops = &buf_ops_nandfs;
+	node->nn_nmp = nmp;
+	node->nn_nandfsdev = nandfsdev;
+	nvp->v_data = node;
+
+	/* Initialise NANDFS node */
+	node->nn_ino = ino;
+	if (inode != NULL)
+		node->nn_inode = *inode;
+
+	nandfs_vinit(nvp, ino);
+
+	/* Return node */
+	*nodep = node;
+	DPRINTF(IFILE, ("%s: ino:%#jx vp:%p node:%p\n",
+	    __func__, (uintmax_t)ino, nvp, *nodep));
+
+	return (0);
+}
+
+int
+nandfs_get_node(struct nandfsmount *nmp, uint64_t ino,
+    struct nandfs_node **nodep)
+{
+	struct nandfs_device *nandfsdev;
+	struct nandfs_inode inode, *entry;
+	struct vnode *nvp, *vpp;
+	struct thread *td;
+	struct buf *bp;
+	uint64_t ivblocknr;
+	uint32_t entry_in_block;
+	int error;
+
+	/* Look up node in hash table */
+	td = curthread;
+	*nodep = NULL;
+
+	if ((ino < NANDFS_ATIME_INO) && (ino != NANDFS_ROOT_INO)) {
+		printf("nandfs_get_node: system ino %"PRIu64" not in mount "
+		    "point!\n", ino);
+		return (ENOENT);
+	}
+
+	error = vfs_hash_get(nmp->nm_vfs_mountp, ino, LK_EXCLUSIVE, td, &nvp,
+	    NULL, NULL);
+	if (error)
+		return (error);
+
+	if (nvp != NULL) {
+		*nodep = (struct nandfs_node *)nvp->v_data;
+		return (0);
+	}
+
+	/* Look up inode structure in mountpoints ifile */
+	nandfsdev = nmp->nm_nandfsdev;
+	nandfs_mdt_trans(&nandfsdev->nd_ifile_mdt, ino, &ivblocknr,
+	    &entry_in_block);
+
+	VOP_LOCK(NTOV(nmp->nm_ifile_node), LK_SHARED);
+	error = nandfs_bread(nmp->nm_ifile_node, ivblocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(nmp->nm_ifile_node), 0);
+		return (ENOENT);
+	}
+
+	/* Get inode entry */
+	entry = (struct nandfs_inode *) bp->b_data + entry_in_block;
+	memcpy(&inode, entry, sizeof(struct nandfs_inode));
+	brelse(bp);
+	VOP_UNLOCK(NTOV(nmp->nm_ifile_node), 0);
+
+	/* Get node */
+	error = nandfs_get_node_raw(nmp->nm_nandfsdev, nmp, ino, &inode, nodep);
+	if (error) {
+		*nodep = NULL;
+		return (error);
+	}
+
+	nvp = (*nodep)->nn_vnode;
+	error = vfs_hash_insert(nvp, ino, 0, td, &vpp, NULL, NULL);
+	if (error) {
+		*nodep = NULL;
+		return (error);
+	}
+
+	return (error);
+}
+
+void
+nandfs_dispose_node(struct nandfs_node **nodep)
+{
+	struct nandfs_node *node;
+	struct vnode *vp;
+
+	/* Protect against rogue values */
+	node = *nodep;
+	if (!node) {
+		return;
+	}
+	DPRINTF(NODE, ("nandfs_dispose_node: %p\n", *nodep));
+
+	vp = NTOV(node);
+	vp->v_data = NULL;
+
+	/* Free our associated memory */
+	uma_zfree(nandfs_node_zone, node);
+
+	*nodep = NULL;
+}
+
+int
+nandfs_lookup_name_in_dir(struct vnode *dvp, const char *name, int namelen,
+    uint64_t *ino, int *found, uint64_t *off)
+{
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_dir_entry	*ndirent;
+	struct buf *bp;
+	uint64_t file_size, diroffset, blkoff;
+	uint64_t blocknr;
+	uint32_t blocksize = dir_node->nn_nandfsdev->nd_blocksize;
+	uint8_t *pos, name_len;
+	int error;
+
+	*found = 0;
+
+	DPRINTF(VNCALL, ("%s: %s file\n", __func__, name));
+	if (dvp->v_type != VDIR) {
+		return (ENOTDIR);
+	}
+
+	/* Get directory filesize */
+	file_size = dir_node->nn_inode.i_size;
+
+	/* Walk the directory */
+	diroffset = 0;
+	blocknr = 0;
+	blkoff = 0;
+	error = nandfs_bread(dir_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (EIO);
+	}
+
+	while (diroffset < file_size) {
+		if (blkoff >= blocksize) {
+			blkoff = 0; blocknr++;
+			brelse(bp);
+			error = nandfs_bread(dir_node, blocknr, NOCRED, 0,
+			    &bp);
+			if (error) {
+				brelse(bp);
+				return (EIO);
+			}
+		}
+
+		/* Read in one dirent */
+		pos = (uint8_t *) bp->b_data + blkoff;
+		ndirent = (struct nandfs_dir_entry *) pos;
+		name_len = ndirent->name_len;
+
+		if ((name_len == namelen) &&
+		    (strncmp(name, ndirent->name, name_len) == 0) &&
+		    (ndirent->inode != 0)) {
+			*ino = ndirent->inode;
+			*off = diroffset;
+			DPRINTF(LOOKUP, ("found `%.*s` with ino %"PRIx64"\n",
+			    name_len, ndirent->name, *ino));
+			*found = 1;
+			break;
+		}
+
+		/* Advance */
+		diroffset += ndirent->rec_len;
+		blkoff += ndirent->rec_len;
+	}
+	brelse(bp);
+
+	return (error);
+}
+
+int
+nandfs_get_fsinfo(struct nandfsmount *nmp, struct nandfs_fsinfo *fsinfo)
+{
+	struct nandfs_device *fsdev;
+
+	fsdev = nmp->nm_nandfsdev;
+
+	memcpy(&fsinfo->fs_fsdata, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata));
+	memcpy(&fsinfo->fs_super, &fsdev->nd_super, sizeof(fsdev->nd_super));
+	snprintf(fsinfo->fs_dev, sizeof(fsinfo->fs_dev),
+	    "%s", nmp->nm_vfs_mountp->mnt_stat.f_mntfromname);
+
+	return (0);
+}
+
+void
+nandfs_inode_init(struct nandfs_inode *inode, uint16_t mode)
+{
+	struct timespec ts;
+
+	vfs_timestamp(&ts);
+
+	inode->i_blocks = 0;
+	inode->i_size = 0;
+	inode->i_ctime = ts.tv_sec;
+	inode->i_ctime_nsec = ts.tv_nsec;
+	inode->i_mtime = ts.tv_sec;
+	inode->i_mtime_nsec = ts.tv_nsec;
+	inode->i_mode = mode;
+	inode->i_links_count = 1;
+	if (S_ISDIR(mode))
+		inode->i_links_count = 2;
+	inode->i_flags = 0;
+
+	inode->i_special = 0;
+	memset(inode->i_db, 0, sizeof(inode->i_db));
+	memset(inode->i_ib, 0, sizeof(inode->i_ib));
+}
+
+void
+nandfs_inode_destroy(struct nandfs_inode *inode)
+{
+
+	MPASS(inode->i_blocks == 0);
+	bzero(inode, sizeof(*inode));
+}
+
+int
+nandfs_fs_full(struct nandfs_device *nffsdev)
+{
+	uint64_t space, bps;
+
+	bps = nffsdev->nd_fsdata.f_blocks_per_segment;
+	space = (nffsdev->nd_clean_segs - 1) * bps;
+
+	DPRINTF(BUF, ("%s: bufs:%jx space:%jx\n", __func__,
+	    (uintmax_t)nffsdev->nd_dirty_bufs, (uintmax_t)space));
+
+	if (nffsdev->nd_dirty_bufs + (10 * bps) >= space)
+		return (1);
+
+	return (0);
+}
+
+static int
+_nandfs_dirty_buf(struct buf *bp, int dirty_meta, int force)
+{
+	struct nandfs_device *nffsdev;
+	struct nandfs_node *node;
+	uint64_t ino, bps;
+
+	if (NANDFS_ISGATHERED(bp)) {
+		bqrelse(bp);
+		return (0);
+	}
+	if ((bp->b_flags & (B_MANAGED | B_DELWRI)) == (B_MANAGED | B_DELWRI)) {
+		bqrelse(bp);
+		return (0);
+	}
+
+	node = VTON(bp->b_vp);
+	nffsdev = node->nn_nandfsdev;
+	DPRINTF(BUF, ("%s: buf:%p\n", __func__, bp));
+	ino = node->nn_ino;
+
+	if (nandfs_fs_full(nffsdev) && !NANDFS_SYS_NODE(ino) && !force) {
+		brelse(bp);
+		return (ENOSPC);
+	}
+
+	bp->b_flags |= B_MANAGED;
+	bdwrite(bp);
+
+	nandfs_dirty_bufs_increment(nffsdev);
+
+	KASSERT((bp->b_vp), ("vp missing for bp"));
+	KASSERT((nandfs_vblk_get(bp) || ino == NANDFS_DAT_INO),
+	    ("bp vblk is 0"));
+
+	/*
+	 * To maintain consistency of FS we need to force making
+	 * meta buffers dirty, even if free space is low.
+	 */
+	if (dirty_meta && ino != NANDFS_GC_INO)
+		nandfs_bmap_dirty_blocks(VTON(bp->b_vp), bp, 1);
+
+	bps = nffsdev->nd_fsdata.f_blocks_per_segment;
+
+	if (nffsdev->nd_dirty_bufs >= (bps * nandfs_max_dirty_segs)) {
+		mtx_lock(&nffsdev->nd_sync_mtx);
+		if (nffsdev->nd_syncing == 0) {
+			DPRINTF(SYNC, ("%s: wakeup gc\n", __func__));
+			nffsdev->nd_syncing = 1;
+			wakeup(&nffsdev->nd_syncing);
+		}
+		mtx_unlock(&nffsdev->nd_sync_mtx);
+	}
+
+	return (0);
+}
+
+int
+nandfs_dirty_buf(struct buf *bp, int force)
+{
+
+	return (_nandfs_dirty_buf(bp, 1, force));
+}
+
+int
+nandfs_dirty_buf_meta(struct buf *bp, int force)
+{
+
+	return (_nandfs_dirty_buf(bp, 0, force));
+}
+
+void
+nandfs_undirty_buf_fsdev(struct nandfs_device *nffsdev, struct buf *bp)
+{
+
+	BUF_ASSERT_HELD(bp);
+
+	if (bp->b_flags & B_DELWRI) {
+		bp->b_flags &= ~(B_DELWRI|B_MANAGED);
+		nandfs_dirty_bufs_decrement(nffsdev);
+	}
+	/*
+	 * Since it is now being written, we can clear its deferred write flag.
+	 */
+	bp->b_flags &= ~B_DEFERRED;
+
+	brelse(bp);
+}
+
+void
+nandfs_undirty_buf(struct buf *bp)
+{
+	struct nandfs_node *node;
+
+	node = VTON(bp->b_vp);
+
+	nandfs_undirty_buf_fsdev(node->nn_nandfsdev, bp);
+}
+
+void
+nandfs_vblk_set(struct buf *bp, nandfs_daddr_t blocknr)
+{
+
+	nandfs_daddr_t *vblk = (nandfs_daddr_t *)(&bp->b_fsprivate1);
+	*vblk = blocknr;
+}
+
+nandfs_daddr_t
+nandfs_vblk_get(struct buf *bp)
+{
+
+	nandfs_daddr_t *vblk = (nandfs_daddr_t *)(&bp->b_fsprivate1);
+	return (*vblk);
+}
+
+void
+nandfs_buf_set(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	flags |= (uintptr_t)bits;
+	bp->b_fsprivate3 = (void *)flags;
+}
+
+void
+nandfs_buf_clear(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	flags &= ~(uintptr_t)bits;
+	bp->b_fsprivate3 = (void *)flags;
+}
+
+int
+nandfs_buf_check(struct buf *bp, uint32_t bits)
+{
+	uintptr_t flags;
+
+	flags = (uintptr_t)bp->b_fsprivate3;
+	if (flags & bits)
+		return (1);
+	return (0);
+}
+
+int
+nandfs_erase(struct nandfs_device *fsdev, off_t offset, size_t size)
+{
+	struct buf *bp;
+	int read_size, error, i;
+
+	DPRINTF(BLOCK, ("%s: performing erase at offset %jx size %zx\n",
+	    __func__, offset, size));
+
+	MPASS(size % fsdev->nd_erasesize == 0);
+
+	if (fsdev->nd_is_nand) {
+		error = g_delete_data(fsdev->nd_gconsumer, offset, size);
+		return (error);
+	}
+
+	if (size > MAXBSIZE)
+		read_size = MAXBSIZE;
+	else
+		read_size = size;
+
+	error = 0;
+	for (i = 0; i < size / MAXBSIZE; i++) {
+		error = bread(fsdev->nd_devvp, btodb(offset + i * read_size),
+		    read_size, NOCRED, &bp);
+		if (error) {
+			brelse(bp);
+			return (error);
+		}
+		memset(bp->b_data, 0xff, read_size);
+		error = bwrite(bp);
+		if (error) {
+			nandfs_error("%s: err:%d from bwrite\n",
+			    __func__, error);
+			return (error);
+		}
+	}
+
+	return (error);
+}
+
+int
+nandfs_vop_islocked(struct vnode *vp)
+{
+	int islocked;
+
+	islocked = VOP_ISLOCKED(vp);
+	return (islocked == LK_EXCLUSIVE || islocked == LK_SHARED);
+}
+
+nandfs_daddr_t
+nandfs_block_to_dblock(struct nandfs_device *fsdev, nandfs_lbn_t block)
+{
+
+	return (btodb(block * fsdev->nd_blocksize));
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_subr.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_subr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_subr.h,v 1.1 2009/07/18 16:31:42 reinoud
+ *
+ * $FreeBSD: head/sys/fs/nandfs/nandfs_subr.h 235537 2012-05-17 10:11:18Z gber $
+ */
+
+#ifndef _FS_NANDFS_NANDFS_SUBR_H_
+#define _FS_NANDFS_NANDFS_SUBR_H_
+
+struct nandfs_mdt;
+
+/*
+ * State carried through the metadata-file entry allocators
+ * (nandfs_find_free_entry() / nandfs_alloc_entry() / nandfs_free_entry()).
+ */
+struct nandfs_alloc_request
+{
+	uint64_t	entrynum;	/* entry number being (de)allocated */
+	struct buf	*bp_desc;	/* descriptor block buffer */
+	struct buf	*bp_bitmap;	/* bitmap block buffer */
+	struct buf	*bp_entry;	/* entry block buffer */
+};
+
+/* Segment creation */
+void nandfs_wakeup_wait_sync(struct nandfs_device *, int);
+int nandfs_segment_constructor(struct nandfsmount *, int);
+int nandfs_sync_file(struct vnode *);
+
+/* Basic calculators */
+uint64_t nandfs_get_segnum_of_block(struct nandfs_device *, nandfs_daddr_t);
+void nandfs_get_segment_range(struct nandfs_device *, uint64_t, uint64_t *,
+    uint64_t *);
+void nandfs_calc_mdt_consts(struct nandfs_device *, struct nandfs_mdt *, int);
+
+/* Log reading / volume helpers */
+int nandfs_search_super_root(struct nandfs_device *);
+
+/* Reading */
+int nandfs_dev_bread(struct nandfs_device *, nandfs_daddr_t, struct ucred *,
+    int, struct buf **);
+int nandfs_bread(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bread_meta(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bdestroy(struct nandfs_node *, nandfs_daddr_t);
+int nandfs_bcreate(struct nandfs_node *, nandfs_lbn_t, struct ucred *, int,
+    struct buf **);
+int nandfs_bcreate_meta(struct nandfs_node *, nandfs_lbn_t, struct ucred *,
+    int, struct buf **);
+int nandfs_bread_create(struct nandfs_node *, nandfs_lbn_t, struct ucred *,
+    int, struct buf **);
+
+/* vtop operations */
+int nandfs_vtop(struct nandfs_node *, nandfs_daddr_t, nandfs_daddr_t *);
+
+/* Node action implementators */
+int nandfs_vinit(struct vnode *, uint64_t);
+int nandfs_get_node(struct nandfsmount *, uint64_t, struct nandfs_node **);
+int nandfs_get_node_raw(struct nandfs_device *, struct nandfsmount *, uint64_t,
+    struct nandfs_inode *, struct nandfs_node **);
+void nandfs_dispose_node(struct nandfs_node **);
+
+void nandfs_itimes(struct vnode *);
+int nandfs_lookup_name_in_dir(struct vnode *, const char *, int, uint64_t *,
+    int *, uint64_t *);
+int nandfs_create_node(struct vnode *, struct vnode **, struct vattr *,
+    struct componentname *);
+void nandfs_delete_node(struct nandfs_node *);
+
+int nandfs_chsize(struct vnode *, u_quad_t, struct ucred *);
+int nandfs_dir_detach(struct nandfsmount *, struct nandfs_node *,
+    struct nandfs_node *, struct componentname *);
+int nandfs_dir_attach(struct nandfsmount *, struct nandfs_node *,
+    struct nandfs_node *, struct vattr *, struct componentname *);
+
+int nandfs_dirty_buf(struct buf *, int);
+int nandfs_dirty_buf_meta(struct buf *, int);
+int nandfs_fs_full(struct nandfs_device *);
+void nandfs_undirty_buf_fsdev(struct nandfs_device *, struct buf *);
+void nandfs_undirty_buf(struct buf *);
+
+void nandfs_clear_buf(struct buf *);
+void nandfs_buf_set(struct buf *, uint32_t);
+void nandfs_buf_clear(struct buf *, uint32_t);
+int nandfs_buf_check(struct buf *, uint32_t);
+
+int  nandfs_find_free_entry(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *);
+int  nandfs_find_entry(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *);
+int  nandfs_alloc_entry(struct nandfs_mdt *, struct nandfs_alloc_request *);
+void nandfs_abort_entry(struct nandfs_alloc_request *);
+int  nandfs_free_entry(struct nandfs_mdt *, struct nandfs_alloc_request *);
+int nandfs_get_entry_block(struct nandfs_mdt *, struct nandfs_node *,
+    struct nandfs_alloc_request *, uint32_t *, int);
+
+/* inode management */
+int  nandfs_node_create(struct nandfsmount *, struct nandfs_node **, uint16_t);
+int nandfs_node_destroy(struct nandfs_node *);
+int nandfs_node_update(struct nandfs_node *);
+int nandfs_get_node_entry(struct nandfsmount *, struct nandfs_inode **,
+    uint64_t, struct buf **);
+void nandfs_mdt_trans_blk(struct nandfs_mdt *, uint64_t, uint64_t *,
+    uint64_t *, nandfs_lbn_t *, uint32_t *);
+
+/* vblock management */
+void nandfs_mdt_trans(struct nandfs_mdt *, uint64_t, nandfs_lbn_t *, uint32_t *);
+int nandfs_vblock_alloc(struct nandfs_device *, nandfs_daddr_t *);
+int nandfs_vblock_end(struct nandfs_device *, nandfs_daddr_t);
+int nandfs_vblock_assign(struct nandfs_device *, nandfs_daddr_t,
+    nandfs_lbn_t);
+int nandfs_vblock_free(struct nandfs_device *, nandfs_daddr_t);
+
+/* Checkpoint management */
+int nandfs_get_checkpoint(struct nandfs_device *, struct nandfs_node *,
+    uint64_t);
+int nandfs_set_checkpoint(struct nandfs_device *, struct nandfs_node *,
+    uint64_t, struct nandfs_inode *, uint64_t);
+
+/* Segment management */
+int nandfs_alloc_segment(struct nandfs_device *, uint64_t *);
+int nandfs_update_segment(struct nandfs_device *, uint64_t, uint32_t);
+int nandfs_free_segment(struct nandfs_device *, uint64_t);
+int nandfs_clear_segment(struct nandfs_device *, uint64_t);
+int nandfs_touch_segment(struct nandfs_device *, uint64_t);
+int nandfs_markgc_segment(struct nandfs_device *, uint64_t);
+
+int nandfs_bmap_insert_block(struct nandfs_node *, nandfs_lbn_t, struct buf *);
+int nandfs_bmap_update_block(struct nandfs_node *, struct buf *, nandfs_lbn_t);
+int nandfs_bmap_update_dat(struct nandfs_node *, nandfs_daddr_t, struct buf *);
+int nandfs_bmap_dirty_blocks(struct nandfs_node *, struct buf *, int);
+int nandfs_bmap_truncate_mapping(struct nandfs_node *, nandfs_lbn_t,
+    nandfs_lbn_t);
+int nandfs_bmap_lookup(struct nandfs_node *, nandfs_lbn_t, nandfs_daddr_t *);
+
+/* dirent */
+int nandfs_add_dirent(struct vnode *, uint64_t, char *, long, uint8_t);
+int nandfs_remove_dirent(struct vnode *, struct nandfs_node *,
+    struct componentname *);
+int nandfs_update_dirent(struct vnode *, struct nandfs_node *,
+    struct nandfs_node *);
+int nandfs_init_dir(struct vnode *, uint64_t, uint64_t);
+int nandfs_update_parent_dir(struct vnode *, uint64_t);
+
+void nandfs_vblk_set(struct buf *, nandfs_daddr_t);
+nandfs_daddr_t nandfs_vblk_get(struct buf *);
+
+void nandfs_inode_init(struct nandfs_inode *, uint16_t);
+void nandfs_inode_destroy(struct nandfs_inode *);
+
+/* ioctl */
+int nandfs_get_seg_stat(struct nandfs_device *, struct nandfs_seg_stat *);
+int nandfs_chng_cpmode(struct nandfs_node *, struct nandfs_cpmode *);
+int nandfs_get_cpinfo_ioctl(struct nandfs_node *, struct nandfs_argv *);
+int nandfs_delete_cp(struct nandfs_node *, uint64_t start, uint64_t);
+int nandfs_make_snap(struct nandfs_device *, uint64_t *);
+int nandfs_delete_snap(struct nandfs_device *, uint64_t);
+int nandfs_get_cpstat(struct nandfs_node *, struct nandfs_cpstat *);
+int nandfs_get_segment_info_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_dat_vinfo_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_dat_bdescs_ioctl(struct nandfs_device *, struct nandfs_argv *);
+int nandfs_get_fsinfo(struct nandfsmount *, struct nandfs_fsinfo *);
+
+int nandfs_get_cpinfo(struct nandfs_node *, uint64_t, uint16_t,
+    struct nandfs_cpinfo *, uint32_t, uint32_t *);
+
+nandfs_lbn_t nandfs_get_maxfilesize(struct nandfs_device *);
+
+int nandfs_write_superblock(struct nandfs_device *);
+
+extern int nandfs_sync_interval;
+extern int nandfs_max_dirty_segs;
+extern int nandfs_cps_between_sblocks;
+
+struct buf *nandfs_geteblk(int, int);
+
+void nandfs_dirty_bufs_increment(struct nandfs_device *);
+void nandfs_dirty_bufs_decrement(struct nandfs_device *);
+
+int nandfs_start_cleaner(struct nandfs_device *);
+int nandfs_stop_cleaner(struct nandfs_device *);
+
+int nandfs_segsum_valid(struct nandfs_segment_summary *);
+int nandfs_load_segsum(struct nandfs_device *, nandfs_daddr_t,
+    struct nandfs_segment_summary *);
+int nandfs_get_segment_info(struct nandfs_device *, struct nandfs_suinfo *,
+    uint32_t, uint64_t);
+int nandfs_get_segment_info_filter(struct nandfs_device *,
+    struct nandfs_suinfo *, uint32_t, uint64_t, uint64_t *, uint32_t, uint32_t);
+int nandfs_get_dat_vinfo(struct nandfs_device *, struct nandfs_vinfo *,
+    uint32_t);
+int nandfs_get_dat_bdescs(struct nandfs_device *, struct nandfs_bdesc *,
+    uint32_t);
+
+#define	NANDFS_VBLK_ASSIGNED	1
+
+#define	NANDFS_IS_INDIRECT(bp)	((bp)->b_lblkno < 0)
+
+int nandfs_erase(struct nandfs_device *, off_t, size_t);
+
+#define	NANDFS_VOP_ISLOCKED(vp)	nandfs_vop_islocked((vp))
+int nandfs_vop_islocked(struct vnode *vp);
+
+nandfs_daddr_t nandfs_block_to_dblock(struct nandfs_device *, nandfs_lbn_t);
+
+#define DEBUG_MODE
+#if defined(DEBUG_MODE)
+#define	nandfs_error		panic
+#define	nandfs_warning		printf
+#elif defined(TEST_MODE)
+#define	nandfs_error	printf
+#define	nandfs_warning	printf
+#else
+#define	nandfs_error(...)
+#define	nandfs_warning(...)
+#endif
+
+#endif	/* !_FS_NANDFS_NANDFS_SUBR_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_sufile.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_sufile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,569 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_sufile.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+#define	SU_USAGE_OFF(bp, offset) \
+	((struct nandfs_segment_usage *)((bp)->b_data + offset))
+
+/*
+ * Translate segment number 'seg' into the SU-file logical block number
+ * ('blk') and the byte offset of its usage entry within that block
+ * ('offset').  Usage entries start after the sufile header, rounded up
+ * to the entry size, so block 0 holds both the header and the first
+ * entries.  Always returns 0.
+ */
+static int
+nandfs_seg_usage_blk_offset(struct nandfs_device *fsdev, uint64_t seg,
+    uint64_t *blk, uint64_t *offset)
+{
+	uint64_t off;
+	uint16_t seg_size;
+
+	seg_size = fsdev->nd_fsdata.f_segment_usage_size;
+
+	off = roundup(sizeof(struct nandfs_sufile_header), seg_size);
+	off += (seg * seg_size);
+
+	*blk = off / fsdev->nd_blocksize;
+	*offset = off % fsdev->nd_blocksize;
+	return (0);
+}
+
+/*
+ * Allocate a new segment: scan the SU file starting right after the
+ * last allocated segment for an entry with no flags set, mark it
+ * allocated and update the header counters and the superblock's free
+ * block count.  The caller must hold the SU-file vnode lock.  Returns
+ * 0 on success, 1 when no free segment exists, or an errno.
+ */
+int
+nandfs_alloc_segment(struct nandfs_device *fsdev, uint64_t *seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_sufile_header *su_header;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp_header, *bp;
+	uint64_t blk, vblk, offset, i, rest, nsegments;
+	uint16_t seg_size;
+	int error, found;
+
+	seg_size = fsdev->nd_fsdata.f_segment_usage_size;
+	nsegments = fsdev->nd_fsdata.f_nsegments;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	/* Read header buffer */
+	error = nandfs_bread(su_node, 0, NOCRED, 0, &bp_header);
+	if (error) {
+		brelse(bp_header);
+		return (error);
+	}
+
+	su_header = (struct nandfs_sufile_header *)bp_header->b_data;
+
+	/* Start the search right after the last allocated segment */
+	i = su_header->sh_last_alloc + 1;
+
+	found = 0;
+	bp = NULL;
+	while (!found) {
+		nandfs_seg_usage_blk_offset(fsdev, i, &blk, &offset);
+		if (blk != 0) {
+			error = nandfs_bmap_lookup(su_node, blk, &vblk);
+			if (error) {
+				nandfs_error("%s: cannot find vblk for blk "
+				    "blk:%jx\n", __func__, blk);
+				/* Release the header buf; it leaked here */
+				brelse(bp_header);
+				return (error);
+			}
+			if (vblk)
+				error = nandfs_bread(su_node, blk, NOCRED, 0,
+				    &bp);
+			else
+				error = nandfs_bcreate(su_node, blk, NOCRED, 0,
+				    &bp);
+			if (error) {
+				nandfs_error("%s: cannot create/read "
+				    "vblk:%jx\n", __func__, vblk);
+				if (bp)
+					brelse(bp);
+				/* Release the header buf; it leaked here */
+				brelse(bp_header);
+				return (error);
+			}
+
+			su_usage = SU_USAGE_OFF(bp, offset);
+		} else {
+			/* Entry lives in the same block as the header */
+			su_usage = SU_USAGE_OFF(bp_header, offset);
+			bp = bp_header;
+		}
+
+		rest = (fsdev->nd_blocksize - offset) / seg_size;
+		/* Go through all su usage entries in this block */
+		while (rest) {
+			/* Past the last segment: wrap to the beginning */
+			if (i == nsegments)
+				break;
+
+			if (!su_usage->su_flags) {
+				su_usage->su_flags = 1;
+				found = 1;
+				break;
+			}
+			su_usage++;
+			i++;
+
+			/* If all segments were checked, give up */
+			if (i == su_header->sh_last_alloc) {
+				DPRINTF(SEG, ("%s: cannot allocate segment \n",
+				    __func__));
+				brelse(bp_header);
+				if (blk != 0)
+					brelse(bp);
+				return (1);
+			}
+			rest--;
+		}
+		if (!found) {
+			/* Otherwise read another block */
+			if (blk != 0)
+				brelse(bp);
+			if (i == nsegments) {
+				blk = 0;
+				i = 0;
+			} else
+				blk++;
+			offset = 0;
+		}
+	}
+
+	if (found) {
+		*seg = i;
+		su_header->sh_last_alloc = i;
+		su_header->sh_ncleansegs--;
+		su_header->sh_ndirtysegs++;
+
+		fsdev->nd_super.s_free_blocks_count = su_header->sh_ncleansegs *
+		    fsdev->nd_fsdata.f_blocks_per_segment;
+		fsdev->nd_clean_segs--;
+
+		/*
+		 * It is mostly called from syncer() so we want to force
+		 * making buf dirty.
+		 */
+		error = nandfs_dirty_buf(bp_header, 1);
+		if (error) {
+			/*
+			 * NOTE(review): bp_header is not released here —
+			 * assumes nandfs_dirty_buf() consumes the buffer
+			 * on failure; confirm against its implementation.
+			 */
+			if (bp && bp != bp_header)
+				brelse(bp);
+			return (error);
+		}
+		if (bp && bp != bp_header)
+			nandfs_dirty_buf(bp, 1);
+
+		DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)i));
+
+		return (0);
+	}
+
+	DPRINTF(SEG, ("%s: failed\n", __func__));
+
+	return (1);
+}
+
+/*
+ * Mark the SU-file block holding segment 'seg''s usage entry dirty
+ * without changing it, so the syncer gathers and rewrites it soon.
+ * The caller must hold the SU-file vnode lock.  Returns 0 or an errno
+ * from the block read.
+ */
+int
+nandfs_touch_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		nandfs_error("%s: cannot preallocate new segment\n", __func__);
+		return (error);
+	} else
+		nandfs_dirty_buf(bp, 1);
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+	return (error);
+}
+
+/*
+ * Update the block count of segment 'seg': add 'nblks' to its in-use
+ * block counter, stamp the modification time, flag it dirty in the SU
+ * file and mark the buffer for write-out.  The caller must hold the
+ * SU-file vnode lock.  Returns 0 or an errno from the block read.
+ */
+int
+nandfs_update_segment(struct nandfs_device *fsdev, uint64_t seg, uint32_t nblks)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		nandfs_error("%s: read block:%jx to update\n",
+		    __func__, blk);
+		brelse(bp);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_flags = NANDFS_SEGMENT_USAGE_DIRTY;
+	su_usage->su_nblocks += nblks;
+
+	DPRINTF(SEG, ("%s: seg:%#jx inc:%#x cur:%#x\n",  __func__,
+	    (uintmax_t)seg, nblks, su_usage->su_nblocks));
+
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Make segment 'seg' free again: clear its usage entry, bump the
+ * clean/dirty counters in the sufile header and refresh the
+ * superblock's free block count.  The caller must hold the SU-file
+ * vnode lock.  Returns 0 or an errno.
+ */
+int
+nandfs_free_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_sufile_header *su_header;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp_header, *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	/* Read su header */
+	error = nandfs_bread(su_node, 0, NOCRED, 0, &bp_header);
+	if (error) {
+		brelse(bp_header);
+		return (error);
+	}
+
+	su_header = (struct nandfs_sufile_header *)bp_header->b_data;
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	/* Read su usage block if other than su header block */
+	if (blk != 0) {
+		error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			brelse(bp_header);
+			return (error);
+		}
+	} else
+		bp = bp_header;	/* entry shares block 0 with the header */
+
+	/* Reset su usage data */
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_nblocks = 0;
+	su_usage->su_flags = 0;
+
+	/* Update clean/dirty counter in header */
+	su_header->sh_ncleansegs++;
+	su_header->sh_ndirtysegs--;
+
+	/*
+	 * Make the buffers dirty.  This runs on behalf of the cleaner,
+	 * so force dirtying even when little space is left on the
+	 * device.
+	 */
+	nandfs_dirty_buf(bp_header, 1);
+	if (bp != bp_header)
+		nandfs_dirty_buf(bp, 1);
+
+	/* Update free block count */
+	fsdev->nd_super.s_free_blocks_count = su_header->sh_ncleansegs *
+	    fsdev->nd_fsdata.f_blocks_per_segment;
+	fsdev->nd_clean_segs++;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	return (0);
+}
+
+/*
+ * Mark segment 'seg' as bad (erase failed): set the ERROR flag in its
+ * usage entry so it is never allocated again, and stamp the
+ * modification time.  The caller must hold the SU-file vnode lock.
+ * Returns 0 or an errno from the block read.
+ */
+static int
+nandfs_bad_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+	ASSERT_VOP_LOCKED(NTOV(su_node), __func__);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	su_usage->su_lastmod = fsdev->nd_ts.tv_sec;
+	su_usage->su_flags = NANDFS_SEGMENT_USAGE_ERROR;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	nandfs_dirty_buf(bp, 1);
+
+	return (0);
+}
+
+/*
+ * Flag segment 'seg' as being garbage-collected.  Unlike the other
+ * SU-file helpers this one takes the vnode lock itself.
+ *
+ * NOTE(review): the GC flag is set in the buffer but the buffer is
+ * released with brelse() rather than dirtied, so the flag appears to
+ * live only in-core until something else writes the block — confirm
+ * this is intentional.
+ */
+int
+nandfs_markgc_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	struct nandfs_node *su_node;
+	struct nandfs_segment_usage *su_usage;
+	struct buf *bp;
+	uint64_t blk, offset;
+	int error;
+
+	su_node = fsdev->nd_su_node;
+
+	VOP_LOCK(NTOV(su_node), LK_EXCLUSIVE);
+
+	nandfs_seg_usage_blk_offset(fsdev, seg, &blk, &offset);
+
+	error = nandfs_bread(su_node, blk, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(su_node), 0);
+		return (error);
+	}
+
+	su_usage = SU_USAGE_OFF(bp, offset);
+	MPASS((su_usage->su_flags & NANDFS_SEGMENT_USAGE_GC) == 0);
+	su_usage->su_flags |= NANDFS_SEGMENT_USAGE_GC;
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	return (0);
+}
+
+/*
+ * Erase segment 'seg' on the medium and return it to the free pool.
+ * If the erase fails the segment is marked bad instead; if marking it
+ * bad also fails, that error is returned and the segment is not freed.
+ */
+int
+nandfs_clear_segment(struct nandfs_device *fsdev, uint64_t seg)
+{
+	uint64_t start, seglen;
+	uint32_t blks_per_seg, blksize;
+	int error;
+
+	blks_per_seg = fsdev->nd_fsdata.f_blocks_per_segment;
+	blksize = fsdev->nd_blocksize;
+	seglen = (uint64_t)blksize * blks_per_seg;
+	nandfs_get_segment_range(fsdev, seg, &start, NULL);
+	start *= blksize;
+
+	DPRINTF(SEG, ("%s: seg:%#jx\n", __func__, (uintmax_t)seg));
+
+	/* Erase it; when that fails, mark the segment bad */
+	if (nandfs_erase(fsdev, start, seglen) != 0) {
+		error = nandfs_bad_segment(fsdev, seg);
+		if (error)
+			return (error);
+	}
+
+	/* Mark it free */
+	return (nandfs_free_segment(fsdev, seg));
+}
+
+/*
+ * Fill 'nss' with segment-usage statistics read from the sufile header
+ * and device state.  Takes the device write lock and a shared vnode
+ * lock on the SU file.  Returns 0 on success or -1 (not an errno) when
+ * the header block cannot be read.
+ */
+int
+nandfs_get_seg_stat(struct nandfs_device *nandfsdev,
+    struct nandfs_seg_stat *nss)
+{
+	struct nandfs_sufile_header *suhdr;
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	int err;
+
+	su_node = nandfsdev->nd_su_node;
+
+	NANDFS_WRITELOCK(nandfsdev);
+	VOP_LOCK(NTOV(su_node), LK_SHARED);
+	err = nandfs_bread(nandfsdev->nd_su_node, 0, NOCRED, 0, &bp);
+	if (err) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(su_node), 0);
+		NANDFS_WRITEUNLOCK(nandfsdev);
+		return (-1);
+	}
+
+	suhdr = (struct nandfs_sufile_header *)bp->b_data;
+	nss->nss_nsegs = nandfsdev->nd_fsdata.f_nsegments;
+	nss->nss_ncleansegs = suhdr->sh_ncleansegs;
+	nss->nss_ndirtysegs = suhdr->sh_ndirtysegs;
+	nss->nss_ctime = 0;	/* not tracked */
+	nss->nss_nongc_ctime = nandfsdev->nd_ts.tv_sec;
+	nss->nss_prot_seq = nandfsdev->nd_seg_sequence;
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+
+	NANDFS_WRITEUNLOCK(nandfsdev);
+
+	return (0);
+}
+
+/*
+ * Ioctl backend: gather segment usage info for up to nv_nmembs
+ * segments starting at nv_index and copy the array out to the
+ * user-supplied buffer at nv_base.  The member count is bounded by
+ * NANDFS_SEGMENTS_MAX, which also bounds the kernel allocation.
+ * Returns 0 or an errno.
+ */
+int
+nandfs_get_segment_info_ioctl(struct nandfs_device *fsdev,
+    struct nandfs_argv *nargv)
+{
+	struct nandfs_suinfo *nsi;
+	int error;
+
+	if (nargv->nv_nmembs > NANDFS_SEGMENTS_MAX)
+		return (EINVAL);
+
+	nsi = malloc(sizeof(struct nandfs_suinfo) * nargv->nv_nmembs,
+	    M_NANDFSTEMP, M_WAITOK | M_ZERO);
+
+	error = nandfs_get_segment_info(fsdev, nsi, nargv->nv_nmembs,
+	    nargv->nv_index);
+
+	if (error == 0)
+		error = copyout(nsi, (void *)(uintptr_t)nargv->nv_base,
+		    sizeof(struct nandfs_suinfo) * nargv->nv_nmembs);
+
+	free(nsi, M_NANDFSTEMP);
+	return (error);
+}
+
+/*
+ * Convenience wrapper: fetch usage info for up to 'nmembs' segments
+ * starting at 'segment' with no flag filtering.
+ */
+int
+nandfs_get_segment_info(struct nandfs_device *fsdev, struct nandfs_suinfo *nsi,
+    uint32_t nmembs, uint64_t segment)
+{
+
+	return (nandfs_get_segment_info_filter(fsdev, nsi, nmembs, segment,
+	    NULL, 0, 0));
+}
+
+/*
+ * Scan segment usage entries starting at 'segment' and fill up to
+ * 'nmembs' suinfo records.  Entries with any 'nfilter' bit set are
+ * skipped; when 'filter' is non-zero, only entries with at least one
+ * 'filter' bit set are reported.  The current and next segments get
+ * NANDFS_SEGMENT_USAGE_ACTIVE added to their reported flags.  If
+ * 'nsegs' is non-NULL it receives the number of records written.
+ * Serializes against segment construction and takes a shared lock on
+ * the SU-file vnode.  Returns 0 or an errno from a block read.
+ */
+int
+nandfs_get_segment_info_filter(struct nandfs_device *fsdev,
+    struct nandfs_suinfo *nsi, uint32_t nmembs, uint64_t segment,
+    uint64_t *nsegs, uint32_t filter, uint32_t nfilter)
+{
+	struct nandfs_segment_usage *su;
+	struct nandfs_node *su_node;
+	struct buf *bp;
+	uint64_t curr, blocknr, blockoff, i;
+	uint32_t flags;
+	int err = 0;
+
+	/* Sentinel: no SU-file block cached yet */
+	curr = ~(0);
+
+	lockmgr(&fsdev->nd_seg_const, LK_EXCLUSIVE, NULL);
+	su_node = fsdev->nd_su_node;
+
+	VOP_LOCK(NTOV(su_node), LK_SHARED);
+
+	bp = NULL;
+	if (nsegs !=  NULL)
+		*nsegs = 0;
+	/* 'i' counts records written; 'segment' advances every pass */
+	for (i = 0; i < nmembs; segment++) {
+		if (segment == fsdev->nd_fsdata.f_nsegments)
+			break;
+
+		nandfs_seg_usage_blk_offset(fsdev, segment, &blocknr,
+		    &blockoff);
+
+		/* Re-read only when the entry lives in a different block */
+		if (i == 0 || curr != blocknr) {
+			if (bp != NULL)
+				brelse(bp);
+			err = nandfs_bread(su_node, blocknr, NOCRED,
+			    0, &bp);
+			if (err) {
+				goto out;
+			}
+			curr = blocknr;
+		}
+
+		su = SU_USAGE_OFF(bp, blockoff);
+		flags = su->su_flags;
+		if (segment == fsdev->nd_seg_num ||
+		    segment == fsdev->nd_next_seg_num)
+			flags |= NANDFS_SEGMENT_USAGE_ACTIVE;
+
+		if (nfilter != 0 && (flags & nfilter) != 0)
+			continue;
+		if (filter != 0 && (flags & filter) == 0)
+			continue;
+
+		nsi->nsi_num = segment;
+		nsi->nsi_lastmod = su->su_lastmod;
+		nsi->nsi_blocks = su->su_nblocks;
+		nsi->nsi_flags = flags;
+		nsi++;
+		i++;
+		if (nsegs != NULL)
+			(*nsegs)++;
+	}
+
+out:
+	if (bp != NULL)
+		brelse(bp);
+	VOP_UNLOCK(NTOV(su_node), 0);
+	lockmgr(&fsdev->nd_seg_const, LK_RELEASE, NULL);
+
+	return (err);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_vfsops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,1590 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_vfsops.c,v 1.1 2009/07/18 16:31:42 reinoud Exp
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_vfsops.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/sysctl.h>
+#include <sys/libkern.h>
+
+#include <geom/geom.h>
+#include <geom/geom_vfs.h>
+
+#include <machine/_inttypes.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+static MALLOC_DEFINE(M_NANDFSMNT, "nandfs_mount", "NANDFS mount structure");
+
+#define	NANDFS_SET_SYSTEMFILE(vp) {	\
+	(vp)->v_vflag |= VV_SYSTEM;	\
+	vref(vp);			\
+	vput(vp); }
+
+#define	NANDFS_UNSET_SYSTEMFILE(vp) {	\
+	VOP_LOCK(vp, LK_EXCLUSIVE);	\
+	MPASS(vp->v_bufobj.bo_dirty.bv_cnt == 0); \
+	(vp)->v_vflag &= ~VV_SYSTEM;	\
+	vgone(vp);			\
+	vput(vp); }
+
+/* Globals */
+struct _nandfs_devices nandfs_devices;
+
+/* Parameters */
+int nandfs_verbose = 0;
+
+/* Fetch the vfs.nandfs.verbose loader tunable at VFS subsystem init. */
+static void
+nandfs_tunable_init(void *arg)
+{
+
+	TUNABLE_INT_FETCH("vfs.nandfs.verbose", &nandfs_verbose);
+}
+SYSINIT(nandfs_tunables, SI_SUB_VFS, SI_ORDER_ANY, nandfs_tunable_init, NULL);
+
+static SYSCTL_NODE(_vfs, OID_AUTO, nandfs, CTLFLAG_RD, 0, "NAND filesystem");
+static SYSCTL_NODE(_vfs_nandfs, OID_AUTO, mount, CTLFLAG_RD, 0,
+    "NANDFS mountpoints");
+SYSCTL_INT(_vfs_nandfs, OID_AUTO, verbose, CTLFLAG_RW, &nandfs_verbose, 0, "");
+
+#define NANDFS_CONSTR_INTERVAL	5
+int nandfs_sync_interval = NANDFS_CONSTR_INTERVAL; /* sync every 5 seconds */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, sync_interval, CTLFLAG_RW,
+    &nandfs_sync_interval, 0, "");
+
+#define NANDFS_MAX_DIRTY_SEGS	5
+int nandfs_max_dirty_segs = NANDFS_MAX_DIRTY_SEGS; /* sync when 5 dirty seg */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, max_dirty_segs, CTLFLAG_RW,
+    &nandfs_max_dirty_segs, 0, "");
+
+#define NANDFS_CPS_BETWEEN_SBLOCKS 5
+int nandfs_cps_between_sblocks = NANDFS_CPS_BETWEEN_SBLOCKS; /* write superblock every 5 checkpoints */
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cps_between_sblocks, CTLFLAG_RW,
+    &nandfs_cps_between_sblocks, 0, "");
+
+#define NANDFS_CLEANER_ENABLE 1
+int nandfs_cleaner_enable = NANDFS_CLEANER_ENABLE;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_enable, CTLFLAG_RW,
+    &nandfs_cleaner_enable, 0, "");
+
+#define NANDFS_CLEANER_INTERVAL 5
+int nandfs_cleaner_interval = NANDFS_CLEANER_INTERVAL;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_interval, CTLFLAG_RW,
+    &nandfs_cleaner_interval, 0, "");
+
+#define NANDFS_CLEANER_SEGMENTS 5
+int nandfs_cleaner_segments = NANDFS_CLEANER_SEGMENTS;
+SYSCTL_UINT(_vfs_nandfs, OID_AUTO, cleaner_segments, CTLFLAG_RW,
+    &nandfs_cleaner_segments, 0, "");
+
+static int nandfs_mountfs(struct vnode *devvp, struct mount *mp);
+static vfs_mount_t	nandfs_mount;
+static vfs_root_t	nandfs_root;
+static vfs_statfs_t	nandfs_statfs;
+static vfs_unmount_t	nandfs_unmount;
+static vfs_vget_t	nandfs_vget;
+static vfs_sync_t	nandfs_sync;
+static const char *nandfs_opts[] = {
+	"snap", "from", "noatime", NULL
+};
+
+/* System nodes */
+/*
+ * Instantiate the four system inodes (DAT, cpfile, sufile, GC) from
+ * the super-root and flag their vnodes VV_SYSTEM.  On any failure all
+ * four node pointers are passed to nandfs_dispose_node().
+ *
+ * NOTE(review): the errorout path disposes nodes that were never
+ * created — this assumes nandfs_dispose_node() tolerates NULL/unset
+ * entries; confirm against its implementation.
+ */
+static int
+nandfs_create_system_nodes(struct nandfs_device *nandfsdev)
+{
+	int error;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_DAT_INO,
+	    &nandfsdev->nd_super_root.sr_dat, &nandfsdev->nd_dat_node);
+	if (error)
+		goto errorout;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_CPFILE_INO,
+	    &nandfsdev->nd_super_root.sr_cpfile, &nandfsdev->nd_cp_node);
+	if (error)
+		goto errorout;
+
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_SUFILE_INO,
+	    &nandfsdev->nd_super_root.sr_sufile, &nandfsdev->nd_su_node);
+	if (error)
+		goto errorout;
+
+	/* The GC node has no on-media inode backing it */
+	error = nandfs_get_node_raw(nandfsdev, NULL, NANDFS_GC_INO,
+	    NULL, &nandfsdev->nd_gc_node);
+	if (error)
+		goto errorout;
+
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
+	NANDFS_SET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
+
+	DPRINTF(VOLUMES, ("System vnodes: dat: %p cp: %p su: %p\n",
+	    NTOV(nandfsdev->nd_dat_node), NTOV(nandfsdev->nd_cp_node),
+	    NTOV(nandfsdev->nd_su_node)));
+	return (0);
+
+errorout:
+	nandfs_dispose_node(&nandfsdev->nd_gc_node);
+	nandfs_dispose_node(&nandfsdev->nd_dat_node);
+	nandfs_dispose_node(&nandfsdev->nd_cp_node);
+	nandfs_dispose_node(&nandfsdev->nd_su_node);
+
+	return (error);
+}
+
+/*
+ * Drop the VV_SYSTEM flag and vgone the system vnodes, but only when
+ * the device is no longer referenced by any mount.
+ */
+static void
+nandfs_release_system_nodes(struct nandfs_device *nandfsdev)
+{
+
+	if (!nandfsdev)
+		return;
+	if (nandfsdev->nd_refcnt > 0)
+		return;
+
+	if (nandfsdev->nd_gc_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_gc_node));
+	if (nandfsdev->nd_dat_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_dat_node));
+	if (nandfsdev->nd_cp_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_cp_node));
+	if (nandfsdev->nd_su_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nandfsdev->nd_su_node));
+}
+
+/*
+ * Validate a fsdata block: check the magic, then recompute the CRC32
+ * over f_bytes bytes with the checksum field temporarily zeroed (the
+ * stored sum is restored afterwards).  Returns non-zero when valid.
+ */
+static int
+nandfs_check_fsdata_crc(struct nandfs_fsdata *fsdata)
+{
+	uint32_t fsdata_crc, comp_crc;
+
+	if (fsdata->f_magic != NANDFS_FSDATA_MAGIC)
+		return (0);
+
+	/* Preserve CRC */
+	fsdata_crc = fsdata->f_sum;
+
+	/* Calculate */
+	fsdata->f_sum = (0);
+	comp_crc = crc32((uint8_t *)fsdata, fsdata->f_bytes);
+
+	/* Restore */
+	fsdata->f_sum = fsdata_crc;
+
+	/* Check CRC */
+	return (fsdata_crc == comp_crc);
+}
+
+/*
+ * Validate a superblock: check the magic, then recompute the CRC32
+ * over f_sbbytes bytes with the checksum field temporarily zeroed
+ * (the stored sum is restored afterwards).  Returns non-zero when
+ * valid.
+ */
+static int
+nandfs_check_superblock_crc(struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	uint32_t super_crc, comp_crc;
+
+	/* Check super block magic */
+	if (super->s_magic != NANDFS_SUPER_MAGIC)
+		return (0);
+
+	/* Preserve CRC */
+	super_crc = super->s_sum;
+
+	/* Calculate */
+	super->s_sum = (0);
+	comp_crc = crc32((uint8_t *)super, fsdata->f_sbbytes);
+
+	/* Restore */
+	super->s_sum = super_crc;
+
+	/* Check CRC */
+	return (super_crc == comp_crc);
+}
+
+/*
+ * Recompute and store the super block's checksum.  The CRC32 covers the
+ * first f_sbbytes of the structure with the checksum field itself set
+ * to zero, matching nandfs_check_superblock_crc().
+ */
+static void
+nandfs_calc_superblock_crc(struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	super->s_sum = 0;
+	super->s_sum = crc32((uint8_t *)super, fsdata->f_sbbytes);
+}
+
+/*
+ * Return 1 when the given region contains only 0xff bytes (the state of
+ * erased flash), 0 as soon as any other byte is found.
+ */
+static int
+nandfs_is_empty(u_char *area, int size)
+{
+	int idx;
+
+	for (idx = 0; idx < size; idx++) {
+		if (area[idx] != 0xff)
+			return (0);
+	}
+
+	return (1);
+}
+
+/*
+ * Number of super-block slots that fit in one erase block, after the
+ * leading NANDFS_SBLOCK_OFFSET_BYTES reserved area.
+ */
+static __inline int
+nandfs_sblocks_in_esize(struct nandfs_device *fsdev)
+{
+	int avail;
+
+	avail = fsdev->nd_erasesize - NANDFS_SBLOCK_OFFSET_BYTES;
+	return (avail / (int)sizeof(struct nandfs_super_block));
+}
+
+/* Total super-block slots across all reserved flash areas. */
+static __inline int
+nandfs_max_sblocks(struct nandfs_device *fsdev)
+{
+
+	return (nandfs_sblocks_in_esize(fsdev) * NANDFS_NFSAREAS);
+}
+
+/* Number of super-block slots that fit in one device sector. */
+static __inline int
+nandfs_sblocks_in_block(struct nandfs_device *fsdev)
+{
+	int per_sector;
+
+	per_sector = fsdev->nd_devblocksize /
+	    sizeof(struct nandfs_super_block);
+	return (per_sector);
+}
+
+/*
+ * Super-block slots in the first sector of an area.  The first sector
+ * loses slots to the NANDFS_SBLOCK_OFFSET_BYTES reserved region; the
+ * result is clamped at zero when the reserved region covers the whole
+ * sector.
+ */
+static __inline int
+nandfs_sblocks_in_first_block(struct nandfs_device *fsdev)
+{
+	int reserved, slots;
+
+	reserved = NANDFS_SBLOCK_OFFSET_BYTES /
+	    sizeof(struct nandfs_super_block);
+	slots = nandfs_sblocks_in_block(fsdev) - reserved;
+
+	return (slots > 0 ? slots : 0);
+}
+
+/*
+ * Store the in-core super block into the next free slot of fs area
+ * 'fstp'.  Slots within an erase block are consumed sequentially; when
+ * the last slot was used, the whole area is erased, nd_fsdata is
+ * rewritten at its head and writing restarts at the first slot after
+ * the fsdata region.  Returns 0 on success or an errno from the
+ * buffer-cache I/O.
+ */
+static int
+nandfs_write_superblock_at(struct nandfs_device *fsdev,
+    struct nandfs_fsarea *fstp)
+{
+	struct nandfs_super_block *super, *supert;
+	struct buf *bp;
+	int sb_per_sector, sbs_in_fsd, read_block;
+	int index, pos, error;
+	off_t offset;
+
+	DPRINTF(SYNC, ("%s: last_used %d nandfs_sblocks_in_esize %d\n",
+	    __func__, fstp->last_used, nandfs_sblocks_in_esize(fsdev)));
+	/* Advance to the next slot, wrapping when the area is exhausted. */
+	if (fstp->last_used == nandfs_sblocks_in_esize(fsdev) - 1)
+		index = 0;
+	else
+		index = fstp->last_used + 1;
+
+	super = &fsdev->nd_super;
+	supert = NULL;
+
+	sb_per_sector = nandfs_sblocks_in_block(fsdev);
+	/* Slots occupied by the fsdata header at the start of the area. */
+	sbs_in_fsd = sizeof(struct nandfs_fsdata) /
+	    sizeof(struct nandfs_super_block);
+	/* From here on 'index' is absolute within the area (fsdata included). */
+	index += sbs_in_fsd;
+	offset = fstp->offset;
+
+	DPRINTF(SYNC, ("%s: offset %#jx s_last_pseg %#jx s_last_cno %#jx "
+	    "s_last_seq %#jx wtime %jd index %d\n", __func__, offset,
+	    super->s_last_pseg, super->s_last_cno, super->s_last_seq,
+	    super->s_wtime, index));
+
+	/* Device block that holds the sector containing slot 'index'. */
+	read_block = btodb(offset + ((index / sb_per_sector) * sb_per_sector)
+	    * sizeof(struct nandfs_super_block));
+
+	DPRINTF(SYNC, ("%s: read_block %#x\n", __func__, read_block));
+
+	if (index == sbs_in_fsd) {
+		/* Wrapped to the start of the area: erase and rewrite fsdata. */
+		error = nandfs_erase(fsdev, offset, fsdev->nd_erasesize);
+		if (error)
+			return (error);
+
+		error = bread(fsdev->nd_devvp, btodb(offset),
+		    fsdev->nd_devblocksize, NOCRED, &bp);
+		if (error) {
+			printf("NANDFS: couldn't read initial data: %d\n",
+			    error);
+			brelse(bp);
+			return (error);
+		}
+		memcpy(bp->b_data, &fsdev->nd_fsdata, sizeof(fsdev->nd_fsdata));
+		/*
+		 * 0xff-out the rest. This bp could be cached, so potentially
+		 * b_data contains stale super blocks.
+		 *
+		 * We don't mind cached bp since most of the time we just add
+		 * super blocks to already 0xff-out b_data and don't need to
+		 * perform actual read.
+		 */
+		if (fsdev->nd_devblocksize > sizeof(fsdev->nd_fsdata))
+			memset(bp->b_data + sizeof(fsdev->nd_fsdata), 0xff,
+			    fsdev->nd_devblocksize - sizeof(fsdev->nd_fsdata));
+		error = bwrite(bp);
+		if (error) {
+			printf("NANDFS: cannot rewrite initial data at %jx\n",
+			    offset);
+			return (error);
+		}
+	}
+
+	error = bread(fsdev->nd_devvp, read_block, fsdev->nd_devblocksize,
+	    NOCRED, &bp);
+	if (error) {
+		brelse(bp);
+		return (error);
+	}
+
+	/* Copy the super block into its slot within the sector. */
+	supert = (struct nandfs_super_block *)(bp->b_data);
+	pos = index % sb_per_sector;
+
+	DPRINTF(SYNC, ("%s: storing at %d\n", __func__, pos));
+	memcpy(&supert[pos], super, sizeof(struct nandfs_super_block));
+
+	/*
+	 * See comment above in code that performs erase.
+	 */
+	if (pos == 0)
+		memset(&supert[1], 0xff,
+		    (sb_per_sector - 1) * sizeof(struct nandfs_super_block));
+
+	error = bwrite(bp);
+	if (error) {
+		printf("NANDFS: cannot update superblock at %jx\n", offset);
+		return (error);
+	}
+
+	DPRINTF(SYNC, ("%s: fstp->last_used %d -> %d\n", __func__,
+	    fstp->last_used, index - sbs_in_fsd));
+	/* Record the slot just written, relative to the fsdata region. */
+	fstp->last_used = index - sbs_in_fsd;
+
+	return (0);
+}
+
+/*
+ * Flush the in-core super block to flash.  The running state (last
+ * partial segment, checkpoint number, sequence and write time) is
+ * refreshed, the CRC recomputed, and the block is written into the
+ * first usable fs area, round-robin starting after the last area used.
+ * Areas that fail are flagged and skipped on subsequent calls.
+ */
+int
+nandfs_write_superblock(struct nandfs_device *fsdev)
+{
+	struct nandfs_super_block *super;
+	struct timespec ts;
+	int error;
+	int i, j;
+
+	vfs_timestamp(&ts);
+
+	super = &fsdev->nd_super;
+
+	super->s_last_pseg = fsdev->nd_last_pseg;
+	super->s_last_cno = fsdev->nd_last_cno;
+	super->s_last_seq = fsdev->nd_seg_sequence;
+	super->s_wtime = ts.tv_sec;
+
+	nandfs_calc_superblock_crc(&fsdev->nd_fsdata, super);
+
+	error = 0;
+	/*
+	 * The modulo must wrap the whole sum: the previous form
+	 * "j = (j + 1 % NANDFS_NFSAREAS)" parsed as "j = j + 1", so j
+	 * never wrapped and could index past nd_fsarea[] when starting
+	 * from a non-zero nd_last_fsarea.
+	 */
+	for (i = 0, j = fsdev->nd_last_fsarea; i < NANDFS_NFSAREAS;
+	    i++, j = (j + 1) % NANDFS_NFSAREAS) {
+		if (fsdev->nd_fsarea[j].flags & NANDFS_FSSTOR_FAILED) {
+			DPRINTF(SYNC, ("%s: skipping %d\n", __func__, j));
+			continue;
+		}
+		error = nandfs_write_superblock_at(fsdev, &fsdev->nd_fsarea[j]);
+		if (error) {
+			/* %ju with a uintmax_t product: the old "%d" with an
+			 * unsigned j * nd_erasesize was a format mismatch. */
+			printf("NANDFS: writing superblock at offset %ju "
+			    "failed: %d\n",
+			    (uintmax_t)j * fsdev->nd_erasesize, error);
+			fsdev->nd_fsarea[j].flags |= NANDFS_FSSTOR_FAILED;
+		} else
+			break;
+	}
+
+	if (i == NANDFS_NFSAREAS) {
+		printf("NANDFS: superblock was not written\n");
+		/*
+		 * TODO: switch to read-only?
+		 */
+		return (error);
+	} else
+		fsdev->nd_last_fsarea = (j + 1) % NANDFS_NFSAREAS;
+
+	return (0);
+}
+
+/*
+ * Pick the first fsdata copy (out of 'nfsds' candidates) whose magic
+ * and CRC validate.  On success *fsdata points into 'fsdatat' and 0 is
+ * returned; EINVAL when no copy is usable.
+ */
+static int
+nandfs_select_fsdata(struct nandfs_device *fsdev,
+    struct nandfs_fsdata *fsdatat, struct nandfs_fsdata **fsdata, int nfsds)
+{
+	int idx;
+
+	*fsdata = NULL;
+	for (idx = 0; idx < nfsds; idx++) {
+		DPRINTF(VOLUMES, ("%s: i %d f_magic %x f_crc %x\n", __func__,
+		    idx, fsdatat[idx].f_magic, fsdatat[idx].f_sum));
+		if (nandfs_check_fsdata_crc(&fsdatat[idx])) {
+			*fsdata = &fsdatat[idx];
+			break;
+		}
+	}
+
+	return (*fsdata == NULL ? EINVAL : 0);
+}
+
+/*
+ * Among all CRC-valid super-block copies in 'supert', select the one
+ * with the highest last checkpoint number.  On success *super points
+ * into 'supert' and 0 is returned; EINVAL when none validates.
+ */
+static int
+nandfs_select_sb(struct nandfs_device *fsdev,
+    struct nandfs_super_block *supert, struct nandfs_super_block **super,
+    int nsbs)
+{
+	struct nandfs_super_block *best, *cur;
+	int idx;
+
+	best = NULL;
+	for (idx = 0; idx < nsbs; idx++) {
+		cur = &supert[idx];
+		if (!nandfs_check_superblock_crc(&fsdev->nd_fsdata, cur))
+			continue;
+		DPRINTF(SYNC, ("%s: i %d s_last_cno %jx s_magic %x "
+		    "s_wtime %jd\n", __func__, idx, cur->s_last_cno,
+		    cur->s_magic, cur->s_wtime));
+		if (best == NULL || cur->s_last_cno > best->s_last_cno)
+			best = cur;
+	}
+
+	*super = best;
+	return (best == NULL ? EINVAL : 0);
+}
+
+/*
+ * Read one fs area from flash: the fsdata header into 'fsdata' and all
+ * super-block slots into 'super'.  The area may be larger than MAXBSIZE,
+ * so it is read in MAXBSIZE-sized chunks.  On success fstp->last_used is
+ * set to the index of the last non-erased slot; on I/O failure the area
+ * is flagged NANDFS_FSSTOR_FAILED and the errno returned.
+ */
+static int
+nandfs_read_structures_at(struct nandfs_device *fsdev,
+    struct nandfs_fsarea *fstp, struct nandfs_fsdata *fsdata,
+    struct nandfs_super_block *super)
+{
+	struct nandfs_super_block *tsuper, *tsuperd;
+	struct buf *bp;
+	int error, read_size;
+	int i;
+	int offset;
+
+	offset = fstp->offset;
+
+	/* Clamp a single read to the maximum buffer-cache transfer size. */
+	if (fsdev->nd_erasesize > MAXBSIZE)
+		read_size = MAXBSIZE;
+	else
+		read_size = fsdev->nd_erasesize;
+
+	error = bread(fsdev->nd_devvp, btodb(offset), read_size, NOCRED, &bp);
+	if (error) {
+		printf("couldn't read: %d\n", error);
+		brelse(bp);
+		fstp->flags |= NANDFS_FSSTOR_FAILED;
+		return (error);
+	}
+
+	tsuper = super;
+
+	/* First chunk: fsdata header followed by the first super blocks. */
+	memcpy(fsdata, bp->b_data, sizeof(struct nandfs_fsdata));
+	memcpy(tsuper, (bp->b_data + sizeof(struct nandfs_fsdata)),
+	    read_size - sizeof(struct nandfs_fsdata));
+	brelse(bp);
+
+	tsuper += (read_size - sizeof(struct nandfs_fsdata)) /
+	    sizeof(struct nandfs_super_block);
+
+	/* Remaining chunks of the erase block are super blocks only. */
+	for (i = 1; i < fsdev->nd_erasesize / read_size; i++) {
+		error = bread(fsdev->nd_devvp, btodb(offset + i * read_size),
+		    read_size, NOCRED, &bp);
+		if (error) {
+			printf("couldn't read: %d\n", error);
+			brelse(bp);
+			fstp->flags |= NANDFS_FSSTOR_FAILED;
+			return (error);
+		}
+		memcpy(tsuper, bp->b_data, read_size);
+		tsuper += read_size / sizeof(struct nandfs_super_block);
+		brelse(bp);
+	}
+
+	/*
+	 * Walk backwards from the last slot, decrementing last_used for
+	 * every still-erased (all 0xff) slot, stopping at the first slot
+	 * that holds data.
+	 */
+	tsuper -= 1;
+	fstp->last_used = nandfs_sblocks_in_esize(fsdev) - 1;
+	for (tsuperd = super - 1; (tsuper != tsuperd); tsuper -= 1) {
+		if (nandfs_is_empty((u_char *)tsuper, sizeof(*tsuper)))
+			fstp->last_used--;
+		else
+			break;
+	}
+
+	DPRINTF(VOLUMES, ("%s: last_used %d\n", __func__, fstp->last_used));
+
+	return (0);
+}
+
+/*
+ * Scan all reserved fs areas, gather every fsdata and super-block copy
+ * found on flash, then select the valid fsdata and the newest valid
+ * super block into fsdev->nd_fsdata / fsdev->nd_super.  Returns 0 on
+ * success or EINVAL when nothing usable was found.
+ */
+static int
+nandfs_read_structures(struct nandfs_device *fsdev)
+{
+	struct nandfs_fsdata *fsdata, *fsdatat;
+	struct nandfs_super_block *sblocks, *ssblock;
+	int nsbs, nfsds, i;
+	int error = 0;
+	int nrsbs;
+
+	nfsds = NANDFS_NFSAREAS;
+	nsbs = nandfs_max_sblocks(fsdev);
+
+	/* Temporary arrays holding every copy read from flash. */
+	fsdatat = malloc(sizeof(struct nandfs_fsdata) * nfsds, M_NANDFSTEMP,
+	    M_WAITOK | M_ZERO);
+	sblocks = malloc(sizeof(struct nandfs_super_block) * nsbs, M_NANDFSTEMP,
+	    M_WAITOK | M_ZERO);
+
+	nrsbs = 0;
+	for (i = 0; i < NANDFS_NFSAREAS; i++) {
+		fsdev->nd_fsarea[i].offset = i * fsdev->nd_erasesize;
+		error = nandfs_read_structures_at(fsdev, &fsdev->nd_fsarea[i],
+		    &fsdatat[i], sblocks + nrsbs);
+		if (error)
+			continue;
+		nrsbs += (fsdev->nd_fsarea[i].last_used + 1);
+		/*
+		 * NOTE(review): this keeps the area with the SMALLER
+		 * last_used as nd_last_fsarea — presumably to continue
+		 * writing in the least-filled area; confirm intended.
+		 */
+		if (fsdev->nd_fsarea[fsdev->nd_last_fsarea].last_used >
+		    fsdev->nd_fsarea[i].last_used)
+			fsdev->nd_last_fsarea = i;
+	}
+
+	if (nrsbs == 0) {
+		printf("nandfs: no valid superblocks found\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	error = nandfs_select_fsdata(fsdev, fsdatat, &fsdata, nfsds);
+	if (error)
+		goto out;
+	memcpy(&fsdev->nd_fsdata, fsdata, sizeof(struct nandfs_fsdata));
+
+	error = nandfs_select_sb(fsdev, sblocks, &ssblock, nsbs);
+	if (error)
+		goto out;
+
+	memcpy(&fsdev->nd_super, ssblock, sizeof(struct nandfs_super_block));
+out:
+	free(fsdatat, M_NANDFSTEMP);
+	free(sblocks, M_NANDFSTEMP);
+
+	if (error == 0)
+		DPRINTF(VOLUMES, ("%s: selected sb with w_time %jd "
+		    "last_pseg %#jx\n", __func__, fsdev->nd_super.s_wtime,
+		    fsdev->nd_super.s_last_pseg));
+
+	return (error);
+}
+
+/*
+ * Tear down the per-device base state built by nandfs_mount_base():
+ * flush remaining buffers on the device vnode and release the system
+ * vnodes.  Flush failures are logged and ignored since the unmount
+ * cannot be aborted at this point.
+ */
+static void
+nandfs_unmount_base(struct nandfs_device *nandfsdev)
+{
+	int error;
+
+	if (!nandfsdev)
+		return;
+
+	/* Remove all our information */
+	error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0);
+	if (error) {
+		/*
+		 * Flushing buffers failed when fs was umounting, can't do
+		 * much now, just printf error and continue with umount.
+		 */
+		nandfs_error("%s(): error:%d when umounting FS\n",
+		    __func__, error);
+	}
+
+	/* Release the device's system nodes */
+	nandfs_release_system_nodes(nandfsdev);
+}
+
+/* Cache the current clean-segment count on the device softc. */
+static void
+nandfs_get_ncleanseg(struct nandfs_device *nandfsdev)
+{
+	struct nandfs_seg_stat stats;
+
+	nandfs_get_seg_stat(nandfsdev, &stats);
+	nandfsdev->nd_clean_segs = stats.nss_ncleansegs;
+	DPRINTF(VOLUMES, ("nandfs_mount: clean segs: %jx\n",
+	    (uintmax_t)nandfsdev->nd_clean_segs));
+}
+
+
+/*
+ * Bring up the shared per-device state on first mount: read and verify
+ * the on-flash structures, validate revision and geometry, locate the
+ * super root, extract the running segment/checkpoint state, and create
+ * the system vnodes.  On failure the partially-built base is torn down
+ * via nandfs_unmount_base().
+ */
+static int
+nandfs_mount_base(struct nandfs_device *nandfsdev, struct mount *mp,
+    struct nandfs_args *args)
+{
+	uint32_t log_blocksize;
+	int error;
+
+	/* Flush out any old buffers remaining from a previous use. */
+	if ((error = vinvalbuf(nandfsdev->nd_devvp, V_SAVE, 0, 0)))
+		return (error);
+
+	error = nandfs_read_structures(nandfsdev);
+	if (error) {
+		printf("nandfs: could not get valid filesystem structures\n");
+		return (error);
+	}
+
+	if (nandfsdev->nd_fsdata.f_rev_level != NANDFS_CURRENT_REV) {
+		printf("nandfs: unsupported file system revision: %d "
+		    "(supported is %d).\n", nandfsdev->nd_fsdata.f_rev_level,
+		    NANDFS_CURRENT_REV);
+		return (EINVAL);
+	}
+
+	/* The erase size recorded at mkfs time must match the device. */
+	if (nandfsdev->nd_fsdata.f_erasesize != nandfsdev->nd_erasesize) {
+		printf("nandfs: erasesize mismatch (device %#x, fs %#x)\n",
+		    nandfsdev->nd_erasesize, nandfsdev->nd_fsdata.f_erasesize);
+		return (EINVAL);
+	}
+
+	/* Get our blocksize: stored as log2(blocksize) - 10. */
+	log_blocksize = nandfsdev->nd_fsdata.f_log_block_size;
+	nandfsdev->nd_blocksize = (uint64_t) 1 << (log_blocksize + 10);
+	DPRINTF(VOLUMES, ("%s: blocksize:%x\n", __func__,
+	    nandfsdev->nd_blocksize));
+
+	DPRINTF(VOLUMES, ("%s: accepted super block with cp %#jx\n", __func__,
+	    (uintmax_t)nandfsdev->nd_super.s_last_cno));
+
+	/* Calculate dat structure parameters */
+	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_dat_mdt,
+	    nandfsdev->nd_fsdata.f_dat_entry_size);
+	nandfs_calc_mdt_consts(nandfsdev, &nandfsdev->nd_ifile_mdt,
+	    nandfsdev->nd_fsdata.f_inode_size);
+
+	/* Search for the super root and roll forward when needed */
+	if (nandfs_search_super_root(nandfsdev)) {
+		printf("Cannot find valid SuperRoot\n");
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_mount_state = nandfsdev->nd_super.s_state;
+	if (nandfsdev->nd_mount_state != NANDFS_VALID_FS) {
+		printf("FS is seriously damaged, needs repairing\n");
+		printf("aborting mount\n");
+		return (EINVAL);
+	}
+
+	/*
+	 * FS should be ok now. The superblock and the last segsum could be
+	 * updated from the repair so extract running values again.
+	 */
+	nandfsdev->nd_last_pseg = nandfsdev->nd_super.s_last_pseg;
+	nandfsdev->nd_seg_sequence = nandfsdev->nd_super.s_last_seq;
+	nandfsdev->nd_seg_num = nandfs_get_segnum_of_block(nandfsdev,
+	    nandfsdev->nd_last_pseg);
+	nandfsdev->nd_next_seg_num = nandfs_get_segnum_of_block(nandfsdev,
+	    nandfsdev->nd_last_segsum.ss_next);
+	nandfsdev->nd_ts.tv_sec = nandfsdev->nd_last_segsum.ss_create;
+	nandfsdev->nd_last_cno = nandfsdev->nd_super.s_last_cno;
+	nandfsdev->nd_fakevblk = 1;
+	nandfsdev->nd_last_ino  = NANDFS_USER_INO;
+	DPRINTF(VOLUMES, ("%s: last_pseg %#jx last_cno %#jx last_seq %#jx\n"
+	    "fsdev: last_seg: seq %#jx num %#jx, next_seg_num %#jx\n",
+	    __func__, (uintmax_t)nandfsdev->nd_last_pseg,
+	    (uintmax_t)nandfsdev->nd_last_cno,
+	    (uintmax_t)nandfsdev->nd_seg_sequence,
+	    (uintmax_t)nandfsdev->nd_seg_sequence,
+	    (uintmax_t)nandfsdev->nd_seg_num,
+	    (uintmax_t)nandfsdev->nd_next_seg_num));
+
+	DPRINTF(VOLUMES, ("nandfs_mount: accepted super root\n"));
+
+	/* Create system vnodes for DAT, CP and SEGSUM */
+	error = nandfs_create_system_nodes(nandfsdev);
+	if (error)
+		nandfs_unmount_base(nandfsdev);
+
+	nandfs_get_ncleanseg(nandfsdev);
+
+	return (error);
+}
+
+/*
+ * Drop one reference on the shared device state; when the last mount is
+ * gone, tear down the base, unlink the device from the global list,
+ * close the GEOM consumer, release the device vnode, and free all
+ * synchronization primitives and the softc itself.
+ */
+static void
+nandfs_unmount_device(struct nandfs_device *nandfsdev)
+{
+
+	/* Is there anything? */
+	if (nandfsdev == NULL)
+		return;
+
+	/* Remove the device only if we're the last reference */
+	nandfsdev->nd_refcnt--;
+	if (nandfsdev->nd_refcnt >= 1)
+		return;
+
+	/* Syncer and cleaner must have exited before the last unmount. */
+	MPASS(nandfsdev->nd_syncer == NULL);
+	MPASS(nandfsdev->nd_cleaner == NULL);
+	MPASS(nandfsdev->nd_free_base == NULL);
+
+	/* Unmount our base */
+	nandfs_unmount_base(nandfsdev);
+
+	/* Remove from our device list */
+	SLIST_REMOVE(&nandfs_devices, nandfsdev, nandfs_device, nd_next_device);
+
+	DROP_GIANT();
+	g_topology_lock();
+	g_vfs_close(nandfsdev->nd_gconsumer);
+	g_topology_unlock();
+	PICKUP_GIANT();
+
+	DPRINTF(VOLUMES, ("closing device\n"));
+
+	/* Clear our mount reference and release device node */
+	vrele(nandfsdev->nd_devvp);
+
+	dev_rel(nandfsdev->nd_devvp->v_rdev);
+
+	/* Free our device info */
+	cv_destroy(&nandfsdev->nd_sync_cv);
+	mtx_destroy(&nandfsdev->nd_sync_mtx);
+	cv_destroy(&nandfsdev->nd_clean_cv);
+	mtx_destroy(&nandfsdev->nd_clean_mtx);
+	mtx_destroy(&nandfsdev->nd_mutex);
+	lockdestroy(&nandfsdev->nd_seg_const);
+	free(nandfsdev, M_NANDFSMNT);
+}
+
+/*
+ * Decide whether an additional mount of an already-mounted device is
+ * permitted.  Rules: a checkpoint may be mounted only once (EBUSY);
+ * read-only mounts are otherwise always allowed; at most one
+ * read/write mount may exist (EROFS), and the head checkpoint cannot
+ * be double-used (EBUSY).  Returns 0 when the mount may proceed.
+ */
+static int
+nandfs_check_mounts(struct nandfs_device *nandfsdev, struct mount *mp,
+    struct nandfs_args *args)
+{
+	struct nandfsmount *nmp;
+	uint64_t last_cno;
+
+	/* no double-mounting of the same checkpoint */
+	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
+		if (nmp->nm_mount_args.cpno == args->cpno)
+			return (EBUSY);
+	}
+
+	/* Allow readonly mounts without questioning here */
+	if (mp->mnt_flag & MNT_RDONLY)
+		return (0);
+
+	/* Read/write mount */
+	STAILQ_FOREACH(nmp, &nandfsdev->nd_mounts, nm_next_mount) {
+		/* Only one RW mount on this device! */
+		if ((nmp->nm_vfs_mountp->mnt_flag & MNT_RDONLY)==0)
+			return (EROFS);
+		/* RDONLY on last mountpoint is device busy */
+		last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+		if (nmp->nm_mount_args.cpno == last_cno)
+			return (EBUSY);
+	}
+
+	/* OK for now */
+	return (0);
+}
+
+/*
+ * Attach (or re-reference) the shared nandfs_device for 'devvp'.  If the
+ * device is already mounted, only the mount-compatibility checks and a
+ * refcount/GEOM-access bump are done.  Otherwise the GEOM consumer is
+ * opened, the softc allocated and initialized, the erase size probed
+ * (falling back to a default for non-NAND media), and the base state is
+ * mounted.  On success *nandfsdev_p holds the device.
+ */
+static int
+nandfs_mount_device(struct vnode *devvp, struct mount *mp,
+    struct nandfs_args *args, struct nandfs_device **nandfsdev_p)
+{
+	struct nandfs_device *nandfsdev;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	struct cdev *dev;
+	uint32_t erasesize;
+	int error, size;
+	int ronly;
+
+	DPRINTF(VOLUMES, ("Mounting NANDFS device\n"));
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+	/* Look up device in our nandfs_mountpoints */
+	*nandfsdev_p = NULL;
+	SLIST_FOREACH(nandfsdev, &nandfs_devices, nd_next_device)
+		if (nandfsdev->nd_devvp == devvp)
+			break;
+
+	if (nandfsdev) {
+		DPRINTF(VOLUMES, ("device already mounted\n"));
+		error = nandfs_check_mounts(nandfsdev, mp, args);
+		if (error)
+			return error;
+		nandfsdev->nd_refcnt++;
+		*nandfsdev_p = nandfsdev;
+
+		/*
+		 * NOTE(review): if g_access() fails below, nd_refcnt has
+		 * already been bumped yet the error is returned — the
+		 * reference appears to leak; confirm callers compensate.
+		 */
+		if (!ronly) {
+			DROP_GIANT();
+			g_topology_lock();
+			error = g_access(nandfsdev->nd_gconsumer, 0, 1, 0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+		}
+		return (error);
+	}
+
+	/* First mount of this device: open it through GEOM. */
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	dev = devvp->v_rdev;
+	dev_ref(dev);
+	DROP_GIANT();
+	g_topology_lock();
+	error = g_vfs_open(devvp, &cp, "nandfs", ronly ? 0 : 1);
+	pp = g_dev_getprovider(dev);
+	g_topology_unlock();
+	PICKUP_GIANT();
+	VOP_UNLOCK(devvp, 0);
+	if (error) {
+		dev_rel(dev);
+		return (error);
+	}
+
+	nandfsdev = malloc(sizeof(struct nandfs_device), M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	/* Initialise */
+	nandfsdev->nd_refcnt = 1;
+	nandfsdev->nd_devvp = devvp;
+	nandfsdev->nd_syncing = 0;
+	nandfsdev->nd_cleaning = 0;
+	nandfsdev->nd_gconsumer = cp;
+	cv_init(&nandfsdev->nd_sync_cv, "nandfssync");
+	mtx_init(&nandfsdev->nd_sync_mtx, "nffssyncmtx", NULL, MTX_DEF);
+	cv_init(&nandfsdev->nd_clean_cv, "nandfsclean");
+	mtx_init(&nandfsdev->nd_clean_mtx, "nffscleanmtx", NULL, MTX_DEF);
+	mtx_init(&nandfsdev->nd_mutex, "nandfsdev lock", NULL, MTX_DEF);
+	lockinit(&nandfsdev->nd_seg_const, PVFS, "nffssegcon", VLKTIMEOUT,
+	    LK_CANRECURSE);
+	STAILQ_INIT(&nandfsdev->nd_mounts);
+
+	nandfsdev->nd_devsize = pp->mediasize;
+	nandfsdev->nd_devblocksize = pp->sectorsize;
+
+	/* Probe the NAND erase-block size through the GEOM attribute. */
+	size = sizeof(erasesize);
+	error = g_io_getattr("NAND::blocksize", nandfsdev->nd_gconsumer, &size,
+	    &erasesize);
+	if (error) {
+		DPRINTF(VOLUMES, ("couldn't get erasesize: %d\n", error));
+
+		if (error == ENOIOCTL || error == EOPNOTSUPP) {
+			/*
+			 * We conclude that this is not NAND storage
+			 */
+			nandfsdev->nd_erasesize = NANDFS_DEF_ERASESIZE;
+			nandfsdev->nd_is_nand = 0;
+		} else {
+			/* Real I/O error: undo the GEOM open and bail out. */
+			DROP_GIANT();
+			g_topology_lock();
+			g_vfs_close(nandfsdev->nd_gconsumer);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			dev_rel(dev);
+			free(nandfsdev, M_NANDFSMNT);
+			return (error);
+		}
+	} else {
+		nandfsdev->nd_erasesize = erasesize;
+		nandfsdev->nd_is_nand = 1;
+	}
+
+	DPRINTF(VOLUMES, ("%s: erasesize %x\n", __func__,
+	    nandfsdev->nd_erasesize));
+
+	/* Register nandfs_device in list */
+	SLIST_INSERT_HEAD(&nandfs_devices, nandfsdev, nd_next_device);
+
+	error = nandfs_mount_base(nandfsdev, mp, args);
+	if (error) {
+		/* Remove all our information */
+		nandfs_unmount_device(nandfsdev);
+		return (EINVAL);
+	}
+
+	nandfsdev->nd_maxfilesize = nandfs_get_maxfilesize(nandfsdev);
+
+	*nandfsdev_p = nandfsdev;
+	DPRINTF(VOLUMES, ("NANDFS device mounted ok\n"));
+
+	return (0);
+}
+
+/*
+ * Attach the mount to its checkpoint: read the cpfile header, locate
+ * the requested checkpoint (cpno 0 means the device's last one),
+ * validate it (must be the expected number; must be a snapshot unless
+ * it is the head checkpoint), and instantiate the ifile node from the
+ * inode stored in the checkpoint.  Returns 0 on success.
+ */
+static int
+nandfs_mount_checkpoint(struct nandfsmount *nmp)
+{
+	struct nandfs_cpfile_header *cphdr;
+	struct nandfs_checkpoint *cp;
+	struct nandfs_inode ifile_inode;
+	struct nandfs_node *cp_node;
+	struct buf *bp;
+	uint64_t ncp, nsn, cpno, fcpno, blocknr, last_cno;
+	uint32_t off, dlen;
+	int cp_per_block, error;
+
+	cpno = nmp->nm_mount_args.cpno;
+	if (cpno == 0)
+		cpno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+
+	DPRINTF(VOLUMES, ("%s: trying to mount checkpoint number %"PRIu64"\n",
+	    __func__, cpno));
+
+	cp_node = nmp->nm_nandfsdev->nd_cp_node;
+
+	VOP_LOCK(NTOV(cp_node), LK_SHARED);
+	/* Get cpfile header from 1st block of cp file */
+	error = nandfs_bread(cp_node, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (error);
+	}
+
+	cphdr = (struct nandfs_cpfile_header *) bp->b_data;
+	ncp = cphdr->ch_ncheckpoints;
+	nsn = cphdr->ch_nsnapshots;
+
+	brelse(bp);
+
+	DPRINTF(VOLUMES, ("mount_nandfs: checkpoint header read in\n"));
+	DPRINTF(VOLUMES, ("\tNumber of checkpoints %"PRIu64"\n", ncp));
+	DPRINTF(VOLUMES, ("\tNumber of snapshots %"PRIu64"\n", nsn));
+
+	/* Read in our specified checkpoint */
+	dlen = nmp->nm_nandfsdev->nd_fsdata.f_checkpoint_size;
+	cp_per_block = nmp->nm_nandfsdev->nd_blocksize / dlen;
+
+	/* Map checkpoint number to (block, offset) within the cp file. */
+	fcpno = cpno + NANDFS_CPFILE_FIRST_CHECKPOINT_OFFSET - 1;
+	blocknr = fcpno / cp_per_block;
+	off = (fcpno % cp_per_block) * dlen;
+	error = nandfs_bread(cp_node, blocknr, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		printf("mount_nandfs: couldn't read cp block %"PRIu64"\n",
+		    fcpno);
+		return (EINVAL);
+	}
+
+	/* Needs to be a valid checkpoint */
+	cp = (struct nandfs_checkpoint *) ((uint8_t *) bp->b_data + off);
+	if (cp->cp_flags & NANDFS_CHECKPOINT_INVALID) {
+		printf("mount_nandfs: checkpoint marked invalid\n");
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	/* Is this really the checkpoint we want? */
+	if (cp->cp_cno != cpno) {
+		printf("mount_nandfs: checkpoint file corrupt? "
+		    "expected cpno %"PRIu64", found cpno %"PRIu64"\n",
+		    cpno, cp->cp_cno);
+		brelse(bp);
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	/* Check if it's a snapshot ! */
+	last_cno = nmp->nm_nandfsdev->nd_super.s_last_cno;
+	if (cpno != last_cno) {
+		/* Only allow snapshots if not mounting on the last cp */
+		if ((cp->cp_flags & NANDFS_CHECKPOINT_SNAPSHOT) == 0) {
+			printf( "mount_nandfs: checkpoint %"PRIu64" is not a "
+			    "snapshot\n", cpno);
+			brelse(bp);
+			VOP_UNLOCK(NTOV(cp_node), 0);
+			return (EINVAL);
+		}
+	}
+
+	/* Copy the ifile inode out before releasing the buffer. */
+	ifile_inode = cp->cp_ifile_inode;
+	brelse(bp);
+
+	/* Get ifile inode */
+	error = nandfs_get_node_raw(nmp->nm_nandfsdev, NULL, NANDFS_IFILE_INO,
+	    &ifile_inode, &nmp->nm_ifile_node);
+	if (error) {
+		printf("mount_nandfs: can't read ifile node\n");
+		VOP_UNLOCK(NTOV(cp_node), 0);
+		return (EINVAL);
+	}
+
+	NANDFS_SET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
+	VOP_UNLOCK(NTOV(cp_node), 0);
+	/* Get root node? */
+
+	return (0);
+}
+
+/* Release the per-mount softc, if one was ever attached to 'mp'. */
+static void
+free_nandfs_mountinfo(struct mount *mp)
+{
+	struct nandfsmount *nmp;
+
+	nmp = VFSTONANDFS(mp);
+	if (nmp != NULL)
+		free(nmp, M_NANDFSMNT);
+}
+
+/*
+ * Trigger a synchronous segment-construction pass and wait for it to
+ * complete.  If a pass is already in flight we first wait for it to
+ * finish, then start our own.  SYNCER_UMOUNT additionally tells the
+ * syncer thread to exit after this pass.
+ */
+void
+nandfs_wakeup_wait_sync(struct nandfs_device *nffsdev, int reason)
+{
+	/*
+	 * Names for the SYNCER_* reason codes.  Static const so the
+	 * table is not rebuilt on every call and the string-literal
+	 * pointers cannot be written through.
+	 */
+	static const char *reasons[] = {
+	    "umount",
+	    "vfssync",
+	    "bdflush",
+	    "fforce",
+	    "fsync",
+	    "ro_upd"
+	};
+
+	DPRINTF(SYNC, ("%s: %s\n", __func__, reasons[reason]));
+	mtx_lock(&nffsdev->nd_sync_mtx);
+	/* Wait for any pass already in progress. */
+	if (nffsdev->nd_syncing)
+		cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
+	if (reason == SYNCER_UMOUNT)
+		nffsdev->nd_syncer_exit = 1;
+	/* Start a new pass and sleep until the syncer broadcasts completion. */
+	nffsdev->nd_syncing = 1;
+	wakeup(&nffsdev->nd_syncing);
+	cv_wait(&nffsdev->nd_sync_cv, &nffsdev->nd_sync_mtx);
+
+	mtx_unlock(&nffsdev->nd_sync_mtx);
+}
+
+/*
+ * Mark the current sync/cleaner pass finished and wake any thread in
+ * nandfs_wakeup_wait_sync().  Unless 'exit' is set, sleep until either
+ * the next explicit wakeup or the periodic sync interval elapses.
+ */
+static void
+nandfs_gc_finished(struct nandfs_device *nffsdev, int exit)
+{
+	int error;
+
+	mtx_lock(&nffsdev->nd_sync_mtx);
+	nffsdev->nd_syncing = 0;
+	DPRINTF(SYNC, ("%s: cleaner finish\n", __func__));
+	cv_broadcast(&nffsdev->nd_sync_cv);
+	mtx_unlock(&nffsdev->nd_sync_mtx);
+	if (!exit) {
+		/* error is only reported via DPRINTF; timeouts are normal. */
+		error = tsleep(&nffsdev->nd_syncing, PRIBIO, "-",
+		    hz * nandfs_sync_interval);
+		DPRINTF(SYNC, ("%s: cleaner waked up: %d\n",
+		    __func__, error));
+	}
+}
+
+/*
+ * Kernel-thread body of the per-mount syncer: loop building segments
+ * (periodically or on demand) until nd_syncer_exit is set, then run a
+ * final forced pass for umount and exit.  Construction errors are
+ * logged but do not stop the loop.
+ */
+static void
+nandfs_syncer(struct nandfsmount *nmp)
+{
+	struct nandfs_device *nffsdev;
+	struct mount *mp;
+	int flags, error;
+
+	mp = nmp->nm_vfs_mountp;
+	nffsdev = nmp->nm_nandfsdev;
+	/* Initial delay before the first periodic pass. */
+	tsleep(&nffsdev->nd_syncing, PRIBIO, "-", hz * nandfs_sync_interval);
+
+	while (!nffsdev->nd_syncer_exit) {
+		DPRINTF(SYNC, ("%s: syncer run\n", __func__));
+		nffsdev->nd_syncing = 1;
+
+		/* Pick up force/umount requests posted on the mount. */
+		flags = (nmp->nm_flags & (NANDFS_FORCE_SYNCER | NANDFS_UMOUNT));
+
+		error = nandfs_segment_constructor(nmp, flags);
+		if (error)
+			nandfs_error("%s: error:%d when creating segments\n",
+			    __func__, error);
+
+		nmp->nm_flags &= ~flags;
+
+		nandfs_gc_finished(nffsdev, 0);
+	}
+
+	/* Final forced pass on umount; the cleaner must already be gone. */
+	MPASS(nffsdev->nd_cleaner == NULL);
+	error = nandfs_segment_constructor(nmp,
+	    NANDFS_FORCE_SYNCER | NANDFS_UMOUNT);
+	if (error)
+		nandfs_error("%s: error:%d when creating segments\n",
+		    __func__, error);
+	nandfs_gc_finished(nffsdev, 1);
+	nffsdev->nd_syncer = NULL;
+	MPASS(nffsdev->nd_free_base == NULL);
+
+	DPRINTF(SYNC, ("%s: exiting\n", __func__));
+	kthread_exit();
+}
+
+/*
+ * Spawn the syncer kernel thread for this mount.  Returns 0 on success
+ * or the kthread_add() errno (also logged).
+ */
+static int
+start_syncer(struct nandfsmount *nmp)
+{
+	int error;
+
+	MPASS(nmp->nm_nandfsdev->nd_syncer == NULL);
+
+	DPRINTF(SYNC, ("%s: start syncer\n", __func__));
+
+	nmp->nm_nandfsdev->nd_syncer_exit = 0;
+
+	error = kthread_add((void(*)(void *))nandfs_syncer, nmp, NULL,
+	    &nmp->nm_nandfsdev->nd_syncer, 0, 0, "nandfs_syncer");
+
+	if (error)
+		printf("nandfs: could not start syncer: %d\n", error);
+
+	return (error);
+}
+
+/*
+ * Stop the syncer thread: post the umount reason and wait for the final
+ * pass to complete (the thread clears nd_syncer and exits on its own).
+ */
+static int
+stop_syncer(struct nandfsmount *nmp)
+{
+
+	MPASS(nmp->nm_nandfsdev->nd_syncer != NULL);
+
+	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_UMOUNT);
+
+	DPRINTF(SYNC, ("%s: stop syncer\n", __func__));
+	return (0);
+}
+
+/*
+ * VFS_MOUNT entry point for nandfs.  Handles MNT_UPDATE transitions
+ * between read-only and read/write (starting/stopping the syncer and
+ * cleaner and adjusting GEOM write access), or, for a fresh mount,
+ * looks up and permission-checks the device node and hands off to
+ * nandfs_mountfs().
+ */
+static int
+nandfs_mount(struct mount *mp)
+{
+	struct nandfsmount *nmp;
+	struct vnode *devvp;
+	struct nameidata nd;
+	struct vfsoptlist *opts;
+	struct thread *td;
+	char *from;
+	int error = 0, flags;
+
+	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
+
+	td = curthread;
+	opts = mp->mnt_optnew;
+
+	if (vfs_filteropt(opts, nandfs_opts))
+		return (EINVAL);
+
+	/*
+	 * Update is a no-op
+	 */
+	if (mp->mnt_flag & MNT_UPDATE) {
+		nmp = VFSTONANDFS(mp);
+		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
+			return (error);
+		}
+		/* Downgrade r/w -> read-only. */
+		if (!(nmp->nm_ronly) && vfs_flagopt(opts, "ro", NULL, 0)) {
+			vn_start_write(NULL, &mp, V_WAIT);
+			error = VFS_SYNC(mp, MNT_WAIT);
+			if (error)
+				return (error);
+			vn_finished_write(mp);
+
+			flags = WRITECLOSE;
+			if (mp->mnt_flag & MNT_FORCE)
+				flags |= FORCECLOSE;
+
+			/* Force a final segment write before downgrading. */
+			nandfs_wakeup_wait_sync(nmp->nm_nandfsdev,
+			    SYNCER_ROUPD);
+			error = vflush(mp, 0, flags, td);
+			if (error)
+				return (error);
+
+			nandfs_stop_cleaner(nmp->nm_nandfsdev);
+			stop_syncer(nmp);
+			/* Drop our GEOM write access. */
+			DROP_GIANT();
+			g_topology_lock();
+			g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1, 0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			MNT_ILOCK(mp);
+			mp->mnt_flag |= MNT_RDONLY;
+			MNT_IUNLOCK(mp);
+			nmp->nm_ronly = 1;
+
+		/* Upgrade read-only -> r/w. */
+		} else if ((nmp->nm_ronly) &&
+		    !vfs_flagopt(opts, "ro", NULL, 0)) {
+			/*
+			 * Don't allow read-write snapshots.
+			 */
+			if (nmp->nm_mount_args.cpno != 0)
+				return (EROFS);
+			/*
+			 * If upgrade to read-write by non-root, then verify
+			 * that user has necessary permissions on the device.
+			 */
+			devvp = nmp->nm_nandfsdev->nd_devvp;
+			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+			error = VOP_ACCESS(devvp, VREAD | VWRITE,
+			    td->td_ucred, td);
+			if (error) {
+				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
+				if (error) {
+					VOP_UNLOCK(devvp, 0);
+					return (error);
+				}
+			}
+
+			VOP_UNLOCK(devvp, 0);
+			/* Acquire GEOM write access before flipping flags. */
+			DROP_GIANT();
+			g_topology_lock();
+			error = g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, 1,
+			    0);
+			g_topology_unlock();
+			PICKUP_GIANT();
+			if (error)
+				return (error);
+
+			MNT_ILOCK(mp);
+			mp->mnt_flag &= ~MNT_RDONLY;
+			MNT_IUNLOCK(mp);
+			error = start_syncer(nmp);
+			if (error == 0)
+				error = nandfs_start_cleaner(nmp->nm_nandfsdev);
+			if (error) {
+				/* Roll back the write access on failure. */
+				DROP_GIANT();
+				g_topology_lock();
+				g_access(nmp->nm_nandfsdev->nd_gconsumer, 0, -1,
+				    0);
+				g_topology_unlock();
+				PICKUP_GIANT();
+				return (error);
+			}
+
+			nmp->nm_ronly = 0;
+		}
+		return (0);
+	}
+
+	from = vfs_getopts(opts, "from", &error);
+	if (error)
+		return (error);
+
+	/*
+	 * Find device node
+	 */
+	NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, from, curthread);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	devvp = nd.ni_vp;
+
+	if (!vn_isdisk(devvp, &error)) {
+		vput(devvp);
+		return (error);
+	}
+
+	/* Check the access rights on the mount device */
+	error = VOP_ACCESS(devvp, VREAD, curthread->td_ucred, curthread);
+	if (error)
+		error = priv_check(curthread, PRIV_VFS_MOUNT_PERM);
+	if (error) {
+		vput(devvp);
+		return (error);
+	}
+
+	vfs_getnewfsid(mp);
+
+	/*
+	 * NOTE(review): on nandfs_mountfs() failure devvp is not vput()
+	 * here — presumably released inside mountfs' error handling;
+	 * confirm, otherwise the vnode reference leaks.
+	 */
+	error = nandfs_mountfs(devvp, mp);
+	if (error)
+		return (error);
+	vfs_mountedfrom(mp, from);
+
+	return (0);
+}
+
+/*
+ * Perform the actual mount: parse the "snap" (checkpoint) option,
+ * attach the shared device, allocate and link the per-mount softc,
+ * mount the requested checkpoint and, for r/w mounts, start the syncer
+ * and cleaner threads.  A checkpoint (snapshot) mount must be
+ * read-only (EROFS otherwise).
+ */
+static int
+nandfs_mountfs(struct vnode *devvp, struct mount *mp)
+{
+	struct nandfsmount *nmp = NULL;
+	struct nandfs_args *args = NULL;
+	struct nandfs_device *nandfsdev;
+	char *from;
+	int error, ronly;
+	char *cpno;
+
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+
+	if (devvp->v_rdev->si_iosize_max != 0)
+		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
+	/* devvp arrives locked from namei(LOCKLEAF) in nandfs_mount(). */
+	VOP_UNLOCK(devvp, 0);
+
+	if (mp->mnt_iosize_max > MAXPHYS)
+		mp->mnt_iosize_max = MAXPHYS;
+
+	from = vfs_getopts(mp->mnt_optnew, "from", &error);
+	if (error)
+		goto error;
+
+	error = vfs_getopt(mp->mnt_optnew, "snap", (void **)&cpno, NULL);
+	if (error == ENOENT)
+		cpno = NULL;
+	else if (error)
+		goto error;
+
+	args = (struct nandfs_args *)malloc(sizeof(struct nandfs_args),
+	    M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	/* cpno 0 means "mount the head checkpoint". */
+	if (cpno != NULL)
+		args->cpno = strtoul(cpno, (char **)NULL, 10);
+	else
+		args->cpno = 0;
+	args->fspec = from;
+
+	if (args->cpno != 0 && !ronly) {
+		error = EROFS;
+		goto error;
+	}
+
+	printf("WARNING: NANDFS is considered to be a highly experimental "
+	    "feature in FreeBSD.\n");
+
+	/*
+	 * NOTE(review): the 'error:' exit path below does not release
+	 * devvp — verify the caller/device code owns that reference on
+	 * failure.
+	 */
+	error = nandfs_mount_device(devvp, mp, args, &nandfsdev);
+	if (error)
+		goto error;
+
+	nmp = (struct nandfsmount *) malloc(sizeof(struct nandfsmount),
+	    M_NANDFSMNT, M_WAITOK | M_ZERO);
+
+	mp->mnt_data = nmp;
+	nmp->nm_vfs_mountp = mp;
+	nmp->nm_ronly = ronly;
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_kern_flag |= MNTK_MPSAFE;
+	MNT_IUNLOCK(mp);
+	nmp->nm_nandfsdev = nandfsdev;
+	/* Add our mountpoint */
+	STAILQ_INSERT_TAIL(&nandfsdev->nd_mounts, nmp, nm_next_mount);
+
+	/* Clamp a too-new checkpoint request to the last known one. */
+	if (args->cpno > nandfsdev->nd_last_cno) {
+		printf("WARNING: supplied checkpoint number (%jd) is greater "
+		    "than last known checkpoint on filesystem (%jd). Mounting"
+		    " checkpoint %jd\n", (uintmax_t)args->cpno,
+		    (uintmax_t)nandfsdev->nd_last_cno,
+		    (uintmax_t)nandfsdev->nd_last_cno);
+		args->cpno = nandfsdev->nd_last_cno;
+	}
+
+	/* Setting up other parameters */
+	nmp->nm_mount_args = *args;
+	free(args, M_NANDFSMNT);
+	error = nandfs_mount_checkpoint(nmp);
+	if (error) {
+		/* nandfs_unmount() frees nmp and detaches the device. */
+		nandfs_unmount(mp, MNT_FORCE);
+		goto unmounted;
+	}
+
+	if (!ronly) {
+		error = start_syncer(nmp);
+		if (error == 0)
+			error = nandfs_start_cleaner(nmp->nm_nandfsdev);
+		if (error)
+			nandfs_unmount(mp, MNT_FORCE);
+	}
+
+	return (0);
+
+error:
+	if (args != NULL)
+		free(args, M_NANDFSMNT);
+
+	if (nmp != NULL) {
+		free(nmp, M_NANDFSMNT);
+		mp->mnt_data = NULL;
+	}
+unmounted:
+	return (error);
+}
+
+/*
+ * VFS_UNMOUNT entry point: flush vnodes (keeping system files), stop
+ * the cleaner and syncer for r/w mounts, release the ifile, detach this
+ * mount from the shared device (closing the device on last reference)
+ * and free the per-mount state.
+ */
+static int
+nandfs_unmount(struct mount *mp, int mntflags)
+{
+	struct nandfs_device *nandfsdev;
+	struct nandfsmount *nmp;
+	int error;
+	int flags = 0;
+
+	DPRINTF(VOLUMES, ("%s: mp = %p\n", __func__, (void *)mp));
+
+	if (mntflags & MNT_FORCE)
+		flags |= FORCECLOSE;
+
+	nmp = mp->mnt_data;
+	nandfsdev = nmp->nm_nandfsdev;
+
+	/* SKIPSYSTEM: system vnodes are released separately below. */
+	error = vflush(mp, 0, flags | SKIPSYSTEM, curthread);
+	if (error)
+		return (error);
+
+	if (!(nmp->nm_ronly)) {
+		nandfs_stop_cleaner(nandfsdev);
+		stop_syncer(nmp);
+	}
+
+	if (nmp->nm_ifile_node)
+		NANDFS_UNSET_SYSTEMFILE(NTOV(nmp->nm_ifile_node));
+
+	/* Remove our mount point */
+	STAILQ_REMOVE(&nandfsdev->nd_mounts, nmp, nandfsmount, nm_next_mount);
+
+	/* Unmount the device itself when we're the last one */
+	nandfs_unmount_device(nandfsdev);
+
+	free_nandfs_mountinfo(mp);
+
+	/*
+	 * Finally, throw away the nandfsmount structure reference
+	 */
+	mp->mnt_data = 0;
+	MNT_ILOCK(mp);
+	mp->mnt_flag &= ~MNT_LOCAL;
+	MNT_IUNLOCK(mp);
+
+	return (0);
+}
+
+/*
+ * VFS_STATFS entry point: report block/file counts.  Block totals come
+ * from fsdata and the super block; the file count is derived from the
+ * ifile's block-group descriptors (allocated = per-group capacity minus
+ * free entries).
+ */
+static int
+nandfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+	struct nandfsmount *nmp;
+	struct nandfs_device *nandfsdev;
+	struct nandfs_fsdata *fsdata;
+	struct nandfs_super_block *sb;
+	struct nandfs_block_group_desc *groups;
+	struct nandfs_node *ifile;
+	struct nandfs_mdt *mdt;
+	struct buf *bp;
+	int i, error;
+	uint32_t entries_per_group;
+	uint64_t files = 0;
+
+	nmp = mp->mnt_data;
+	nandfsdev = nmp->nm_nandfsdev;
+	fsdata = &nandfsdev->nd_fsdata;
+	sb = &nandfsdev->nd_super;
+	ifile = nmp->nm_ifile_node;
+	mdt = &nandfsdev->nd_ifile_mdt;
+	entries_per_group = mdt->entries_per_group;
+
+	VOP_LOCK(NTOV(ifile), LK_SHARED);
+	error = nandfs_bread(ifile, 0, NOCRED, 0, &bp);
+	if (error) {
+		brelse(bp);
+		VOP_UNLOCK(NTOV(ifile), 0);
+		return (error);
+	}
+
+	groups = (struct nandfs_block_group_desc *)bp->b_data;
+
+	/*
+	 * NOTE(review): only the descriptors in the ifile's first block
+	 * are summed — assumes all group descriptors fit in one block;
+	 * confirm for large volumes.
+	 */
+	for (i = 0; i < mdt->groups_per_desc_block; i++)
+		files += (entries_per_group - groups[i].bg_nfrees);
+
+	brelse(bp);
+	VOP_UNLOCK(NTOV(ifile), 0);
+
+	sbp->f_bsize = nandfsdev->nd_blocksize;
+	sbp->f_iosize = sbp->f_bsize;
+	sbp->f_blocks = fsdata->f_blocks_per_segment * fsdata->f_nsegments;
+	sbp->f_bfree = sb->s_free_blocks_count;
+	sbp->f_bavail = sbp->f_bfree;
+	sbp->f_files = files;
+	sbp->f_ffree = 0;
+	return (0);
+}
+
+/*
+ * VFS_ROOT entry point: look up the root inode of this mount and hand
+ * back its vnode.
+ */
+static int
+nandfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+	struct nandfs_node *node;
+	struct nandfsmount *nmp;
+	int error;
+
+	nmp = VFSTONANDFS(mp);
+	error = nandfs_get_node(nmp, NANDFS_ROOT_INO, &node);
+	if (error)
+		return (error);
+
+	KASSERT(NTOV(node)->v_vflag & VV_ROOT,
+	    ("root_vp->v_vflag & VV_ROOT"));
+
+	*vpp = NTOV(node);
+
+	return (error);
+}
+
+/*
+ * VFS_VGET entry point: translate an inode number into a (possibly
+ * freshly instantiated) vnode for this mount.
+ */
+static int
+nandfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
+{
+	struct nandfs_node *node;
+	int error;
+
+	error = nandfs_get_node(VFSTONANDFS(mp), ino, &node);
+	if (node != NULL)
+		*vpp = NTOV(node);
+
+	return (error);
+}
+
+/*
+ * VFS_SYNC entry point: kick the syncer thread and wait for its pass.
+ * Lazy and suspend syncs are intentionally ignored.
+ */
+static int
+nandfs_sync(struct mount *mp, int waitfor)
+{
+	struct nandfsmount *nmp = VFSTONANDFS(mp);
+
+	DPRINTF(SYNC, ("%s: mp %p waitfor %d\n", __func__, mp, waitfor));
+
+	/*
+	 * XXX: A hack to be removed soon
+	 */
+	if (waitfor == MNT_LAZY || waitfor == MNT_SUSPEND)
+		return (0);
+
+	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_VFS_SYNC);
+	return (0);
+}
+
+/*
+ * VFS operations vector for nandfs; entry points not listed here fall
+ * back to the VFS defaults.
+ */
+static struct vfsops nandfs_vfsops = {
+	.vfs_init =		nandfs_init,
+	.vfs_mount =		nandfs_mount,
+	.vfs_root =		nandfs_root,
+	.vfs_statfs =		nandfs_statfs,
+	.vfs_uninit =		nandfs_uninit,
+	.vfs_unmount =		nandfs_unmount,
+	.vfs_vget =		nandfs_vget,
+	.vfs_sync =		nandfs_sync,
+};
+
+VFS_SET(nandfs_vfsops, nandfs, VFCF_LOOPBACK);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nandfs/nandfs_vnops.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/fs/nandfs/nandfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,2455 @@
+/*-
+ * Copyright (c) 2010-2012 Semihalf
+ * Copyright (c) 2008, 2009 Reinoud Zandijk
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * From: NetBSD: nilfs_vnops.c,v 1.2 2009/08/26 03:40:48 elad
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/fs/nandfs/nandfs_vnops.c 235537 2012-05-17 10:11:18Z gber $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/sysctl.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+#include <sys/bio.h>
+#include <sys/fcntl.h>
+#include <sys/dirent.h>
+#include <sys/stat.h>
+#include <sys/priv.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+#include <machine/_inttypes.h>
+
+#include <fs/nandfs/nandfs_mount.h>
+#include <fs/nandfs/nandfs.h>
+#include <fs/nandfs/nandfs_subr.h>
+
+extern uma_zone_t nandfs_node_zone;
+static void nandfs_read_filebuf(struct nandfs_node *, struct buf *);
+static void nandfs_itimes_locked(struct vnode *);
+static int nandfs_truncate(struct vnode *, uint64_t);
+
+static vop_pathconf_t	nandfs_pathconf;
+
+#define UPDATE_CLOSE 0
+#define UPDATE_WAIT 0
+
+/*
+ * VOP_INACTIVE: called when the vnode's use count drops to zero.
+ * When the inode was initialized (i_mode != 0) but no directory entry
+ * references it any more (link count zero), the file is dead: its
+ * data is truncated, the node destroyed on disk and the vnode
+ * recycled.
+ */
+static int
+nandfs_inactive(struct vop_inactive_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int error = 0;
+
+	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, node));
+
+	if (node == NULL) {
+		DPRINTF(NODE, ("%s: inactive NULL node\n", __func__));
+		return (0);
+	}
+
+	if (node->nn_inode.i_mode != 0 && !(node->nn_inode.i_links_count)) {
+		/* NOTE(review): nandfs_truncate() errors are ignored here. */
+		nandfs_truncate(vp, 0);
+		error = nandfs_node_destroy(node);
+		if (error)
+			nandfs_error("%s: destroy node: %p\n", __func__, node);
+		node->nn_flags = 0;
+		vrecycle(vp);
+	}
+
+	return (error);
+}
+
+/*
+ * VOP_RECLAIM: strip all nandfs state from a vnode so the VFS can
+ * reuse it.  The inode number is saved up front because the node is
+ * freed by nandfs_dispose_node() before its last use below.
+ */
+static int
+nandfs_reclaim(struct vop_reclaim_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *nandfs_node = VTON(vp);
+	struct nandfs_device *fsdev = nandfs_node->nn_nandfsdev;
+	uint64_t ino = nandfs_node->nn_ino;
+
+	DPRINTF(VNCALL, ("%s: vp:%p node:%p\n", __func__, vp, nandfs_node));
+
+	/* Invalidate all entries to a particular vnode. */
+	cache_purge(vp);
+
+	/* Destroy the vm object and flush associated pages. */
+	vnode_destroy_vobject(vp);
+
+	/* Remove from vfs hash if not system vnode */
+	if (!NANDFS_SYS_NODE(nandfs_node->nn_ino))
+		vfs_hash_remove(vp);
+
+	/* Dispose all node knowledge */
+	nandfs_dispose_node(&nandfs_node);
+
+	/*
+	 * NOTE(review): assumes non-system nodes hold the device write
+	 * lock taken elsewhere -- confirm against nandfs_get_node().
+	 */
+	if (!NANDFS_SYS_NODE(ino))
+		NANDFS_WRITEUNLOCK(fsdev);
+
+	return (0);
+}
+
+/*
+ * VOP_READ: copy file data block by block into the caller's uio.
+ * Reads are clamped to end of file; at or past EOF nothing is copied
+ * and 0 is returned.
+ */
+static int
+nandfs_read(struct vop_read_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
+	struct uio *uio = ap->a_uio;
+	struct buf *bp;
+	uint64_t fsize;
+	uint32_t bsize;
+	off_t bytesinfile;
+	ssize_t chunk, boff;
+	daddr_t blkno;
+	ssize_t left;
+	int error;
+
+	if (uio->uio_resid == 0)
+		return (0);
+
+	fsize = node->nn_inode.i_size;
+	if (uio->uio_offset >= fsize)
+		return (0);
+
+	bsize = nandfsdev->nd_blocksize;
+	bytesinfile = fsize - uio->uio_offset;
+
+	/* Never transfer past end of file. */
+	left = omin(uio->uio_resid, bytesinfile);
+
+	error = 0;
+	while (left > 0) {
+		blkno = uio->uio_offset / bsize;
+		boff = uio->uio_offset & (bsize - 1);
+
+		/* Amount of this block that still belongs to the request. */
+		chunk = omin(left, bsize - boff);
+
+		DPRINTF(READ, ("nandfs_read bn: 0x%jx toread: 0x%zx (0x%x)\n",
+		    (uintmax_t)blkno, chunk, bsize));
+
+		error = nandfs_bread(node, blkno, NOCRED, 0, &bp);
+		if (error == 0)
+			error = uiomove(bp->b_data + boff, chunk, uio);
+		brelse(bp);
+		if (error)
+			break;
+
+		left -= chunk;
+	}
+
+	return (error);
+}
+
+/*
+ * VOP_WRITE: write uio data into the file backing vp.
+ *
+ * Data is written block by block: an already-mapped block (vblk != 0)
+ * is read in, an unmapped one is freshly created; the block is then
+ * filled via uiomove() and marked dirty for the segment constructor.
+ * On success the inode size, VM object size and timestamps are
+ * updated.
+ */
+static int
+nandfs_write(struct vop_write_args *ap)
+{
+	struct nandfs_device *fsdev;
+	struct nandfs_node *node;
+	struct vnode *vp;
+	struct uio *uio;
+	struct buf *bp;
+	uint64_t file_size, vblk;
+	uint32_t blocksize;
+	ssize_t towrite, off;
+	daddr_t lbn;
+	ssize_t resid;
+	int error, ioflag, modified;
+
+	vp = ap->a_vp;
+	uio = ap->a_uio;
+	ioflag = ap->a_ioflag;
+	node = VTON(vp);
+	fsdev = node->nn_nandfsdev;
+
+	/* Refuse writes outright when the device has no space left. */
+	if (nandfs_fs_full(fsdev))
+		return (ENOSPC);
+
+	DPRINTF(WRITE, ("nandfs_write called %#zx at %#jx\n",
+	    uio->uio_resid, (uintmax_t)uio->uio_offset));
+
+	if (uio->uio_offset < 0)
+		return (EINVAL);
+	if (uio->uio_resid == 0)
+		return (0);
+
+	blocksize = fsdev->nd_blocksize;
+	file_size = node->nn_inode.i_size;
+
+	switch (vp->v_type) {
+	case VREG:
+		if (ioflag & IO_APPEND)
+			uio->uio_offset = file_size;
+		break;
+	case VDIR:
+		return (EISDIR);
+	case VLNK:
+		break;
+	default:
+		panic("%s: bad file type vp: %p", __func__, vp);
+	}
+
+	/*
+	 * NOTE(review): IO_APPEND was already applied for VREG in the
+	 * switch above; this repeats the assignment so it also covers
+	 * VLNK -- confirm whether the duplication is intentional.
+	 */
+	if (ioflag & IO_APPEND)
+		uio->uio_offset = file_size;
+
+	resid = uio->uio_resid;
+	modified = error = 0;
+
+	while (uio->uio_resid) {
+		lbn = uio->uio_offset / blocksize;
+		off = uio->uio_offset & (blocksize - 1);
+
+		towrite = omin(uio->uio_resid, blocksize - off);
+
+		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x)\n",
+		    __func__, (uintmax_t)lbn, towrite, blocksize));
+
+		error = nandfs_bmap_lookup(node, lbn, &vblk);
+		if (error)
+			break;
+
+		DPRINTF(WRITE, ("%s: lbn: 0x%jd toread: 0x%zx (0x%x) "
+		    "vblk=%jx\n", __func__, (uintmax_t)lbn, towrite, blocksize,
+		    vblk));
+
+		/* vblk == 0 marks an unmapped block: allocate it fresh. */
+		if (vblk != 0)
+			error = nandfs_bread(node, lbn, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(node, lbn, NOCRED, 0, &bp);
+
+		DPRINTF(WRITE, ("%s: vp %p bread bp %p lbn %#jx\n", __func__,
+		    vp, bp, (uintmax_t)lbn));
+		if (error) {
+			if (bp)
+				brelse(bp);
+			break;
+		}
+
+		error = uiomove((char *)bp->b_data + off, (int)towrite, uio);
+		if (error)
+			break;
+
+		/* Hand the block to the segment constructor. */
+		error = nandfs_dirty_buf(bp, 0);
+		if (error)
+			break;
+
+		modified++;
+	}
+
+	/* XXX proper handling when only part of file was properly written */
+	if (modified) {
+		/* Unprivileged writes strip the setuid/setgid bits. */
+		if (resid > uio->uio_resid && ap->a_cred &&
+		    ap->a_cred->cr_uid != 0)
+			node->nn_inode.i_mode &= ~(ISUID | ISGID);
+
+		/*
+		 * NOTE(review): on a partially failed write uio_resid is
+		 * still non-zero here, so i_size is extended past the data
+		 * actually written -- confirm this is intended.
+		 */
+		if (file_size < uio->uio_offset + uio->uio_resid) {
+			node->nn_inode.i_size = uio->uio_offset +
+			    uio->uio_resid;
+			node->nn_flags |= IN_CHANGE | IN_UPDATE;
+			vnode_pager_setsize(vp, uio->uio_offset +
+			    uio->uio_resid);
+			nandfs_itimes(vp);
+		}
+	}
+
+	DPRINTF(WRITE, ("%s: return:%d\n", __func__, error));
+
+	return (error);
+}
+
+/*
+ * VOP_CACHEDLOOKUP: look a pathname component up in directory dvp.
+ *
+ * Handles the three standard cases ('.', '..' and plain names) plus
+ * the CREATE/RENAME/DELETE variations required by VOP_LOOKUP(9).  On
+ * success *vpp holds the resolved vnode; EJUSTRETURN with SAVENAME is
+ * returned for a creatable missing last component.
+ *
+ * Fix: in the DELETE last-component case the node returned by
+ * nandfs_get_node() was dereferenced for the sticky-bit check even
+ * when the lookup failed, i.e. while 'node' was still uninitialized;
+ * that path now returns the error immediately, consistent with the
+ * neighbouring VOP_ACCESS() error paths.
+ */
+static int
+nandfs_lookup(struct vop_cachedlookup_args *ap)
+{
+	struct vnode *dvp, **vpp;
+	struct componentname *cnp;
+	struct ucred *cred;
+	struct thread *td;
+	struct nandfs_node *dir_node, *node;
+	struct nandfsmount *nmp;
+	uint64_t ino, off;
+	const char *name;
+	int namelen, nameiop, islastcn, mounted_ro;
+	int error, found;
+
+	DPRINTF(VNCALL, ("%s\n", __func__));
+
+	dvp = ap->a_dvp;
+	vpp = ap->a_vpp;
+	*vpp = NULL;
+
+	cnp = ap->a_cnp;
+	cred = cnp->cn_cred;
+	td = cnp->cn_thread;
+
+	dir_node = VTON(dvp);
+	nmp = dir_node->nn_nmp;
+
+	/* Simplify/clarification flags */
+	nameiop = cnp->cn_nameiop;
+	islastcn = cnp->cn_flags & ISLASTCN;
+	mounted_ro = dvp->v_mount->mnt_flag & MNT_RDONLY;
+
+	/*
+	 * If requesting a modify on the last path element on a read-only
+	 * filingsystem, reject lookup;
+	 */
+	if (islastcn && mounted_ro && (nameiop == DELETE || nameiop == RENAME))
+		return (EROFS);
+
+	/* A directory with no links left is already dead. */
+	if (dir_node->nn_inode.i_links_count == 0)
+		return (ENOENT);
+
+	/*
+	 * Obviously, the file is not (anymore) in the namecache, we have to
+	 * search for it. There are three basic cases: '.', '..' and others.
+	 *
+	 * Following the guidelines of VOP_LOOKUP manpage and tmpfs.
+	 */
+	error = 0;
+	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
+		DPRINTF(LOOKUP, ("\tlookup '.'\n"));
+		/* Special case 1 '.' */
+		VREF(dvp);
+		*vpp = dvp;
+		/* Done */
+	} else if (cnp->cn_flags & ISDOTDOT) {
+		/* Special case 2 '..' */
+		DPRINTF(LOOKUP, ("\tlookup '..'\n"));
+
+		/* Get our node */
+		name = "..";
+		namelen = 2;
+		error = nandfs_lookup_name_in_dir(dvp, name, namelen, &ino,
+		    &found, &off);
+		if (error)
+			goto out;
+		if (!found)
+			error = ENOENT;
+
+		/* First unlock parent */
+		VOP_UNLOCK(dvp, 0);
+
+		if (error == 0) {
+			DPRINTF(LOOKUP, ("\tfound '..'\n"));
+			/* Try to create/reuse the node */
+			error = nandfs_get_node(nmp, ino, &node);
+
+			if (!error) {
+				DPRINTF(LOOKUP,
+				    ("\tnode retrieved/created OK\n"));
+				*vpp = NTOV(node);
+			}
+		}
+
+		/* Try to relock parent */
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+	} else {
+		DPRINTF(LOOKUP, ("\tlookup file\n"));
+		/* All other files */
+		/* Look up filename in the directory returning its inode */
+		name = cnp->cn_nameptr;
+		namelen = cnp->cn_namelen;
+		error = nandfs_lookup_name_in_dir(dvp, name, namelen,
+		    &ino, &found, &off);
+		if (error)
+			goto out;
+		if (!found) {
+			DPRINTF(LOOKUP, ("\tNOT found\n"));
+			/*
+			 * UGH, didn't find name. If we're creating or
+			 * renaming on the last name this is OK and we ought
+			 * to return EJUSTRETURN if its allowed to be created.
+			 */
+			error = ENOENT;
+			if ((nameiop == CREATE || nameiop == RENAME) &&
+			    islastcn) {
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    td);
+				if (!error) {
+					/* keep the component name */
+					cnp->cn_flags |= SAVENAME;
+					error = EJUSTRETURN;
+				}
+			}
+			/* Done */
+		} else {
+			if (ino == NANDFS_WHT_INO)
+				cnp->cn_flags |= ISWHITEOUT;
+
+			if ((cnp->cn_flags & ISWHITEOUT) &&
+			    (nameiop == LOOKUP))
+				return (ENOENT);
+
+			if ((nameiop == DELETE) && islastcn) {
+				if ((cnp->cn_flags & ISWHITEOUT) &&
+				    (cnp->cn_flags & DOWHITEOUT)) {
+					cnp->cn_flags |= SAVENAME;
+					dir_node->nn_diroff = off;
+					return (EJUSTRETURN);
+				}
+
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    cnp->cn_thread);
+				if (error)
+					return (error);
+
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (error)
+					return (error);
+				*vpp = NTOV(node);
+				node->nn_diroff = off;
+
+				/*
+				 * Sticky directory: only root, the
+				 * directory owner or the file owner may
+				 * delete the entry.
+				 */
+				if ((dir_node->nn_inode.i_mode & ISVTX) &&
+				    cred->cr_uid != 0 &&
+				    cred->cr_uid != dir_node->nn_inode.i_uid &&
+				    node->nn_inode.i_uid != cred->cr_uid) {
+					vput(*vpp);
+					*vpp = NULL;
+					return (EPERM);
+				}
+			} else if ((nameiop == RENAME) && islastcn) {
+				error = VOP_ACCESS(dvp, VWRITE, cred,
+				    cnp->cn_thread);
+				if (error)
+					return (error);
+
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (!error) {
+					*vpp = NTOV(node);
+					node->nn_diroff = off;
+				}
+			} else {
+				/* Try to create/reuse the node */
+				error = nandfs_get_node(nmp, ino, &node);
+				if (!error) {
+					*vpp = NTOV(node);
+					node->nn_diroff = off;
+				}
+			}
+		}
+	}
+
+out:
+	/*
+	 * Store result in the cache if requested. If we are creating a file,
+	 * the file might not be found and thus putting it into the namecache
+	 * might be seen as negative caching.
+	 */
+	if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
+		cache_enter(dvp, *vpp, cnp);
+
+	return (error);
+
+}
+
+/*
+ * VOP_GETATTR: fill *vap from the in-core nandfs inode.
+ */
+static int
+nandfs_getattr(struct vop_getattr_args *ap)
+{
+	struct vattr *vap = ap->a_vap;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *ino = &node->nn_inode;
+
+	DPRINTF(VNCALL, ("%s: vp: %p\n", __func__, vp));
+
+	/* Fold any pending timestamp updates into the inode first. */
+	nandfs_itimes(vp);
+
+	VATTR_NULL(vap);
+	/*
+	 * No separate access time appears to be kept in the inode;
+	 * the modification time is reported as atime as well.
+	 */
+	vap->va_atime.tv_sec = ino->i_mtime;
+	vap->va_atime.tv_nsec = ino->i_mtime_nsec;
+	vap->va_mtime.tv_sec = ino->i_mtime;
+	vap->va_mtime.tv_nsec = ino->i_mtime_nsec;
+	vap->va_ctime.tv_sec = ino->i_ctime;
+	vap->va_ctime.tv_nsec = ino->i_ctime_nsec;
+	vap->va_type = IFTOVT(ino->i_mode);
+	vap->va_mode = ino->i_mode & ~S_IFMT;
+	vap->va_nlink = ino->i_links_count;
+	vap->va_uid = ino->i_uid;
+	vap->va_gid = ino->i_gid;
+	vap->va_rdev = ino->i_special;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = node->nn_ino;
+	vap->va_size = ino->i_size;
+	vap->va_blocksize = node->nn_nandfsdev->nd_blocksize;
+	vap->va_gen = 0;
+	vap->va_flags = ino->i_flags;
+	vap->va_bytes = ino->i_blocks * vap->va_blocksize;
+	vap->va_filerev = 0;
+	vap->va_vaflags = 0;
+
+	return (0);
+}
+
+/*
+ * Throw away all buffers of vp whose logical block number is >= nblks.
+ * Used by nandfs_truncate() to drop cached data beyond the new EOF.
+ */
+static int
+nandfs_vtruncbuf(struct vnode *vp, uint64_t nblks)
+{
+	struct nandfs_device *nffsdev;
+	struct bufobj *bo;
+	struct buf *bp, *nbp;
+
+	bo = &vp->v_bufobj;
+	nffsdev = VTON(vp)->nn_nandfsdev;
+
+	ASSERT_VOP_LOCKED(vp, "nandfs_truncate");
+restart:
+	BO_LOCK(bo);
+restart_locked:
+	/* First pass: invalidate clean buffers past the new end. */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
+		if (bp->b_lblkno < nblks)
+			continue;
+		/*
+		 * Trylock only; on contention rescan the clean list
+		 * (the bufobj lock was never dropped here).
+		 */
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
+			goto restart_locked;
+
+		bremfree(bp);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		BO_UNLOCK(bo);
+		brelse(bp);
+		BO_LOCK(bo);
+	}
+
+	/* Second pass: invalidate dirty buffers past the new end. */
+	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+		if (bp->b_lblkno < nblks)
+			continue;
+		/*
+		 * Sleeping lock; LK_INTERLOCK hands off the bufobj
+		 * mutex, so on failure reacquire it via restart.
+		 */
+		if (BUF_LOCK(bp,
+		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+		    BO_MTX(bo)) == ENOLCK)
+			goto restart;
+		bp->b_flags |= (B_INVAL | B_RELBUF);
+		bp->b_flags &= ~(B_ASYNC | B_MANAGED);
+		brelse(bp);
+		/* Keep the device's dirty-buffer accounting in step. */
+		nandfs_dirty_bufs_decrement(nffsdev);
+		BO_LOCK(bo);
+	}
+
+	BO_UNLOCK(bo);
+
+	return (0);
+}
+
+/*
+ * Resize the file backing vp to newsize bytes.
+ *
+ * Growing only updates the inode and VM object sizes (the file is
+ * extended sparsely).  Shrinking zeroes the surviving tail of the new
+ * last block, truncates the block mapping and drops cached buffers
+ * past the new end.
+ */
+static int
+nandfs_truncate(struct vnode *vp, uint64_t newsize)
+{
+	struct nandfs_device *nffsdev;
+	struct nandfs_node *node;
+	struct nandfs_inode *inode;
+	struct buf *bp = NULL;
+	uint64_t oblks, nblks, vblk, size, rest;
+	int error;
+
+	node = VTON(vp);
+	nffsdev = node->nn_nandfsdev;
+	inode = &node->nn_inode;
+
+	/* Calculate end of file */
+	size = inode->i_size;
+
+	/* Same size: just refresh the change/update timestamps. */
+	if (newsize == size) {
+		node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	/* Growing: no block work needed. */
+	if (newsize > size) {
+		inode->i_size = newsize;
+		vnode_pager_setsize(vp, newsize);
+		node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	nblks = howmany(newsize, nffsdev->nd_blocksize);
+	oblks = howmany(size, nffsdev->nd_blocksize);
+	rest = newsize % nffsdev->nd_blocksize;
+
+	/* Zero the part of the new last block that lies past EOF. */
+	if (rest) {
+		error = nandfs_bmap_lookup(node, nblks - 1, &vblk);
+		if (error)
+			return (error);
+
+		/* vblk == 0 means the block is not mapped yet. */
+		if (vblk != 0)
+			error = nandfs_bread(node, nblks - 1, NOCRED, 0, &bp);
+		else
+			error = nandfs_bcreate(node, nblks - 1, NOCRED, 0, &bp);
+
+		if (error) {
+			if (bp)
+				brelse(bp);
+			return (error);
+		}
+
+		bzero((char *)bp->b_data + rest,
+		    (u_int)(nffsdev->nd_blocksize - rest));
+		error = nandfs_dirty_buf(bp, 0);
+		if (error)
+			return (error);
+	}
+
+	DPRINTF(VNCALL, ("%s: vp %p oblks %jx nblks %jx\n", __func__, vp, oblks,
+	    nblks));
+
+	/* Drop the mapping for all blocks past the new last block. */
+	error = nandfs_bmap_truncate_mapping(node, oblks - 1, nblks - 1);
+	if (error) {
+		if (bp)
+			nandfs_undirty_buf(bp);
+		return (error);
+	}
+
+	/* Throw away cached buffers past the new end. */
+	error = nandfs_vtruncbuf(vp, nblks);
+	if (error) {
+		if (bp)
+			nandfs_undirty_buf(bp);
+		return (error);
+	}
+
+	inode->i_size = newsize;
+	vnode_pager_setsize(vp, newsize);
+	node->nn_flags |= IN_CHANGE | IN_UPDATE;
+	nandfs_itimes(vp);
+
+	return (error);
+}
+
+/*
+ * Apply pending timestamp updates to the in-core inode.  The vnode
+ * interlock must be held by the caller.
+ */
+static void
+nandfs_itimes_locked(struct vnode *vp)
+{
+	struct nandfs_node *node;
+	struct nandfs_inode *inode;
+	struct timespec now;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+
+	node = VTON(vp);
+	inode = &node->nn_inode;
+
+	/* Fast path: no timestamp work is queued on this node. */
+	if ((node->nn_flags & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
+		return;
+
+	if ((node->nn_flags & (IN_CHANGE | IN_UPDATE)) ||
+	    ((vp->v_mount->mnt_kern_flag &
+	    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0))
+		node->nn_flags |= IN_MODIFIED;
+
+	vfs_timestamp(&now);
+	if (node->nn_flags & IN_UPDATE) {
+		inode->i_mtime = now.tv_sec;
+		inode->i_mtime_nsec = now.tv_nsec;
+	}
+	if (node->nn_flags & IN_CHANGE) {
+		inode->i_ctime = now.tv_sec;
+		inode->i_ctime_nsec = now.tv_nsec;
+	}
+
+	/* Everything pending has been folded in. */
+	node->nn_flags &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
+}
+
+/*
+ * Wrapper around nandfs_itimes_locked() that takes and drops the
+ * vnode interlock for the caller.
+ */
+void
+nandfs_itimes(struct vnode *vp)
+{
+
+	VI_LOCK(vp);
+	nandfs_itimes_locked(vp);
+	VI_UNLOCK(vp);
+}
+
+/*
+ * Change the permission bits of a file, following chmod(2) policy:
+ * VADMIN is required; the sticky bit on non-directories, setgid for
+ * foreign groups and setuid on files we do not own all need extra
+ * privilege.
+ */
+static int
+nandfs_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
+{
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	uint16_t newmode;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: vp %p, mode %x, cred %p, td %p\n", __func__, vp,
+	    mode, cred, td));
+
+	/* Changing permissions requires VADMIN on the file. */
+	error = VOP_ACCESS(vp, VADMIN, cred, td);
+	if (error)
+		return (error);
+
+	/* Sticky bit on a non-directory requires privilege. */
+	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
+		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0))
+			return (EFTYPE);
+	}
+
+	/* Setgid with a group we are not a member of requires privilege. */
+	if (!groupmember(inode->i_gid, cred) && (mode & ISGID)) {
+		error = priv_check_cred(cred, PRIV_VFS_SETGID, 0);
+		if (error)
+			return (error);
+	}
+
+	/* Setuid on a file we do not own requires privilege. */
+	if ((mode & ISUID) && inode->i_uid != cred->cr_uid) {
+		error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0);
+		if (error)
+			return (error);
+	}
+
+	/* Keep the file-type bits, replace the permission bits. */
+	newmode = (inode->i_mode & ~ALLPERMS) | (mode & ALLPERMS);
+	inode->i_mode = newmode;
+	node->nn_flags |= IN_CHANGE;
+
+	DPRINTF(VNCALL, ("%s: to mode %x\n", __func__, newmode));
+
+	return (error);
+}
+
+/*
+ * Change file owner and/or group, following chown(2) policy: the
+ * caller needs VWRITE_OWNER, and moving the file to a foreign owner
+ * or group requires PRIV_VFS_CHOWN.  An unprivileged ownership change
+ * strips the setuid/setgid bits.
+ */
+static int
+nandfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
+    struct thread *td)
+{
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	uid_t olduid;
+	gid_t oldgid;
+	int error;
+
+	/* VNOVAL means "leave this id unchanged". */
+	if (uid == (uid_t)VNOVAL)
+		uid = inode->i_uid;
+	if (gid == (gid_t)VNOVAL)
+		gid = inode->i_gid;
+
+	/* Changing ownership requires owner rights on the file. */
+	error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td);
+	if (error)
+		return (error);
+
+	/*
+	 * Giving the file away, or switching to a group we are not a
+	 * member of, requires privilege.
+	 */
+	if (((uid != inode->i_uid && uid != cred->cr_uid) ||
+	    (gid != inode->i_gid && !groupmember(gid, cred))) &&
+	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0)))
+		return (error);
+
+	olduid = inode->i_uid;
+	oldgid = inode->i_gid;
+	inode->i_uid = uid;
+	inode->i_gid = gid;
+	node->nn_flags |= IN_CHANGE;
+
+	/* Without privilege, a real change clears setuid/setgid. */
+	if ((inode->i_mode & (ISUID | ISGID)) &&
+	    (olduid != uid || oldgid != gid)) {
+		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0))
+			inode->i_mode &= ~(ISUID | ISGID);
+	}
+
+	DPRINTF(VNCALL, ("%s: vp %p, cred %p, td %p - ret OK\n", __func__, vp,
+	    cred, td));
+	return (0);
+}
+
+/*
+ * VOP_SETATTR: apply the settable fields of *vap to the file.
+ *
+ * Rejects attributes that can never be set, then handles flags,
+ * size (truncate/extend), ownership, mode and timestamps in turn.
+ * Each group is checked against read-only mounts and the immutable/
+ * append file flags before being applied.
+ */
+static int
+nandfs_setattr(struct vop_setattr_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	struct vattr *vap = ap->a_vap;
+	struct ucred *cred = ap->a_cred;
+	struct thread *td = curthread;
+	uint32_t flags;
+	int error = 0;
+
+	/* These attributes are read-only by definition. */
+	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
+	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
+	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
+	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
+		DPRINTF(VNCALL, ("%s: unsettable attribute\n", __func__));
+		return (EINVAL);
+	}
+
+	if (vap->va_flags != VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p flags:%lx\n", __func__, vp,
+		    td, vap->va_flags));
+
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		/*
+		 * Callers may only modify the file flags on objects they
+		 * have VADMIN rights for.
+		 */
+		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
+			return (error);
+		/*
+		 * Unprivileged processes are not permitted to unset system
+		 * flags, or modify flags if any system flags are set.
+		 * Privileged non-jail processes may not modify system flags
+		 * if securelevel > 0 and any existing system flags are set.
+		 * Privileged jail processes behave like privileged non-jail
+		 * processes if the security.jail.chflags_allowed sysctl is
+		 * is non-zero; otherwise, they behave like unprivileged
+		 * processes.
+		 */
+
+		flags = inode->i_flags;
+		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) {
+			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
+				error = securelevel_gt(cred, 0);
+				if (error)
+					return (error);
+			}
+			/* Snapshot flag cannot be set or cleared */
+			if (((vap->va_flags & SF_SNAPSHOT) != 0 &&
+			    (flags & SF_SNAPSHOT) == 0) ||
+			    ((vap->va_flags & SF_SNAPSHOT) == 0 &&
+			    (flags & SF_SNAPSHOT) != 0))
+				return (EPERM);
+
+			inode->i_flags = vap->va_flags;
+		} else {
+			/* Unprivileged: only user flags may be touched. */
+			if (flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
+			    (vap->va_flags & UF_SETTABLE) != vap->va_flags)
+				return (EPERM);
+
+			flags &= SF_SETTABLE;
+			flags |= (vap->va_flags & UF_SETTABLE);
+			inode->i_flags = flags;
+		}
+		node->nn_flags |= IN_CHANGE;
+		/* Newly immutable/append files take no further changes. */
+		if (vap->va_flags & (IMMUTABLE | APPEND))
+			return (0);
+	}
+	if (inode->i_flags & (IMMUTABLE | APPEND))
+		return (EPERM);
+
+	if (vap->va_size != (u_quad_t)VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p size:%jx\n", __func__, vp, td,
+		    (uintmax_t)vap->va_size));
+
+		switch (vp->v_type) {
+		case VDIR:
+			return (EISDIR);
+		case VLNK:
+		case VREG:
+			if (vp->v_mount->mnt_flag & MNT_RDONLY)
+				return (EROFS);
+			if ((inode->i_flags & SF_SNAPSHOT) != 0)
+				return (EPERM);
+			break;
+		default:
+			return (0);
+		}
+
+		if (vap->va_size > node->nn_nandfsdev->nd_maxfilesize)
+			return (EFBIG);
+
+		KASSERT((vp->v_type == VREG), ("Set size %d", vp->v_type));
+		/* NOTE(review): nandfs_truncate() errors are ignored here. */
+		nandfs_truncate(vp, vap->va_size);
+		node->nn_flags |= IN_CHANGE;
+
+		return (0);
+	}
+
+	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p uid/gid %x/%x\n", __func__,
+		    vp, td, vap->va_uid, vap->va_gid));
+		error = nandfs_chown(vp, vap->va_uid, vap->va_gid, cred, td);
+		if (error)
+			return (error);
+	}
+
+	if (vap->va_mode != (mode_t)VNOVAL) {
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return (EROFS);
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p mode %x\n", __func__, vp, td,
+		    vap->va_mode));
+
+		error = nandfs_chmod(vp, (int)vap->va_mode, cred, td);
+		if (error)
+			return (error);
+	}
+	if (vap->va_atime.tv_sec != VNOVAL ||
+	    vap->va_mtime.tv_sec != VNOVAL ||
+	    vap->va_birthtime.tv_sec != VNOVAL) {
+		DPRINTF(VNCALL, ("%s: vp:%p td:%p time a/m/b %jx/%jx/%jx\n",
+		    __func__, vp, td, (uintmax_t)vap->va_atime.tv_sec,
+		    (uintmax_t)vap->va_mtime.tv_sec,
+		    (uintmax_t)vap->va_birthtime.tv_sec));
+
+		if (vap->va_atime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_ACCESS;
+		if (vap->va_mtime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_CHANGE | IN_UPDATE;
+		if (vap->va_birthtime.tv_sec != VNOVAL)
+			node->nn_flags |= IN_MODIFIED;
+		nandfs_itimes(vp);
+		return (0);
+	}
+
+	return (0);
+}
+
+/*
+ * VOP_OPEN: validate the open mode and set up the backing VM object.
+ */
+static int
+nandfs_open(struct vop_open_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+
+	DPRINTF(VNCALL, ("nandfs_open called ap->a_mode %x\n", ap->a_mode));
+
+	/* Device special files are not served from here. */
+	if (vp->v_type == VCHR || vp->v_type == VBLK)
+		return (EOPNOTSUPP);
+
+	/* Append-only files may only be opened for appending writes. */
+	if ((node->nn_inode.i_flags & APPEND) &&
+	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
+		return (EPERM);
+
+	vnode_create_vobject(vp, node->nn_inode.i_size, ap->a_td);
+
+	return (0);
+}
+
+/*
+ * VOP_CLOSE: fold pending timestamps in while other users remain.
+ */
+static int
+nandfs_close(struct vop_close_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, VTON(vp)));
+
+	VI_LOCK(vp);
+	if (vp->v_usecount > 1)
+		nandfs_itimes_locked(vp);
+	VI_UNLOCK(vp);
+
+	return (0);
+}
+
+/*
+ * Check whether the vnode itself admits the requested access at all,
+ * independent of the caller's credentials.
+ */
+static int
+nandfs_check_possible(struct vnode *vp, struct vattr *vap, mode_t mode)
+{
+
+	switch (vap->va_type) {
+	case VBLK:
+	case VCHR:
+	case VSOCK:
+	case VFIFO:
+		/*
+		 * Special nodes: writable even on a read-only mount,
+		 * provided the permissions allow it.
+		 */
+		break;
+	case VDIR:
+	case VLNK:
+	case VREG:
+		/* Normal nodes: refuse writes on a read-only mount. */
+		if ((mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY))
+			return (EROFS);
+		break;
+	default:
+		/* No idea what this node is. */
+		return (EINVAL);
+	}
+
+	/* Immutable files may never be written. */
+	if ((mode & VWRITE) && (VTON(vp)->nn_inode.i_flags & IMMUTABLE))
+		return (EPERM);
+
+	return (0);
+}
+
+/*
+ * Defer the credential check to the generic UNIX permission logic.
+ */
+static int
+nandfs_check_permitted(struct vnode *vp, struct vattr *vap, mode_t mode,
+    struct ucred *cred)
+{
+
+	return (vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
+	    mode, cred, NULL));
+}
+
+/*
+ * VOP_ADVLOCK: delegate POSIX advisory locking to the generic code.
+ */
+static int
+nandfs_advlock(struct vop_advlock_args *ap)
+{
+	struct nandfs_node *node;
+	quad_t size;
+
+	node = VTON(ap->a_vp);
+	/* lf_advlock() needs the current size for SEEK_END ranges. */
+	size = node->nn_inode.i_size;
+	return (lf_advlock(ap, &(node->nn_lockf), size));
+}
+
+/*
+ * VOP_ACCESS: combine the node-level and credential-level checks.
+ */
+static int
+nandfs_access(struct vop_access_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct ucred *cred = ap->a_cred;
+	accmode_t accmode = ap->a_accmode;
+	struct vattr vattr;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: vp:%p mode: %x\n", __func__, vp, accmode));
+
+	/* Snapshot the attributes the checks below rely on. */
+	error = VOP_GETATTR(vp, &vattr, NULL);
+	if (error != 0)
+		return (error);
+
+	/* Is this kind of access possible on this node at all? */
+	error = nandfs_check_possible(vp, &vattr, accmode);
+	if (error != 0)
+		return (error);
+
+	/* Do the caller's credentials permit it? */
+	return (nandfs_check_permitted(vp, &vattr, accmode, cred));
+}
+
+/*
+ * VOP_PRINT: dump node state for vprint()/ddb.
+ */
+static int
+nandfs_print(struct vop_print_args *ap)
+{
+	struct nandfs_node *node = VTON(ap->a_vp);
+
+	printf("\tvp=%p, nandfs_node=%p\n", ap->a_vp, node);
+	printf("nandfs inode %#jx\n", (uintmax_t)node->nn_ino);
+	printf("flags = 0x%b\n", (u_int)node->nn_flags, PRINT_NODE_FLAGS);
+
+	return (0);
+}
+
+/*
+ * Service a read buffer for VOP_STRATEGY: translate the file-relative
+ * block number to a device block address and issue the I/O, or clear
+ * the buffer when the block is an unmapped hole.  Errors are reported
+ * through the buffer (b_error/BIO_ERROR) and bufdone().
+ */
+static void
+nandfs_read_filebuf(struct nandfs_node *node, struct buf *bp)
+{
+	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
+	struct buf *nbp;
+	nandfs_daddr_t vblk, pblk;
+	nandfs_lbn_t from;
+	uint32_t blocksize;
+	int error = 0;
+	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;
+
+	/*
+	 * Translate all the block sectors into a series of buffers to read
+	 * asynchronously from the nandfs device. Note that this lookup may
+	 * induce readin's too.
+	 */
+
+	blocksize = nandfsdev->nd_blocksize;
+	/* Only single-block buffers are expected here. */
+	if (bp->b_bcount / blocksize != 1)
+		panic("invalid b_count in bp %p\n", bp);
+
+	from = bp->b_blkno;
+
+	DPRINTF(READ, ("\tread in from inode %#jx blkno %#jx"
+	    " count %#lx\n", (uintmax_t)node->nn_ino, from,
+	    bp->b_bcount));
+
+	/* Get virtual block numbers for the vnode's buffer span */
+	error = nandfs_bmap_lookup(node, from, &vblk);
+	if (error) {
+		bp->b_error = EINVAL;
+		bp->b_ioflags |= BIO_ERROR;
+		bufdone(bp);
+		return;
+	}
+
+	/* Translate virtual block numbers to physical block numbers */
+	error = nandfs_vtop(node, vblk, &pblk);
+	if (error) {
+		bp->b_error = EINVAL;
+		bp->b_ioflags |= BIO_ERROR;
+		bufdone(bp);
+		return;
+	}
+
+	/* Issue translated blocks */
+	bp->b_resid = bp->b_bcount;
+
+	/* Note virtual block 0 marks not mapped */
+	if (vblk == 0) {
+		/* A hole reads back as zeroes. */
+		vfs_bio_clrbuf(bp);
+		bufdone(bp);
+		return;
+	}
+
+	/* Hand the buffer to the device with a rewritten block address. */
+	nbp = bp;
+	nbp->b_blkno = pblk * blk2dev;
+	bp->b_iooffset = dbtob(nbp->b_blkno);
+	MPASS(bp->b_iooffset >= 0);
+	BO_STRATEGY(&nandfsdev->nd_devvp->v_bufobj, nbp);
+	nandfs_vblk_set(bp, vblk);
+	DPRINTF(READ, ("read_filebuf : ino %#jx blk %#jx -> "
+	    "%#jx -> %#jx [bp %p]\n", (uintmax_t)node->nn_ino,
+	    (uintmax_t)(from), (uintmax_t)vblk,
+	    (uintmax_t)pblk, nbp));
+}
+
+/*
+ * Issue a write buffer to the underlying device.  b_blkno already
+ * holds a device block address at this point.
+ */
+static void
+nandfs_write_filebuf(struct nandfs_node *node, struct buf *bp)
+{
+	struct bufobj *bo;
+
+	bo = &node->nn_nandfsdev->nd_devvp->v_bufobj;
+	bp->b_iooffset = dbtob(bp->b_blkno);
+	MPASS(bp->b_iooffset >= 0);
+	BO_STRATEGY(bo, bp);
+}
+
+/*
+ * VOP_STRATEGY: route buffer I/O to the read or write path.
+ */
+static int
+nandfs_strategy(struct vop_strategy_args *ap)
+{
+	struct buf *bp = ap->a_bp;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+
+	/* check if we ought to be here */
+	KASSERT((vp->v_type != VBLK && vp->v_type != VCHR),
+	    ("nandfs_strategy on type %d", vp->v_type));
+
+	if (bp->b_iocmd == BIO_READ) {
+		/* Reads are translated and issued immediately. */
+		nandfs_read_filebuf(node, bp);
+		return (0);
+	}
+
+	/* Writes go to the segment collector. */
+	nandfs_write_filebuf(node, bp);
+	return (0);
+}
+
+/*
+ * VOP_READDIR: convert on-disk nandfs directory entries into struct
+ * dirent records and copy them out via uio.  The transfer stops when
+ * the uio can no longer hold a whole dirent; uio_offset is advanced
+ * to the last fully transferred entry.
+ */
+static int
+nandfs_readdir(struct vop_readdir_args *ap)
+{
+	struct uio *uio = ap->a_uio;
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_dir_entry *ndirent;
+	struct dirent dirent;
+	struct buf *bp;
+	uint64_t file_size, diroffset, transoffset, blkoff;
+	uint64_t blocknr;
+	uint32_t blocksize = node->nn_nandfsdev->nd_blocksize;
+	uint8_t *pos, name_len;
+	int error;
+
+	DPRINTF(READDIR, ("nandfs_readdir called\n"));
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	file_size = node->nn_inode.i_size;
+	DPRINTF(READDIR, ("nandfs_readdir filesize %jd resid %zd\n",
+	    (uintmax_t)file_size, uio->uio_resid ));
+
+	/* We are called just as long as we keep on pushing data in */
+	error = 0;
+	if ((uio->uio_offset < file_size) &&
+	    (uio->uio_resid >= sizeof(struct dirent))) {
+		diroffset = uio->uio_offset;
+		transoffset = diroffset;
+
+		blocknr = diroffset / blocksize;
+		blkoff = diroffset % blocksize;
+		error = nandfs_bread(node, blocknr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (EIO);
+		}
+		while (diroffset < file_size) {
+			DPRINTF(READDIR, ("readdir : offset = %"PRIu64"\n",
+			    diroffset));
+			/* Crossed a block boundary: fetch the next block. */
+			if (blkoff >= blocksize) {
+				blkoff = 0; blocknr++;
+				brelse(bp);
+				error = nandfs_bread(node, blocknr, NOCRED, 0,
+				    &bp);
+				if (error) {
+					brelse(bp);
+					return (EIO);
+				}
+			}
+
+			/* Read in one dirent */
+			pos = (uint8_t *)bp->b_data + blkoff;
+			ndirent = (struct nandfs_dir_entry *)pos;
+
+			name_len = ndirent->name_len;
+			memset(&dirent, 0, sizeof(struct dirent));
+			/* inode 0 marks an unused (deleted) entry. */
+			dirent.d_fileno = ndirent->inode;
+			if (dirent.d_fileno) {
+				dirent.d_type = ndirent->file_type;
+				dirent.d_namlen = name_len;
+				/* dirent was zeroed, so d_name stays NUL-terminated. */
+				strncpy(dirent.d_name, ndirent->name, name_len);
+				dirent.d_reclen = GENERIC_DIRSIZ(&dirent);
+				DPRINTF(READDIR, ("copying `%*.*s`\n", name_len,
+				    name_len, dirent.d_name));
+			}
+
+			/*
+			 * If there isn't enough space in the uio to return a
+			 * whole dirent, break off read
+			 */
+			if (uio->uio_resid < GENERIC_DIRSIZ(&dirent))
+				break;
+
+			/* Transfer */
+			if (dirent.d_fileno)
+				uiomove(&dirent, GENERIC_DIRSIZ(&dirent), uio);
+
+			/* Advance */
+			diroffset += ndirent->rec_len;
+			blkoff += ndirent->rec_len;
+
+			/* Remember the last entry we transfered */
+			transoffset = diroffset;
+		}
+		brelse(bp);
+
+		/* Pass on last transfered offset */
+		uio->uio_offset = transoffset;
+	}
+
+	if (ap->a_eofflag)
+		*ap->a_eofflag = (uio->uio_offset >= file_size);
+
+	return (error);
+}
+
+static int
+nandfs_dirempty(struct vnode *dvp, uint64_t parentino, struct ucred *cred)
+{
+	struct nandfs_node *dnode = VTON(dvp);
+	struct nandfs_dir_entry *dirent;
+	uint64_t file_size = dnode->nn_inode.i_size;
+	uint64_t blockcount = dnode->nn_inode.i_blocks;
+	uint64_t blocknr;
+	uint32_t blocksize = dnode->nn_nandfsdev->nd_blocksize;
+	uint32_t limit;
+	uint32_t off;
+	uint8_t	*pos;
+	struct buf *bp;
+	int error;
+
+	DPRINTF(LOOKUP, ("%s: dvp %p parentino %#jx cred %p\n", __func__, dvp,
+	    (uintmax_t)parentino, cred));
+
+	KASSERT((file_size != 0), ("nandfs_dirempty for NULL dir %p", dvp));
+
+	blocknr = 0;
+	while (blocknr < blockcount) {
+		error = nandfs_bread(dnode, blocknr, NOCRED, 0, &bp);
+		if (error) {
+			brelse(bp);
+			return (0);
+		}
+
+		pos = (uint8_t *)bp->b_data;
+		off = 0;
+
+		if (blocknr == (blockcount - 1))
+			limit = file_size % blocksize;
+		else
+			limit = blocksize;
+
+		while (off < limit) {
+			dirent = (struct nandfs_dir_entry *)(pos + off);
+			off += dirent->rec_len;
+
+			if (dirent->inode == 0)
+				continue;
+
+			switch (dirent->name_len) {
+			case 0:
+				break;
+			case 1:
+				if (dirent->name[0] != '.')
+					goto notempty;
+
+				KASSERT(dirent->inode == dnode->nn_ino,
+				    (".'s inode does not match dir"));
+				break;
+			case 2:
+				if (dirent->name[0] != '.' &&
+				    dirent->name[1] != '.')
+					goto notempty;
+
+				KASSERT(dirent->inode == parentino,
+				    ("..'s inode does not match parent"));
+				break;
+			default:
+				goto notempty;
+			}
+		}
+
+		brelse(bp);
+		blocknr++;
+	}
+
+	return (1);
+notempty:
+	brelse(bp);
+	return (0);
+}
+
+static int
+nandfs_link(struct vop_link_args *ap)
+{
+	struct vnode *tdvp = ap->a_tdvp;
+	struct vnode *vp = ap->a_vp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_inode *inode = &node->nn_inode;
+	int error;
+
+	if (tdvp->v_mount != vp->v_mount)
+		return (EXDEV);
+
+	if (inode->i_links_count >= LINK_MAX)
+		return (EMLINK);
+
+	if (inode->i_flags & (IMMUTABLE | APPEND))
+		return (EPERM);
+
+	/* Update link count */
+	inode->i_links_count++;
+
+	/* Add dir entry */
+	error = nandfs_add_dirent(tdvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(inode->i_mode));
+	if (error) {
+		inode->i_links_count--;
+	}
+
+	node->nn_flags |= IN_CHANGE;
+	nandfs_itimes(vp);
+	DPRINTF(VNCALL, ("%s: tdvp %p vp %p cnp %p\n",
+	    __func__, tdvp, vp, cnp));
+
+	return (0);
+}
+
+static int
+nandfs_create(struct vop_create_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int error;
+
+	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	/* Create new vnode/inode */
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	/* Add new dir entry */
+	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode));
+	if (error) {
+		if (nandfs_node_destroy(node)) {
+			nandfs_error("%s: error destroying node %p\n",
+			    __func__, node);
+		}
+		return (error);
+	}
+	*vpp = NTOV(node);
+
+	DPRINTF(VNCALL, ("created file vp %p nandnode %p ino %jx\n", *vpp, node,
+	    (uintmax_t)node->nn_ino));
+	return (0);
+}
+
+static int
+nandfs_remove(struct vop_remove_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vnode *dvp = ap->a_dvp;
+	struct nandfs_node *node = VTON(vp);
+	struct nandfs_node *dnode = VTON(dvp);
+	struct componentname *cnp = ap->a_cnp;
+
+	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx link %d\n",
+	    __func__, dvp, vp, node, (uintmax_t)node->nn_ino,
+	    node->nn_inode.i_links_count));
+
+	if (vp->v_type == VDIR)
+		return (EISDIR);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
+	    (dnode->nn_inode.i_flags & APPEND))
+		return (EPERM);
+
+	nandfs_remove_dirent(dvp, node, cnp);
+	node->nn_inode.i_links_count--;
+	node->nn_flags |= IN_CHANGE;
+
+	return (0);
+}
+
+/*
+ * Check if source directory is in the path of the target directory.
+ * Target is supplied locked, source is unlocked.
+ * The target is always vput before returning.
+ */
static int
nandfs_checkpath(struct nandfs_node *src, struct nandfs_node *dest,
    struct ucred *cred)
{
	struct vnode *vp;
	int error, rootino;
	struct nandfs_dir_entry dirent;

	/* Walk up from dest via ".." until we hit the root or src. */
	vp = NTOV(dest);
	if (src->nn_ino == dest->nn_ino) {
		error = EEXIST;
		goto out;
	}
	rootino = NANDFS_ROOT_INO;
	error = 0;
	if (dest->nn_ino == rootino)
		goto out;

	for (;;) {
		if (vp->v_type != VDIR) {
			error = ENOTDIR;
			break;
		}

		/*
		 * Read the second entry of the directory block; on-disk
		 * layout puts ".." right after "." at offset
		 * NANDFS_DIR_REC_LEN(2) bytes into the first block.
		 */
		error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirent,
		    NANDFS_DIR_REC_LEN(2), (off_t)0, UIO_SYSSPACE,
		    IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED,
		    NULL, NULL);
		if (error != 0)
			break;
		if (dirent.name_len != 2 ||
		    dirent.name[0] != '.' ||
		    dirent.name[1] != '.') {
			error = ENOTDIR;
			break;
		}
		/* src is an ancestor of dest: the rename would cycle. */
		if (dirent.inode == src->nn_ino) {
			error = EINVAL;
			break;
		}
		if (dirent.inode == rootino)
			break;
		/*
		 * NOTE(review): vp->v_mount is read after vput(vp) drops
		 * our reference — confirm the mount reference is pinned
		 * by the caller across this walk.
		 */
		vput(vp);
		if ((error = VFS_VGET(vp->v_mount, dirent.inode,
		    LK_EXCLUSIVE, &vp)) != 0) {
			vp = NULL;
			break;
		}
	}

out:
	if (error == ENOTDIR)
		printf("checkpath: .. not a directory\n");
	if (vp != NULL)
		vput(vp);
	return (error);
}
+
/*
 * VOP_RENAME: move fvp (named fcnp, in fdvp) to tcnp in tdvp,
 * replacing tvp if it exists.  This follows the classic 4.4BSD/UFS
 * rename sequence: bump the source link count, enter (or rewrite) the
 * target name, then relookup and remove the source name.  Statement
 * and locking order below is load-bearing; see the UFS equivalent.
 */
static int
nandfs_rename(struct vop_rename_args *ap)
{
	struct vnode *tvp = ap->a_tvp;
	struct vnode *tdvp = ap->a_tdvp;
	struct vnode *fvp = ap->a_fvp;
	struct vnode *fdvp = ap->a_fdvp;
	struct componentname *tcnp = ap->a_tcnp;
	struct componentname *fcnp = ap->a_fcnp;
	int doingdirectory = 0, oldparent = 0, newparent = 0;
	int error = 0;

	struct nandfs_node *fdnode, *fnode, *fnode1;
	struct nandfs_node *tdnode = VTON(tdvp);
	struct nandfs_node *tnode;

	uint32_t tdflags, fflags, fdflags;
	uint16_t mode;

	DPRINTF(VNCALL, ("%s: fdvp:%p fvp:%p tdvp:%p tdp:%p\n", __func__, fdvp,
	    fvp, tdvp, tvp));

	/*
	 * Check for cross-device rename.
	 */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		error = EXDEV;
abortit:
		/* Common abort path: release every vnode we were handed. */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		vrele(fdvp);
		vrele(fvp);
		return (error);
	}

	/* Immutable/append-only targets (or target dir) block the rename. */
	tdflags = tdnode->nn_inode.i_flags;
	if (tvp &&
	    ((VTON(tvp)->nn_inode.i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (tdflags & APPEND))) {
		error = EPERM;
		goto abortit;
	}

	/*
	 * Renaming a file to itself has no effect.  The upper layers should
	 * not call us in that case.  Temporarily just warn if they do.
	 */
	if (fvp == tvp) {
		printf("nandfs_rename: fvp == tvp (can't happen)\n");
		error = 0;
		goto abortit;
	}

	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
		goto abortit;

	fdnode = VTON(fdvp);
	fnode = VTON(fvp);

	if (fnode->nn_inode.i_links_count >= LINK_MAX) {
		VOP_UNLOCK(fvp, 0);
		error = EMLINK;
		goto abortit;
	}

	fflags = fnode->nn_inode.i_flags;
	fdflags = fdnode->nn_inode.i_flags;

	if ((fflags & (NOUNLINK | IMMUTABLE | APPEND)) ||
	    (fdflags & APPEND)) {
		VOP_UNLOCK(fvp, 0);
		error = EPERM;
		goto abortit;
	}

	mode = fnode->nn_inode.i_mode;
	if ((mode & S_IFMT) == S_IFDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */

		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
		    (fdvp == fvp) ||
		    ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) ||
		    (fnode->nn_flags & IN_RENAME)) {
			VOP_UNLOCK(fvp, 0);
			error = EINVAL;
			goto abortit;
		}
		/* IN_RENAME guards the directory against concurrent renames. */
		fnode->nn_flags |= IN_RENAME;
		doingdirectory = 1;
		DPRINTF(VNCALL, ("%s: doingdirectory dvp %p\n", __func__,
		    tdvp));
		oldparent = fdnode->nn_ino;
	}

	vrele(fdvp);

	tnode = NULL;
	if (tvp)
		tnode = VTON(tvp);

	/*
	 * Bump link count on fvp while we are moving stuff around. If we
	 * crash before completing the work, the link count may be wrong
	 * but correctable.
	 */
	fnode->nn_inode.i_links_count++;

	/* Check for in path moving XXX */
	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
	VOP_UNLOCK(fvp, 0);
	if (oldparent != tdnode->nn_ino)
		newparent = tdnode->nn_ino;
	if (doingdirectory && newparent) {
		if (error)	/* write access check above */
			goto bad;
		if (tnode != NULL)
			vput(tvp);

		/* Make sure fvp is not an ancestor of the new parent. */
		error = nandfs_checkpath(fnode, tdnode, tcnp->cn_cred);
		if (error)
			goto out;

		/* checkpath dropped locks; the target must be re-found. */
		VREF(tdvp);
		error = relookup(tdvp, &tvp, tcnp);
		if (error)
			goto out;
		vrele(tdvp);
		tdnode = VTON(tdvp);
		tnode = NULL;
		if (tvp)
			tnode = VTON(tvp);
	}

	/*
	 * If the target doesn't exist, link the target to the source and
	 * unlink the source. Otherwise, rewrite the target directory to
	 * reference the source and remove the original entry.
	 */

	if (tvp == NULL) {
		/*
		 * Account for ".." in new directory.
		 */
		if (doingdirectory && fdvp != tdvp)
			tdnode->nn_inode.i_links_count++;

		DPRINTF(VNCALL, ("%s: new entry in dvp:%p\n", __func__, tdvp));
		/*
		 * Add name in new directory.
		 */
		error = nandfs_add_dirent(tdvp, fnode->nn_ino, tcnp->cn_nameptr,
		    tcnp->cn_namelen, IFTODT(fnode->nn_inode.i_mode));
		if (error) {
			if (doingdirectory && fdvp != tdvp)
				tdnode->nn_inode.i_links_count--;
			goto bad;
		}

		vput(tdvp);
	} else {
		/*
		 * If the parent directory is "sticky", then the user must
		 * own the parent directory, or the destination of the rename,
		 * otherwise the destination may not be changed (except by
		 * root). This implements append-only directories.
		 */
		if ((tdnode->nn_inode.i_mode & S_ISTXT) &&
		    tcnp->cn_cred->cr_uid != 0 &&
		    tcnp->cn_cred->cr_uid != tdnode->nn_inode.i_uid &&
		    tnode->nn_inode.i_uid != tcnp->cn_cred->cr_uid) {
			error = EPERM;
			goto bad;
		}
		/*
		 * Target must be empty if a directory and have no links
		 * to it. Also, ensure source and target are compatible
		 * (both directories, or both not directories).
		 */
		mode = tnode->nn_inode.i_mode;
		if ((mode & S_IFMT) == S_IFDIR) {
			if (!nandfs_dirempty(tvp, tdnode->nn_ino,
			    tcnp->cn_cred)) {
				error = ENOTEMPTY;
				goto bad;
			}
			if (!doingdirectory) {
				error = ENOTDIR;
				goto bad;
			}
			/*
			 * Update name cache since directory is going away.
			 */
			cache_purge(tdvp);
		} else if (doingdirectory) {
			error = EISDIR;
			goto bad;
		}

		DPRINTF(VNCALL, ("%s: update entry dvp:%p\n", __func__, tdvp));
		/*
		 * Change name tcnp in tdvp to point at fvp.
		 */
		error = nandfs_update_dirent(tdvp, fnode, tnode);
		if (error)
			goto bad;

		/* Replacing a directory drops tdvp's ".." back-link. */
		if (doingdirectory && !newparent)
			tdnode->nn_inode.i_links_count--;

		vput(tdvp);

		tnode->nn_inode.i_links_count--;
		vput(tvp);
		tnode = NULL;
	}

	/*
	 * Unlink the source.
	 */
	fcnp->cn_flags &= ~MODMASK;
	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
	VREF(fdvp);
	error = relookup(fdvp, &fvp, fcnp);
	/*
	 * NOTE(review): the extra fdvp reference is only dropped when
	 * relookup succeeds — confirm the failure path does not leak it.
	 */
	if (error == 0)
		vrele(fdvp);
	if (fvp != NULL) {
		fnode1 = VTON(fvp);
		fdnode = VTON(fdvp);
	} else {
		/*
		 * From name has disappeared.
		 */
		if (doingdirectory)
			panic("nandfs_rename: lost dir entry");
		vrele(ap->a_fvp);
		return (0);
	}

	DPRINTF(VNCALL, ("%s: unlink source fnode:%p\n", __func__, fnode));

	/*
	 * Ensure that the directory entry still exists and has not
	 * changed while the new name has been entered. If the source is
	 * a file then the entry may have been unlinked or renamed. In
	 * either case there is no further work to be done. If the source
	 * is a directory then it cannot have been rmdir'ed; its link
	 * count of three would cause a rmdir to fail with ENOTEMPTY.
	 * The IN_RENAME flag ensures that it cannot be moved by another
	 * rename.
	 */
	if (fnode != fnode1) {
		if (doingdirectory)
			panic("nandfs: lost dir entry");
	} else {
		/*
		 * If the source is a directory with a
		 * new parent, the link count of the old
		 * parent directory must be decremented
		 * and ".." set to point to the new parent.
		 */
		if (doingdirectory && newparent) {
			DPRINTF(VNCALL, ("%s: new parent %#jx -> %#jx\n",
			    __func__, (uintmax_t) oldparent,
			    (uintmax_t) newparent));
			error = nandfs_update_parent_dir(fvp, newparent);
			if (!error) {
				fdnode->nn_inode.i_links_count--;
				fdnode->nn_flags |= IN_CHANGE;
			}
		}
		error = nandfs_remove_dirent(fdvp, fnode, fcnp);
		if (!error) {
			fnode->nn_inode.i_links_count--;
			fnode->nn_flags |= IN_CHANGE;
		}
		fnode->nn_flags &= ~IN_RENAME;
	}
	if (fdnode)
		vput(fdvp);
	if (fnode)
		vput(fvp);
	vrele(ap->a_fvp);
	return (error);

bad:
	/* Error after the source link count was bumped: undo and unwind. */
	DPRINTF(VNCALL, ("%s: error:%d\n", __func__, error));
	if (tnode)
		vput(NTOV(tnode));
	vput(NTOV(tdnode));
out:
	if (doingdirectory)
		fnode->nn_flags &= ~IN_RENAME;
	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
		fnode->nn_inode.i_links_count--;
		fnode->nn_flags |= IN_CHANGE;
		fnode->nn_flags &= ~IN_RENAME;
		vput(fvp);
	} else
		vrele(fvp);
	return (error);
}
+
+static int
+nandfs_mkdir(struct vop_mkdir_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfs_inode *dir_inode = &dir_node->nn_inode;
+	struct nandfs_node *node;
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	int error;
+
+	DPRINTF(VNCALL, ("%s: dvp %p\n", __func__, dvp));
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	if (dir_inode->i_links_count >= LINK_MAX)
+		return (EMLINK);
+
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	*vpp = NTOV(node);
+
+	error = nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode));
+	if (error) {
+		vput(*vpp);
+		return (error);
+	}
+
+	dir_node->nn_inode.i_links_count++;
+	dir_node->nn_flags |= IN_CHANGE;
+
+	error = nandfs_init_dir(NTOV(node), node->nn_ino, dir_node->nn_ino);
+	if (error) {
+		vput(NTOV(node));
+		return (error);
+	}
+
+	DPRINTF(VNCALL, ("created dir vp %p nandnode %p ino %jx\n", *vpp, node,
+	    (uintmax_t)node->nn_ino));
+	return (0);
+}
+
+static int
+nandfs_mknod(struct vop_mknod_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct vnode **vpp = ap->a_vpp;
+	struct vattr *vap = ap->a_vap;
+	uint16_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int error;
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	error = nandfs_node_create(nmp, &node, mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+	if (vap->va_rdev != VNOVAL)
+		node->nn_inode.i_special = vap->va_rdev;
+
+	*vpp = NTOV(node);
+
+	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode))) {
+		vput(*vpp);
+		return (ENOTDIR);
+	}
+
+	node->nn_flags |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+
+	return (0);
+}
+
+static int
+nandfs_symlink(struct vop_symlink_args *ap)
+{
+	struct vnode **vpp = ap->a_vpp;
+	struct vnode *dvp = ap->a_dvp;
+	uint16_t mode = MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode);
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *dir_node = VTON(dvp);
+	struct nandfsmount *nmp = dir_node->nn_nmp;
+	struct nandfs_node *node;
+	int len, error;
+
+	if (nandfs_fs_full(dir_node->nn_nandfsdev))
+		return (ENOSPC);
+
+	error = nandfs_node_create(nmp, &node, S_IFLNK | mode);
+	if (error)
+		return (error);
+	node->nn_inode.i_gid = dir_node->nn_inode.i_gid;
+	node->nn_inode.i_uid = cnp->cn_cred->cr_uid;
+
+	*vpp = NTOV(node);
+
+	if (nandfs_add_dirent(dvp, node->nn_ino, cnp->cn_nameptr,
+	    cnp->cn_namelen, IFTODT(mode))) {
+		vput(*vpp);
+		return (ENOTDIR);
+	}
+
+
+	len = strlen(ap->a_target);
+	error = vn_rdwr(UIO_WRITE, *vpp, ap->a_target, len, (off_t)0,
+	    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
+	    cnp->cn_cred, NOCRED, NULL, NULL);
+	if (error)
+		vput(*vpp);
+
+	return (error);
+}
+
+static int
+nandfs_readlink(struct vop_readlink_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+
+	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
+}
+
+static int
+nandfs_rmdir(struct vop_rmdir_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vnode *dvp = ap->a_dvp;
+	struct componentname *cnp = ap->a_cnp;
+	struct nandfs_node *node, *dnode;
+	uint32_t dflag, flag;
+	int error = 0;
+
+	node = VTON(vp);
+	dnode = VTON(dvp);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	if ((node->nn_inode.i_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
+	    (dnode->nn_inode.i_flags & APPEND))
+		return (EPERM);
+
+	DPRINTF(VNCALL, ("%s: dvp %p vp %p nandnode %p ino %#jx\n", __func__,
+	    dvp, vp, node, (uintmax_t)node->nn_ino));
+
+	if (node->nn_inode.i_links_count < 2)
+		return (EINVAL);
+
+	if (!nandfs_dirempty(vp, dnode->nn_ino, cnp->cn_cred))
+		return (ENOTEMPTY);
+
+	/* Files marked as immutable or append-only cannot be deleted. */
+	dflag = dnode->nn_inode.i_flags;
+	flag = node->nn_inode.i_flags;
+	if ((dflag & APPEND) ||
+	    (flag & (NOUNLINK | IMMUTABLE | APPEND))) {
+		return (EPERM);
+	}
+
+	if (vp->v_mountedhere != 0)
+		return (EINVAL);
+
+	nandfs_remove_dirent(dvp, node, cnp);
+	dnode->nn_inode.i_links_count -= 1;
+	dnode->nn_flags |= IN_CHANGE;
+
+	cache_purge(dvp);
+
+	error = nandfs_truncate(vp, (uint64_t)0);
+	if (error)
+		return (error);
+
+	node->nn_inode.i_links_count -= 2;
+	node->nn_flags |= IN_CHANGE;
+
+	cache_purge(vp);
+
+	return (error);
+}
+
+static int
+nandfs_fsync(struct vop_fsync_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int locked;
+
+	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
+	    node, (uintmax_t)node->nn_ino));
+
+	/*
+	 * Start syncing vnode only if inode was modified or
+	 * there are some dirty buffers
+	 */
+	if (VTON(vp)->nn_flags & IN_MODIFIED ||
+	    vp->v_bufobj.bo_dirty.bv_cnt) {
+		locked = VOP_ISLOCKED(vp);
+		VOP_UNLOCK(vp, 0);
+		nandfs_wakeup_wait_sync(node->nn_nandfsdev, SYNCER_FSYNC);
+		VOP_LOCK(vp, locked | LK_RETRY);
+	}
+
+	return (0);
+}
+
static int
nandfs_bmap(struct vop_bmap_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *nnode = VTON(vp);
	struct nandfs_device *nandfsdev = nnode->nn_nandfsdev;
	nandfs_daddr_t l2vmap, v2pmap;
	int error;
	/* Conversion factor from filesystem blocks to DEV_BSIZE sectors. */
	int blk2dev = nandfsdev->nd_blocksize / DEV_BSIZE;

	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx\n", __func__, vp,
	    nnode, (uintmax_t)nnode->nn_ino));

	if (ap->a_bop != NULL)
		*ap->a_bop = &nandfsdev->nd_devvp->v_bufobj;
	if (ap->a_bnp == NULL)
		return (0);
	/* No read-ahead/read-behind clustering is offered. */
	if (ap->a_runp != NULL)
		*ap->a_runp = 0;
	if (ap->a_runb != NULL)
		*ap->a_runb = 0;

	/*
	 * Translate all the block sectors into a series of buffers to read
	 * asynchronously from the nandfs device. Note that this lookup may
	 * induce readin's too.
	 */

	/* Get virtual block numbers for the vnode's buffer span */
	error = nandfs_bmap_lookup(nnode, ap->a_bn, &l2vmap);
	if (error)
		/* NOTE(review): -1 is not an errno; confirm callers only
		 * test for nonzero here. */
		return (-1);

	/* Translate virtual block numbers to physical block numbers */
	error = nandfs_vtop(nnode, l2vmap, &v2pmap);
	if (error)
		return (-1);

	/* Note virtual block 0 marks not mapped */
	if (l2vmap == 0)
		*ap->a_bnp = -1;
	else
		*ap->a_bnp = v2pmap * blk2dev;	/* in DEV_BSIZE */

	DPRINTF(VNCALL, ("%s: vp %p nandnode %p ino %#jx lblk %jx -> blk %jx\n",
	    __func__, vp, nnode, (uintmax_t)nnode->nn_ino, (uintmax_t)ap->a_bn,
	    (uintmax_t)*ap->a_bnp ));

	return (0);
}
+
/*
 * Request a forced run of the nandfs syncer and block until it has
 * completed.  The flag must be set before the wakeup is issued.
 */
static void
nandfs_force_syncer(struct nandfsmount *nmp)
{

	nmp->nm_flags |= NANDFS_FORCE_SYNCER;
	nandfs_wakeup_wait_sync(nmp->nm_nandfsdev, SYNCER_FFORCE);
}
+
/*
 * VOP_IOCTL: nandfs management interface (checkpoint/snapshot control,
 * segment and usage statistics).  Requires PRIV_VFS_MOUNT; on read-only
 * mounts only the informational ioctls are allowed.
 */
static int
nandfs_ioctl(struct vop_ioctl_args *ap)
{
	struct vnode *vp = ap->a_vp;
	u_long command = ap->a_command;
	caddr_t data = ap->a_data;
	struct nandfs_node *node = VTON(vp);
	struct nandfs_device *nandfsdev = node->nn_nandfsdev;
	struct nandfsmount *nmp = node->nn_nmp;
	uint64_t *tab, *cno;
	struct nandfs_seg_stat *nss;
	struct nandfs_cpmode *ncpm;
	struct nandfs_argv *nargv;
	struct nandfs_cpstat *ncp;
	int error;

	DPRINTF(VNCALL, ("%s: %x\n", __func__, (uint32_t)command));

	error = priv_check(ap->a_td, PRIV_VFS_MOUNT);
	if (error)
		return (error);

	/* On a read-only mount, permit only the GET_* queries. */
	if (nmp->nm_ronly) {
		switch (command) {
		case NANDFS_IOCTL_GET_FSINFO:
		case NANDFS_IOCTL_GET_SUSTAT:
		case NANDFS_IOCTL_GET_CPINFO:
		case NANDFS_IOCTL_GET_CPSTAT:
		case NANDFS_IOCTL_GET_SUINFO:
		case NANDFS_IOCTL_GET_VINFO:
		case NANDFS_IOCTL_GET_BDESCS:
			break;
		default:
			return (EROFS);
		}
	}

	switch (command) {
	case NANDFS_IOCTL_GET_FSINFO:
		error = nandfs_get_fsinfo(nmp, (struct nandfs_fsinfo *)data);
		break;
	case NANDFS_IOCTL_GET_SUSTAT:
		nss = (struct nandfs_seg_stat *)data;
		error = nandfs_get_seg_stat(nandfsdev, nss);
		break;
	case NANDFS_IOCTL_CHANGE_CPMODE:
		/* Toggle a checkpoint between plain and snapshot mode. */
		ncpm = (struct nandfs_cpmode *)data;
		error = nandfs_chng_cpmode(nandfsdev->nd_cp_node, ncpm);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_GET_CPINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_cpinfo_ioctl(nandfsdev->nd_cp_node, nargv);
		break;
	case NANDFS_IOCTL_DELETE_CP:
		/* data is a pair of checkpoint numbers: [start, end]. */
		tab = (uint64_t *)data;
		error = nandfs_delete_cp(nandfsdev->nd_cp_node, tab[0], tab[1]);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_GET_CPSTAT:
		ncp = (struct nandfs_cpstat *)data;
		error = nandfs_get_cpstat(nandfsdev->nd_cp_node, ncp);
		break;
	case NANDFS_IOCTL_GET_SUINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_segment_info_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_GET_VINFO:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_dat_vinfo_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_GET_BDESCS:
		nargv = (struct nandfs_argv *)data;
		error = nandfs_get_dat_bdescs_ioctl(nandfsdev, nargv);
		break;
	case NANDFS_IOCTL_SYNC:
		/* Force a sync and return the resulting checkpoint number. */
		cno = (uint64_t *)data;
		nandfs_force_syncer(nmp);
		*cno = nandfsdev->nd_last_cno;
		error = 0;
		break;
	case NANDFS_IOCTL_MAKE_SNAP:
		cno = (uint64_t *)data;
		error = nandfs_make_snap(nandfsdev, cno);
		nandfs_force_syncer(nmp);
		break;
	case NANDFS_IOCTL_DELETE_SNAP:
		cno = (uint64_t *)data;
		error = nandfs_delete_snap(nandfsdev, *cno);
		nandfs_force_syncer(nmp);
		break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
+
+/*
+ * Whiteout vnode call
+ */
+static int
+nandfs_whiteout(struct vop_whiteout_args *ap)
+{
+	struct vnode *dvp = ap->a_dvp;
+	struct componentname *cnp = ap->a_cnp;
+	int error = 0;
+
+	switch (ap->a_flags) {
+	case LOOKUP:
+		return (0);
+	case CREATE:
+		/* Create a new directory whiteout */
+#ifdef INVARIANTS
+		if ((cnp->cn_flags & SAVENAME) == 0)
+			panic("ufs_whiteout: missing name");
+#endif
+		error = nandfs_add_dirent(dvp, NANDFS_WHT_INO, cnp->cn_nameptr,
+		    cnp->cn_namelen, DT_WHT);
+		break;
+
+	case DELETE:
+		/* Remove an existing directory whiteout */
+		cnp->cn_flags &= ~DOWHITEOUT;
+		error = nandfs_remove_dirent(dvp, NULL, cnp);
+		break;
+	default:
+		panic("nandf_whiteout: unknown op: %d", ap->a_flags);
+	}
+
+	return (error);
+}
+
+static int
+nandfs_pathconf(struct vop_pathconf_args *ap)
+{
+	int error;
+
+	error = 0;
+	switch (ap->a_name) {
+	case _PC_LINK_MAX:
+		*ap->a_retval = LINK_MAX;
+		break;
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		break;
+	case _PC_PATH_MAX:
+		*ap->a_retval = PATH_MAX;
+		break;
+	case _PC_PIPE_BUF:
+		*ap->a_retval = PIPE_BUF;
+		break;
+	case _PC_CHOWN_RESTRICTED:
+		*ap->a_retval = 1;
+		break;
+	case _PC_NO_TRUNC:
+		*ap->a_retval = 1;
+		break;
+	case _PC_ACL_EXTENDED:
+		*ap->a_retval = 0;
+		break;
+	case _PC_ALLOC_SIZE_MIN:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
+		break;
+	case _PC_FILESIZEBITS:
+		*ap->a_retval = 64;
+		break;
+	case _PC_REC_INCR_XFER_SIZE:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
+		break;
+	case _PC_REC_MAX_XFER_SIZE:
+		*ap->a_retval = -1; /* means ``unlimited'' */
+		break;
+	case _PC_REC_MIN_XFER_SIZE:
+		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	return (error);
+}
+
/*
 * VOP_LOCK1: acquire the device-wide nandfs write lock before taking
 * the normal vnode lock, so every locked vnode also holds the
 * filesystem lock.  Paired with nandfs_vnunlock().
 */
static int
nandfs_vnlock1(struct vop_lock1_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *node = VTON(vp);
	int error, vi_locked;

	/*
	 * XXX can vnode go away while we are sleeping?
	 */
	/* Drop the interlock while sleeping on the device lock. */
	vi_locked = mtx_owned(&vp->v_interlock);
	if (vi_locked)
		VI_UNLOCK(vp);
	error = NANDFS_WRITELOCKFLAGS(node->nn_nandfsdev,
	    ap->a_flags & LK_NOWAIT);
	if (vi_locked && !error)
		VI_LOCK(vp);
	if (error)
		return (error);

	error = vop_stdlock(ap);
	if (error) {
		/* Vnode lock failed: give back the device lock too. */
		NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
		return (error);
	}

	return (0);
}
+
+static int
+nandfs_vnunlock(struct vop_unlock_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct nandfs_node *node = VTON(vp);
+	int error;
+
+	error = vop_stdunlock(ap);
+	if (error)
+		return (error);
+
+	NANDFS_WRITEUNLOCK(node->nn_nandfsdev);
+
+	return (0);
+}
+
+/*
+ * Global vfs data structures
+ */
/*
 * Vnode operations for regular nandfs files and directories.  Locking
 * is overridden (vop_lock1/vop_unlock) so every vnode lock also takes
 * the device-wide nandfs write lock.
 */
struct vop_vector nandfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_access =		nandfs_access,
	.vop_advlock =		nandfs_advlock,
	.vop_bmap =		nandfs_bmap,
	.vop_close =		nandfs_close,
	.vop_create =		nandfs_create,
	.vop_fsync =		nandfs_fsync,
	.vop_getattr =		nandfs_getattr,
	.vop_inactive =		nandfs_inactive,
	.vop_cachedlookup =	nandfs_lookup,
	.vop_ioctl =		nandfs_ioctl,
	.vop_link =		nandfs_link,
	.vop_lookup =		vfs_cache_lookup,
	.vop_mkdir =		nandfs_mkdir,
	.vop_mknod =		nandfs_mknod,
	.vop_open =		nandfs_open,
	.vop_pathconf =		nandfs_pathconf,
	.vop_print =		nandfs_print,
	.vop_read =		nandfs_read,
	.vop_readdir =		nandfs_readdir,
	.vop_readlink =		nandfs_readlink,
	.vop_reclaim =		nandfs_reclaim,
	.vop_remove =		nandfs_remove,
	.vop_rename =		nandfs_rename,
	.vop_rmdir =		nandfs_rmdir,
	.vop_whiteout =		nandfs_whiteout,
	.vop_write =		nandfs_write,
	.vop_setattr =		nandfs_setattr,
	.vop_strategy =		nandfs_strategy,
	.vop_symlink =		nandfs_symlink,
	.vop_lock1 =		nandfs_vnlock1,
	.vop_unlock =		nandfs_vnunlock,
};
+
/*
 * Vnode operations for nandfs internal (system) vnodes.  Only buffer
 * and lifecycle operations are supported; all name-space and attribute
 * operations panic, as they must never be invoked on these vnodes.
 */
struct vop_vector nandfs_system_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_close =		nandfs_close,
	.vop_inactive =		nandfs_inactive,
	.vop_reclaim =		nandfs_reclaim,
	.vop_strategy =		nandfs_strategy,
	.vop_fsync =		nandfs_fsync,
	.vop_bmap =		nandfs_bmap,
	.vop_access =		VOP_PANIC,
	.vop_advlock =		VOP_PANIC,
	.vop_create =		VOP_PANIC,
	.vop_getattr =		VOP_PANIC,
	.vop_cachedlookup =	VOP_PANIC,
	.vop_ioctl =		VOP_PANIC,
	.vop_link =		VOP_PANIC,
	.vop_lookup =		VOP_PANIC,
	.vop_mkdir =		VOP_PANIC,
	.vop_mknod =		VOP_PANIC,
	.vop_open =		VOP_PANIC,
	.vop_pathconf =		VOP_PANIC,
	.vop_print =		VOP_PANIC,
	.vop_read =		VOP_PANIC,
	.vop_readdir =		VOP_PANIC,
	.vop_readlink =		VOP_PANIC,
	.vop_remove =		VOP_PANIC,
	.vop_rename =		VOP_PANIC,
	.vop_rmdir =		VOP_PANIC,
	.vop_whiteout =		VOP_PANIC,
	.vop_write =		VOP_PANIC,
	.vop_setattr =		VOP_PANIC,
	.vop_symlink =		VOP_PANIC,
};
+
/*
 * VOP_CLOSE for nandfs FIFOs: update timestamps if others still hold
 * the vnode open, then let fifo_specops do the actual FIFO close.
 */
static int
nandfsfifo_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct nandfs_node *node = VTON(vp);

	DPRINTF(VNCALL, ("%s: vp %p node %p\n", __func__, vp, node));

	/* Interlock protects v_usecount while we test and stamp times. */
	mtx_lock(&vp->v_interlock);
	if (vp->v_usecount > 1)
		nandfs_itimes_locked(vp);
	mtx_unlock(&vp->v_interlock);

	return (fifo_specops.vop_close(ap));
}
+
/*
 * Vnode operations for nandfs FIFOs: defer to fifo_specops for the
 * FIFO machinery, keep nandfs attribute and lifecycle handlers.
 */
struct vop_vector nandfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		VOP_PANIC,
	.vop_access =		nandfs_access,
	.vop_close =		nandfsfifo_close,
	.vop_getattr =		nandfs_getattr,
	.vop_inactive =		nandfs_inactive,
	.vop_print =		nandfs_print,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		nandfs_reclaim,
	.vop_setattr =		nandfs_setattr,
	.vop_write =		VOP_PANIC,
	.vop_lock1 =		nandfs_vnlock1,
	.vop_unlock =		nandfs_vnunlock,
};
+
+int
+nandfs_vinit(struct vnode *vp, uint64_t ino)
+{
+	struct nandfs_node *node;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+
+	node = VTON(vp);
+
+	/* Check if we're fetching the root */
+	if (ino == NANDFS_ROOT_INO)
+		vp->v_vflag |= VV_ROOT;
+
+	if (ino != NANDFS_GC_INO)
+		vp->v_type = IFTOVT(node->nn_inode.i_mode);
+	else
+		vp->v_type = VREG;
+
+	if (vp->v_type == VFIFO)
+		vp->v_op = &nandfs_fifoops;
+
+	return (0);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfs/nfs_commonacl.c
--- a/head/sys/fs/nfs/nfs_commonacl.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfs/nfs_commonacl.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfs/nfs_commonacl.c 224086 2011-07-16 08:51:09Z zack $");
+__FBSDID("$FreeBSD: head/sys/fs/nfs/nfs_commonacl.c 235568 2012-05-17 21:52:17Z rmacklem $");
 
 #ifndef APPLEKEXT
 #include <fs/nfs/nfsport.h>
@@ -468,9 +468,7 @@
 		error = NFSERR_ATTRNOTSUPP;
 		goto out;
 	}
-	error = VOP_ACLCHECK(vp, ACL_TYPE_NFS4, aclp, cred, p);
-	if (!error)
-		error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
+	error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
 
 out:
 	NFSEXITCODE(error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clbio.c
--- a/head/sys/fs/nfsclient/nfs_clbio.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clbio.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clbio.c 233101 2012-03-17 23:03:20Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clbio.c 237987 2012-07-02 09:53:08Z kib $");
 
 #include "opt_kdtrace.h"
 
@@ -281,7 +281,11 @@
 	vp = ap->a_vp;
 	np = VTONFS(vp);
 	td = curthread;				/* XXX */
-	cred = curthread->td_ucred;		/* XXX */
+	/* Set the cred to n_writecred for the write rpcs. */
+	if (np->n_writecred != NULL)
+		cred = crhold(np->n_writecred);
+	else
+		cred = crhold(curthread->td_ucred);	/* XXX */
 	nmp = VFSTONFS(vp->v_mount);
 	pages = ap->a_m;
 	count = ap->a_count;
@@ -345,6 +349,7 @@
 	    iomode = NFSWRITE_FILESYNC;
 
 	error = ncl_writerpc(vp, &uio, cred, &iomode, &must_commit, 0);
+	crfree(cred);
 
 	pmap_qremove(kva, npages);
 	relpbuf(bp, &ncl_pbuf_freecnt);
@@ -717,7 +722,7 @@
 	    };
 
 	    if (n > 0) {
-		    error = uiomove(bp->b_data + on, (int)n, uio);
+		    error = vn_io_fault_uiomove(bp->b_data + on, (int)n, uio);
 	    }
 	    if (vp->v_type == VLNK)
 		n = 0;
@@ -892,8 +897,9 @@
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	daddr_t lbn;
 	int bcount;
-	int n, on, error = 0;
-	off_t tmp_off;
+	int bp_cached, n, on, error = 0, error1;
+	size_t orig_resid, local_resid;
+	off_t orig_size, tmp_off;
 
 	KASSERT(uio->uio_rw == UIO_WRITE, ("ncl_write mode"));
 	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
@@ -945,6 +951,11 @@
 			mtx_unlock(&np->n_mtx);
 	}
 
+	orig_resid = uio->uio_resid;
+	mtx_lock(&np->n_mtx);
+	orig_size = np->n_size;
+	mtx_unlock(&np->n_mtx);
+
 	/*
 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
 	 * get the append lock.
@@ -1122,7 +1133,10 @@
 		 * normally.
 		 */
 
+		bp_cached = 1;
 		if (on == 0 && n == bcount) {
+			if ((bp->b_flags & B_CACHE) == 0)
+				bp_cached = 0;
 			bp->b_flags |= B_CACHE;
 			bp->b_flags &= ~B_INVAL;
 			bp->b_ioflags &= ~BIO_ERROR;
@@ -1173,7 +1187,7 @@
 		 * significant cache coherency problems with multiple clients,
 		 * especially if locking is implemented later on.
 		 *
-		 * as an optimization we could theoretically maintain
+		 * As an optimization we could theoretically maintain
 		 * a linked list of discontinuous areas, but we would still
 		 * have to commit them separately so there isn't much
 		 * advantage to it except perhaps a bit of asynchronization.
@@ -1188,7 +1202,23 @@
 			goto again;
 		}
 
-		error = uiomove((char *)bp->b_data + on, n, uio);
+		local_resid = uio->uio_resid;
+		error = vn_io_fault_uiomove((char *)bp->b_data + on, n, uio);
+
+		if (error != 0 && !bp_cached) {
+			/*
+			 * This block has no other content than what
+			 * possibly was written by the faulty uiomove.
+			 * Release it, forgetting the data pages, to
+			 * prevent the leak of uninitialized data to
+			 * usermode.
+			 */
+			bp->b_ioflags |= BIO_ERROR;
+			brelse(bp);
+			uio->uio_offset -= local_resid - uio->uio_resid;
+			uio->uio_resid = local_resid;
+			break;
+		}
 
 		/*
 		 * Since this block is being modified, it must be written
@@ -1198,17 +1228,18 @@
 		 */
 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 
-		if (error) {
-			bp->b_ioflags |= BIO_ERROR;
-			brelse(bp);
-			break;
-		}
+		/*
+		 * Get the partial update on the progress made from
+		 * uiomove, if an error occurred.
+		 */
+		if (error != 0)
+			n = local_resid - uio->uio_resid;
 
 		/*
 		 * Only update dirtyoff/dirtyend if not a degenerate
 		 * condition.
 		 */
-		if (n) {
+		if (n > 0) {
 			if (bp->b_dirtyend > 0) {
 				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
 				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
@@ -1228,17 +1259,34 @@
 		if ((ioflag & IO_SYNC)) {
 			if (ioflag & IO_INVAL)
 				bp->b_flags |= B_NOCACHE;
-			error = bwrite(bp);
-			if (error)
+			error1 = bwrite(bp);
+			if (error1 != 0) {
+				if (error == 0)
+					error = error1;
 				break;
+			}
 		} else if ((n + on) == biosize) {
 			bp->b_flags |= B_ASYNC;
 			(void) ncl_writebp(bp, 0, NULL);
 		} else {
 			bdwrite(bp);
 		}
+
+		if (error != 0)
+			break;
 	} while (uio->uio_resid > 0 && n > 0);
 
+	if (error != 0) {
+		if (ioflag & IO_UNIT) {
+			VATTR_NULL(&vattr);
+			vattr.va_size = orig_size;
+			/* IO_SYNC is handled implicitly */
+			(void)VOP_SETATTR(vp, &vattr, cred);
+			uio->uio_offset -= orig_resid - uio->uio_resid;
+			uio->uio_resid = orig_resid;
+		}
+	}
+
 	return (error);
 }
 
@@ -1817,7 +1865,7 @@
 		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
 		 * buffer that now needs to be truncated.
 		 */
-		error = vtruncbuf(vp, cred, td, nsize, biosize);
+		error = vtruncbuf(vp, cred, nsize, biosize);
 		lbn = nsize / biosize;
 		bufsize = nsize & (biosize - 1);
 		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clnode.c
--- a/head/sys/fs/nfsclient/nfs_clnode.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clnode.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clnode.c 230605 2012-01-27 02:46:12Z rmacklem $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clnode.c 237244 2012-06-18 22:17:28Z rmacklem $");
 
 #include "opt_kdtrace.h"
 
@@ -210,18 +210,28 @@
 	struct nfsnode *np;
 	struct sillyrename *sp;
 	struct vnode *vp = ap->a_vp;
+	boolean_t retv;
 
 	np = VTONFS(vp);
 
 	if (NFS_ISV4(vp) && vp->v_type == VREG) {
 		/*
 		 * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4
-		 * Close operations are delayed until now. Any dirty buffers
-		 * must be flushed before the close, so that the stateid is
-		 * available for the writes.
+		 * Close operations are delayed until now. Any dirty
+		 * buffers/pages must be flushed before the close, so that the
+		 * stateid is available for the writes.
 		 */
-		(void) ncl_flush(vp, MNT_WAIT, NULL, ap->a_td, 1, 0);
-		(void) nfsrpc_close(vp, 1, ap->a_td);
+		if (vp->v_object != NULL) {
+			VM_OBJECT_LOCK(vp->v_object);
+			retv = vm_object_page_clean(vp->v_object, 0, 0,
+			    OBJPC_SYNC);
+			VM_OBJECT_UNLOCK(vp->v_object);
+		} else
+			retv = TRUE;
+		if (retv == TRUE) {
+			(void)ncl_flush(vp, MNT_WAIT, NULL, ap->a_td, 1, 0);
+			(void)nfsrpc_close(vp, 1, ap->a_td);
+		}
 	}
 
 	mtx_lock(&np->n_mtx);
@@ -257,15 +267,6 @@
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsdmap *dp, *dp2;
 
-	if (NFS_ISV4(vp) && vp->v_type == VREG)
-		/*
-		 * Since mmap()'d files do I/O after VOP_CLOSE(), the NFSv4
-		 * Close operations are delayed until ncl_inactive().
-		 * However, since VOP_INACTIVE() is not guaranteed to be
-		 * called, we need to do it again here.
-		 */
-		(void) nfsrpc_close(vp, 1, ap->a_td);
-
 	/*
 	 * If the NLM is running, give it a chance to abort pending
 	 * locks.
@@ -278,6 +279,15 @@
 	 */
 	vnode_destroy_vobject(vp);
 
+	if (NFS_ISV4(vp) && vp->v_type == VREG)
+		/*
+		 * We can now safely close any remaining NFSv4 Opens for
+		 * this file. Most opens will have already been closed by
+		 * ncl_inactive(), but there are cases where it is not
+		 * called, so we need to do it again here.
+		 */
+		(void) nfsrpc_close(vp, 1, ap->a_td);
+
 	vfs_hash_remove(vp);
 
 	/*
@@ -300,6 +310,8 @@
 			FREE((caddr_t)dp2, M_NFSDIROFF);
 		}
 	}
+	if (np->n_writecred != NULL)
+		crfree(np->n_writecred);
 	FREE((caddr_t)np->n_fhp, M_NFSFH);
 	if (np->n_v4 != NULL)
 		FREE((caddr_t)np->n_v4, M_NFSV4NODE);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clvfsops.c
--- a/head/sys/fs/nfsclient/nfs_clvfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clvfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvfsops.c 234386 2012-04-17 16:28:22Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvfsops.c 237367 2012-06-21 09:26:06Z kib $");
 
 
 #include "opt_bootp.h"
@@ -1136,7 +1136,8 @@
 out:
 	if (!error) {
 		MNT_ILOCK(mp);
-		mp->mnt_kern_flag |= (MNTK_MPSAFE|MNTK_LOOKUP_SHARED);
+		mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED |
+		    MNTK_NO_IOPF;
 		MNT_IUNLOCK(mp);
 	}
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfs_clvnops.c
--- a/head/sys/fs/nfsclient/nfs_clvnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfs_clvnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvnops.c 233101 2012-03-17 23:03:20Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsclient/nfs_clvnops.c 235332 2012-05-12 12:02:51Z rmacklem $");
 
 /*
  * vnode op calls for Sun NFS version 2, 3 and 4
@@ -513,6 +513,7 @@
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
+	struct ucred *cred;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
@@ -604,7 +605,22 @@
 		}
 		np->n_directio_opens++;
 	}
+
+	/*
+	 * If this is an open for writing, capture a reference to the
+	 * credentials, so they can be used by ncl_putpages(). Using
+	 * these write credentials is preferable to the credentials of
+	 * whatever thread happens to be doing the VOP_PUTPAGES() since
+	 * the write RPCs are less likely to fail with EACCES.
+	 */
+	if ((fmode & FWRITE) != 0) {
+		cred = np->n_writecred;
+		np->n_writecred = crhold(ap->a_cred);
+	} else
+		cred = NULL;
 	mtx_unlock(&np->n_mtx);
+	if (cred != NULL)
+		crfree(cred);
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 	return (0);
 }
@@ -1546,7 +1562,10 @@
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
-		if (attrflag)
+		if (attrflag == 0)
+			error = nfsrpc_getattr(newvp, cnp->cn_cred,
+			    cnp->cn_thread, &nfsva, NULL);
+		if (error == 0)
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsclient/nfsnode.h
--- a/head/sys/fs/nfsclient/nfsnode.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsclient/nfsnode.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/nfsclient/nfsnode.h 230394 2012-01-20 20:02:01Z jhb $
+ * $FreeBSD: head/sys/fs/nfsclient/nfsnode.h 235332 2012-05-12 12:02:51Z rmacklem $
  */
 
 #ifndef _NFSCLIENT_NFSNODE_H_
@@ -123,6 +123,7 @@
 	int                     n_directio_asyncwr;
 	u_int64_t		 n_change;	/* old Change attribute */
 	struct nfsv4node	*n_v4;		/* extra V4 stuff */
+	struct ucred		*n_writecred;	/* Cred. for putpages */
 };
 
 #define	n_atim		n_un1.nf_atim
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsserver/nfs_nfsdport.c
--- a/head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdport.c 234482 2012-04-20 06:50:44Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdport.c 235136 2012-05-08 03:39:44Z jwd $");
 
 #include <sys/capability.h>
 
@@ -505,11 +505,10 @@
 
 out:
 	if (error) {
-		uma_zfree(namei_zone, cnp->cn_pnbuf);
+		nfsvno_relpathbuf(ndp);
 		ndp->ni_vp = NULL;
 		ndp->ni_dvp = NULL;
 		ndp->ni_startdir = NULL;
-		cnp->cn_flags &= ~HASBUF;
 	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
 		ndp->ni_dvp = NULL;
 	}
@@ -1047,6 +1046,8 @@
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
+	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
+		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
@@ -1086,6 +1087,8 @@
 	else
 		vput(ndp->ni_dvp);
 	vput(vp);
+	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
+		nfsvno_relpathbuf(ndp);
 	NFSEXITCODE(error);
 	return (error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nfsserver/nfs_nfsdstate.c
--- a/head/sys/fs/nfsserver/nfs_nfsdstate.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nfsserver/nfs_nfsdstate.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdstate.c 231949 2012-02-21 01:05:12Z kib $");
+__FBSDID("$FreeBSD: head/sys/fs/nfsserver/nfs_nfsdstate.c 235381 2012-05-12 22:20:55Z rmacklem $");
 
 #ifndef APPLEKEXT
 #include <fs/nfs/nfsport.h>
@@ -331,11 +331,13 @@
 		 * Must wait until any outstanding callback on the old clp
 		 * completes.
 		 */
+		NFSLOCKSTATE();
 		while (clp->lc_cbref) {
 			clp->lc_flags |= LCL_WAKEUPWANTED;
-			(void) tsleep((caddr_t)clp, PZERO - 1,
+			(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
 			    "nfsd clp", 10 * hz);
 		}
+		NFSUNLOCKSTATE();
 		nfsrv_zapclient(clp, p);
 		*new_clpp = NULL;
 		goto out;
@@ -385,10 +387,13 @@
 	 * Must wait until any outstanding callback on the old clp
 	 * completes.
 	 */
+	NFSLOCKSTATE();
 	while (clp->lc_cbref) {
 		clp->lc_flags |= LCL_WAKEUPWANTED;
-		(void) tsleep((caddr_t)clp, PZERO - 1, "nfsd clp", 10 * hz);
+		(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1, "nfsd clp",
+		    10 * hz);
 	}
+	NFSUNLOCKSTATE();
 	nfsrv_zapclient(clp, p);
 	*new_clpp = NULL;
 
@@ -3816,11 +3821,9 @@
 	clp->lc_cbref--;
 	if ((clp->lc_flags & LCL_WAKEUPWANTED) && clp->lc_cbref == 0) {
 		clp->lc_flags &= ~LCL_WAKEUPWANTED;
-		NFSUNLOCKSTATE();
-		wakeup((caddr_t)clp);
-	} else {
-		NFSUNLOCKSTATE();
+		wakeup(clp);
 	}
+	NFSUNLOCKSTATE();
 
 	NFSEXITCODE(error);
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs.h
--- a/head/sys/fs/ntfs/ntfs.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,16 +25,16 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs.h 232100 2012-02-24 07:30:44Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs.h 236140 2012-05-27 09:34:47Z ed $
  */
 
 /*#define NTFS_DEBUG 1*/
 
-typedef u_int64_t cn_t;
-typedef u_int16_t wchar;
+typedef uint64_t cn_t;
+typedef uint16_t wchar;
 
 #pragma pack(1)
-#define BBSIZE			1024
+#define	BBSIZE			1024
 #define	BBOFF			((off_t)(0))
 #define	BBLOCK			0
 #define	NTFS_MFTINO		0
@@ -45,157 +45,157 @@
 #define	NTFS_BOOTINO		7
 #define	NTFS_BADCLUSINO		8
 #define	NTFS_UPCASEINO		10
-#define NTFS_MAXFILENAME	255
+#define	NTFS_MAXFILENAME	255
 
 struct fixuphdr {
-	u_int32_t       fh_magic;
-	u_int16_t       fh_foff;
-	u_int16_t       fh_fnum;
+	uint32_t	fh_magic;
+	uint16_t	fh_foff;
+	uint16_t	fh_fnum;
 };
 
-#define NTFS_AF_INRUN	0x00000001
+#define	NTFS_AF_INRUN	0x00000001
 struct attrhdr {
-	u_int32_t       a_type;
-	u_int32_t       reclen;
-	u_int8_t        a_flag;
-	u_int8_t        a_namelen;
-	u_int8_t        a_nameoff;
-	u_int8_t        reserved1;
-	u_int8_t        a_compression;
-	u_int8_t        reserved2;
-	u_int16_t       a_index;
+	uint32_t	a_type;
+	uint32_t	reclen;
+	uint8_t		a_flag;
+	uint8_t		a_namelen;
+	uint8_t		a_nameoff;
+	uint8_t		reserved1;
+	uint8_t		a_compression;
+	uint8_t		reserved2;
+	uint16_t	a_index;
 };
-#define NTFS_A_STD	0x10
-#define NTFS_A_ATTRLIST	0x20
-#define NTFS_A_NAME	0x30
-#define NTFS_A_VOLUMENAME	0x60
-#define NTFS_A_DATA	0x80
+#define	NTFS_A_STD	0x10
+#define	NTFS_A_ATTRLIST	0x20
+#define	NTFS_A_NAME	0x30
+#define	NTFS_A_VOLUMENAME	0x60
+#define	NTFS_A_DATA	0x80
 #define	NTFS_A_INDXROOT	0x90
 #define	NTFS_A_INDX	0xA0
-#define NTFS_A_INDXBITMAP 0xB0
+#define	NTFS_A_INDXBITMAP 0xB0
 
-#define NTFS_MAXATTRNAME	255
+#define	NTFS_MAXATTRNAME	255
 struct attr {
-	struct attrhdr  a_hdr;
+	struct attrhdr	a_hdr;
 	union {
 		struct {
-			u_int16_t       a_datalen;
-			u_int16_t       reserved1;
-			u_int16_t       a_dataoff;
-			u_int16_t       a_indexed;
-		}               a_S_r;
+			uint16_t	a_datalen;
+			uint16_t	reserved1;
+			uint16_t	a_dataoff;
+			uint16_t	a_indexed;
+		} a_S_r;
 		struct {
-			cn_t            a_vcnstart;
-			cn_t            a_vcnend;
-			u_int16_t       a_dataoff;
-			u_int16_t       a_compressalg;
-			u_int32_t       reserved1;
-			u_int64_t       a_allocated;
-			u_int64_t       a_datalen;
-			u_int64_t       a_initialized;
-		}               a_S_nr;
-	}               a_S;
+			cn_t		a_vcnstart;
+			cn_t		a_vcnend;
+			uint16_t	a_dataoff;
+			uint16_t	a_compressalg;
+			uint32_t	reserved1;
+			uint64_t	a_allocated;
+			uint64_t	a_datalen;
+			uint64_t	a_initialized;
+		} a_S_nr;
+	} a_S;
 };
-#define a_r	a_S.a_S_r
-#define a_nr	a_S.a_S_nr
+#define	a_r	a_S.a_S_r
+#define	a_nr	a_S.a_S_nr
 
 typedef struct {
-	u_int64_t       t_create;
-	u_int64_t       t_write;
-	u_int64_t       t_mftwrite;
-	u_int64_t       t_access;
-}               ntfs_times_t;
+	uint64_t	t_create;
+	uint64_t	t_write;
+	uint64_t	t_mftwrite;
+	uint64_t	t_access;
+} ntfs_times_t;
 
-#define NTFS_FFLAG_RDONLY	0x01LL
-#define NTFS_FFLAG_HIDDEN	0x02LL
-#define NTFS_FFLAG_SYSTEM	0x04LL
-#define NTFS_FFLAG_ARCHIVE	0x20LL
-#define NTFS_FFLAG_COMPRESSED	0x0800LL
-#define NTFS_FFLAG_DIR		0x10000000LL
+#define	NTFS_FFLAG_RDONLY	0x01LL
+#define	NTFS_FFLAG_HIDDEN	0x02LL
+#define	NTFS_FFLAG_SYSTEM	0x04LL
+#define	NTFS_FFLAG_ARCHIVE	0x20LL
+#define	NTFS_FFLAG_COMPRESSED	0x0800LL
+#define	NTFS_FFLAG_DIR		0x10000000LL
 
 struct attr_name {
-	u_int32_t       n_pnumber;	/* Parent ntnode */
-	u_int32_t       reserved;
-	ntfs_times_t    n_times;
-	u_int64_t       n_size;
-	u_int64_t       n_attrsz;
-	u_int64_t       n_flag;
-	u_int8_t        n_namelen;
-	u_int8_t        n_nametype;
-	u_int16_t       n_name[1];
+	uint32_t	n_pnumber;	/* Parent ntnode */
+	uint32_t	reserved;
+	ntfs_times_t	n_times;
+	uint64_t	n_size;
+	uint64_t	n_attrsz;
+	uint64_t	n_flag;
+	uint8_t		n_namelen;
+	uint8_t		n_nametype;
+	uint16_t	n_name[1];
 };
 
-#define NTFS_IRFLAG_INDXALLOC	0x00000001
+#define	NTFS_IRFLAG_INDXALLOC	0x00000001
 struct attr_indexroot {
-	u_int32_t       ir_unkn1;	/* always 0x30 */
-	u_int32_t       ir_unkn2;	/* always 0x1 */
-	u_int32_t       ir_size;/* ??? */
-	u_int32_t       ir_unkn3;	/* number of cluster */
-	u_int32_t       ir_unkn4;	/* always 0x10 */
-	u_int32_t       ir_datalen;	/* sizeof simething */
-	u_int32_t       ir_allocated;	/* same as above */
-	u_int16_t       ir_flag;/* ?? always 1 */
-	u_int16_t       ir_unkn7;
+	uint32_t	ir_unkn1;	/* always 0x30 */
+	uint32_t	ir_unkn2;	/* always 0x1 */
+	uint32_t	ir_size;/* ??? */
+	uint32_t	ir_unkn3;	/* number of cluster */
+	uint32_t	ir_unkn4;	/* always 0x10 */
+	uint32_t	ir_datalen;	/* sizeof something */
+	uint32_t	ir_allocated;	/* same as above */
+	uint16_t	ir_flag;/* ?? always 1 */
+	uint16_t	ir_unkn7;
 };
 
 struct attr_attrlist {
-	u_int32_t       al_type;	/* Attribute type */
-	u_int16_t       reclen;		/* length of this entry */
-	u_int8_t        al_namelen;	/* Attribute name len */
-	u_int8_t        al_nameoff;	/* Name offset from entry start */
-	u_int64_t       al_vcnstart;	/* VCN number */
-	u_int32_t       al_inumber;	/* Parent ntnode */
-	u_int32_t       reserved;
-	u_int16_t       al_index;	/* Attribute index in MFT record */
-	u_int16_t       al_name[1];	/* Name */
+	uint32_t	al_type;	/* Attribute type */
+	uint16_t	reclen;		/* length of this entry */
+	uint8_t		al_namelen;	/* Attribute name len */
+	uint8_t		al_nameoff;	/* Name offset from entry start */
+	uint64_t	al_vcnstart;	/* VCN number */
+	uint32_t	al_inumber;	/* Parent ntnode */
+	uint32_t	reserved;
+	uint16_t	al_index;	/* Attribute index in MFT record */
+	uint16_t	al_name[1];	/* Name */
 };
 
-#define	NTFS_INDXMAGIC	(u_int32_t)(0x58444E49)
+#define	NTFS_INDXMAGIC	(uint32_t)(0x58444E49)
 struct attr_indexalloc {
 	struct fixuphdr ia_fixup;
-	u_int64_t       unknown1;
-	cn_t            ia_bufcn;
-	u_int16_t       ia_hdrsize;
-	u_int16_t       unknown2;
-	u_int32_t       ia_inuse;
-	u_int32_t       ia_allocated;
+	uint64_t	unknown1;
+	cn_t		ia_bufcn;
+	uint16_t	ia_hdrsize;
+	uint16_t	unknown2;
+	uint32_t	ia_inuse;
+	uint32_t	ia_allocated;
 };
 
 #define	NTFS_IEFLAG_SUBNODE	0x00000001
 #define	NTFS_IEFLAG_LAST	0x00000002
 
 struct attr_indexentry {
-	u_int32_t       ie_number;
-	u_int32_t       unknown1;
-	u_int16_t       reclen;
-	u_int16_t       ie_size;
-	u_int32_t       ie_flag;/* 1 - has subnodes, 2 - last */
-	u_int32_t       ie_fpnumber;
-	u_int32_t       unknown2;
-	ntfs_times_t    ie_ftimes;
-	u_int64_t       ie_fallocated;
-	u_int64_t       ie_fsize;
-	u_int64_t       ie_fflag;
-	u_int8_t        ie_fnamelen;
-	u_int8_t        ie_fnametype;
-	wchar           ie_fname[NTFS_MAXFILENAME];
+	uint32_t	ie_number;
+	uint32_t	unknown1;
+	uint16_t	reclen;
+	uint16_t	ie_size;
+	uint32_t	ie_flag; /* 1 - has subnodes, 2 - last */
+	uint32_t	ie_fpnumber;
+	uint32_t	unknown2;
+	ntfs_times_t	ie_ftimes;
+	uint64_t	ie_fallocated;
+	uint64_t	ie_fsize;
+	uint64_t	ie_fflag;
+	uint8_t		ie_fnamelen;
+	uint8_t		ie_fnametype;
+	wchar		ie_fname[NTFS_MAXFILENAME];
 	/* cn_t		ie_bufcn;	 buffer with subnodes */
 };
 
-#define	NTFS_FILEMAGIC	(u_int32_t)(0x454C4946)
+#define	NTFS_FILEMAGIC	(uint32_t)(0x454C4946)
 #define	NTFS_BLOCK_SIZE	512
 #define	NTFS_FRFLAG_DIR	0x0002
 struct filerec {
-	struct fixuphdr fr_fixup;
-	u_int8_t        reserved[8];
-	u_int16_t       fr_seqnum;	/* Sequence number */
-	u_int16_t       fr_nlink;
-	u_int16_t       fr_attroff;	/* offset to attributes */
-	u_int16_t       fr_flags;	/* 1-nonresident attr, 2-directory */
-	u_int32_t       fr_size;/* hdr + attributes */
-	u_int32_t       fr_allocated;	/* allocated length of record */
-	u_int64_t       fr_mainrec;	/* main record */
-	u_int16_t       fr_attrnum;	/* maximum attr number + 1 ??? */
+	struct fixuphdr	fr_fixup;
+	uint8_t		reserved[8];
+	uint16_t	fr_seqnum;	/* Sequence number */
+	uint16_t	fr_nlink;
+	uint16_t	fr_attroff;	/* offset to attributes */
+	uint16_t	fr_flags;	/* 1-nonresident attr, 2-directory */
+	uint32_t	fr_size;/* hdr + attributes */
+	uint32_t	fr_allocated;	/* allocated length of record */
+	uint64_t	fr_mainrec;	/* main record */
+	uint16_t	fr_attrnum;	/* maximum attr number + 1 ??? */
 };
 
 #define	NTFS_ATTRNAME_MAXLEN	0x40
@@ -203,66 +203,66 @@
 #define	NTFS_ADFLAG_INDEX	0x0002	/* Attrib can be indexed */
 struct attrdef {
 	wchar		ad_name[NTFS_ATTRNAME_MAXLEN];
-	u_int32_t	ad_type;
-	u_int32_t	reserved1[2];
-	u_int32_t	ad_flag;
-	u_int64_t	ad_minlen;
-	u_int64_t	ad_maxlen;	/* -1 for nonlimited */
+	uint32_t	ad_type;
+	uint32_t	reserved1[2];
+	uint32_t	ad_flag;
+	uint64_t	ad_minlen;
+	uint64_t	ad_maxlen;	/* -1 for nonlimited */
 };
 
 struct ntvattrdef {
 	char		ad_name[0x40];
 	int		ad_namelen;
-	u_int32_t	ad_type;
+	uint32_t	ad_type;
 };
 
 #define	NTFS_BBID	"NTFS    "
 #define	NTFS_BBIDLEN	8
 struct bootfile {
-	u_int8_t        reserved1[3];	/* asm jmp near ... */
-	u_int8_t        bf_sysid[8];	/* 'NTFS    ' */
-	u_int16_t       bf_bps;		/* bytes per sector */
-	u_int8_t        bf_spc;		/* sectors per cluster */
-	u_int8_t        reserved2[7];	/* unused (zeroed) */
-	u_int8_t        bf_media;	/* media desc. (0xF8) */
-	u_int8_t        reserved3[2];
-	u_int16_t       bf_spt;		/* sectors per track */
-	u_int16_t       bf_heads;	/* number of heads */
-	u_int8_t        reserver4[12];
-	u_int64_t       bf_spv;		/* sectors per volume */
-	cn_t            bf_mftcn;	/* $MFT cluster number */
-	cn_t            bf_mftmirrcn;	/* $MFTMirr cn */
-	u_int8_t        bf_mftrecsz;	/* MFT record size (clust) */
+	uint8_t		reserved1[3];	/* asm jmp near ... */
+	uint8_t		bf_sysid[8];	/* 'NTFS    ' */
+	uint16_t	bf_bps;		/* bytes per sector */
+	uint8_t		bf_spc;		/* sectors per cluster */
+	uint8_t		reserved2[7];	/* unused (zeroed) */
+	uint8_t		bf_media;	/* media desc. (0xF8) */
+	uint8_t		reserved3[2];
+	uint16_t	bf_spt;		/* sectors per track */
+	uint16_t	bf_heads;	/* number of heads */
+	uint8_t		reserver4[12];
+	uint64_t	bf_spv;		/* sectors per volume */
+	cn_t		bf_mftcn;	/* $MFT cluster number */
+	cn_t		bf_mftmirrcn;	/* $MFTMirr cn */
+	uint8_t		bf_mftrecsz;	/* MFT record size (clust) */
 					/* 0xF6 inducates 1/4 */
-	u_int32_t       bf_ibsz;	/* index buffer size */
-	u_int32_t       bf_volsn;	/* volume ser. num. */
+	uint32_t	bf_ibsz;	/* index buffer size */
+	uint32_t	bf_volsn;	/* volume ser. num. */
 };
 
 #define	NTFS_SYSNODESNUM	0x0B
 struct ntfsmount {
 	struct mount   *ntm_mountp;	/* filesystem vfs structure */
-	struct bootfile ntm_bootfile;
+	struct bootfile	ntm_bootfile;
 	struct g_consumer *ntm_cp;
 	struct bufobj  *ntm_bo;
 	struct vnode   *ntm_devvp;	/* block device mounted vnode */
 	struct vnode   *ntm_sysvn[NTFS_SYSNODESNUM];
-	u_int32_t       ntm_bpmftrec;
-	uid_t           ntm_uid;
-	gid_t           ntm_gid;
-	mode_t          ntm_mode;
+	uint32_t	ntm_bpmftrec;
+	uid_t		ntm_uid;
+	gid_t		ntm_gid;
+	mode_t		ntm_mode;
 	uint64_t	ntm_flag;
 	cn_t		ntm_cfree;
 	struct ntvattrdef *ntm_ad;
 	int		ntm_adnum;
- 	wchar *		ntm_82u;	/* 8bit to Unicode */
- 	char **		ntm_u28;	/* Unicode to 8 bit */
+	wchar *		ntm_82u;	/* 8bit to Unicode */
+	char **		ntm_u28;	/* Unicode to 8 bit */
 	void *		ntm_ic_l2u;	/* Local to Unicode (iconv) */
 	void *		ntm_ic_u2l;	/* Unicode to Local (iconv) */
-	u_int8_t	ntm_multiplier; /* NTFS blockno to DEV_BSIZE sectorno */
+	uint8_t		ntm_multiplier; /* NTFS blockno to DEV_BSIZE sectorno */
 };
 
-#define ntm_mftcn	ntm_bootfile.bf_mftcn
-#define ntm_mftmirrcn	ntm_bootfile.bf_mftmirrcn
+#define	ntm_mftcn	ntm_bootfile.bf_mftcn
+#define	ntm_mftmirrcn	ntm_bootfile.bf_mftmirrcn
 #define	ntm_mftrecsz	ntm_bootfile.bf_mftrecsz
 #define	ntm_spc		ntm_bootfile.bf_spc
 #define	ntm_bps		ntm_bootfile.bf_bps
@@ -272,17 +272,17 @@
 #define	NTFS_NEXTREC(s, type) ((type)(((caddr_t) s) + (s)->reclen))
 
 /* Convert mount ptr to ntfsmount ptr. */
-#define VFSTONTFS(mp)	((struct ntfsmount *)((mp)->mnt_data))
-#define VTONT(v)	FTONT(VTOF(v))
+#define	VFSTONTFS(mp)	((struct ntfsmount *)((mp)->mnt_data))
+#define	VTONT(v)	FTONT(VTOF(v))
 #define	VTOF(v)		((struct fnode *)((v)->v_data))
 #define	FTOV(f)		((f)->f_vp)
 #define	FTONT(f)	((f)->f_ip)
-#define ntfs_cntobn(cn)	(daddr_t)((cn) * (ntmp->ntm_spc))
-#define ntfs_cntob(cn)	(off_t)((cn) * (ntmp)->ntm_spc * (ntmp)->ntm_bps)
-#define ntfs_btocn(off)	(cn_t)((off) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_btocl(off)	(cn_t)((off + ntfs_cntob(1) - 1) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_btocnoff(off)	(off_t)((off) % ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
-#define ntfs_bntob(bn)	(daddr_t)((bn) * (ntmp)->ntm_bps)
+#define	ntfs_cntobn(cn)	(daddr_t)((cn) * (ntmp->ntm_spc))
+#define	ntfs_cntob(cn)	(off_t)((cn) * (ntmp)->ntm_spc * (ntmp)->ntm_bps)
+#define	ntfs_btocn(off)	(cn_t)((off) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_btocl(off)	(cn_t)((off + ntfs_cntob(1) - 1) / ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_btocnoff(off)	(off_t)((off) % ((ntmp)->ntm_spc * (ntmp)->ntm_bps))
+#define	ntfs_bntob(bn)	(daddr_t)((bn) * (ntmp)->ntm_bps)
 
 #define	ntfs_bpbl	(daddr_t)((ntmp)->ntm_bps)
 
@@ -294,15 +294,15 @@
 #endif
 
 #if defined(NTFS_DEBUG)
-#define dprintf(a) printf a
+#define	dprintf(a)	printf a
 #if NTFS_DEBUG > 1
-#define ddprintf(a) printf a
+#define	ddprintf(a)	printf a
 #else
-#define ddprintf(a)	(void)0
+#define	ddprintf(a)	(void)0
 #endif
 #else
-#define dprintf(a)	(void)0
-#define ddprintf(a)	(void)0
+#define	dprintf(a)	(void)0
+#define	ddprintf(a)	(void)0
 #endif
 
 extern struct vop_vector ntfs_vnodeops;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_subr.c
--- a/head/sys/fs/ntfs/ntfs_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.c 229407 2012-01-03 19:09:01Z pfg $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.c 238315 2012-07-10 00:01:00Z attilio $
  */
 
 #include <sys/param.h>
@@ -1353,174 +1353,6 @@
 }
 
 /*
- * This is one of write routine.
- */
-int
-ntfs_writeattr_plain(
-	struct ntfsmount * ntmp,
-	struct ntnode * ip,
-	u_int32_t attrnum,	
-	char *attrname,
-	off_t roff,
-	size_t rsize,
-	void *rdata,
-	size_t * initp,
-	struct uio *uio)
-{
-	size_t          init;
-	int             error = 0;
-	off_t           off = roff, left = rsize, towrite;
-	caddr_t         data = rdata;
-	struct ntvattr *vap;
-	*initp = 0;
-
-	while (left) {
-		error = ntfs_ntvattrget(ntmp, ip, attrnum, attrname,
-					ntfs_btocn(off), &vap);
-		if (error)
-			return (error);
-		towrite = MIN(left, ntfs_cntob(vap->va_vcnend + 1) - off);
-		ddprintf(("ntfs_writeattr_plain: o: %d, s: %d (%d - %d)\n",
-			 (u_int32_t) off, (u_int32_t) towrite,
-			 (u_int32_t) vap->va_vcnstart,
-			 (u_int32_t) vap->va_vcnend));
-		error = ntfs_writentvattr_plain(ntmp, ip, vap,
-					 off - ntfs_cntob(vap->va_vcnstart),
-					 towrite, data, &init, uio);
-		if (error) {
-			printf("ntfs_writeattr_plain: " \
-			       "ntfs_writentvattr_plain failed: o: %d, s: %d\n",
-			       (u_int32_t) off, (u_int32_t) towrite);
-			printf("ntfs_writeattr_plain: attrib: %d - %d\n",
-			       (u_int32_t) vap->va_vcnstart, 
-			       (u_int32_t) vap->va_vcnend);
-			ntfs_ntvattrrele(vap);
-			break;
-		}
-		ntfs_ntvattrrele(vap);
-		left -= towrite;
-		off += towrite;
-		data = data + towrite;
-		*initp += init;
-	}
-
-	return (error);
-}
-
-/*
- * This is one of write routine.
- *
- * ntnode should be locked.
- */
-int
-ntfs_writentvattr_plain(
-	struct ntfsmount * ntmp,
-	struct ntnode * ip,
-	struct ntvattr * vap,
-	off_t roff,
-	size_t rsize,
-	void *rdata,
-	size_t * initp,
-	struct uio *uio)
-{
-	int             error = 0;
-	off_t           off;
-	int             cnt;
-	cn_t            ccn, ccl, cn, left, cl;
-	caddr_t         data = rdata;
-	struct buf     *bp;
-	size_t          tocopy;
-
-	*initp = 0;
-
-	if ((vap->va_flag & NTFS_AF_INRUN) == 0) {
-		printf("ntfs_writevattr_plain: CAN'T WRITE RES. ATTRIBUTE\n");
-		return ENOTTY;
-	}
-
-	ddprintf(("ntfs_writentvattr_plain: data in run: %ld chains\n",
-		 vap->va_vruncnt));
-
-	off = roff;
-	left = rsize;
-	ccl = 0;
-	ccn = 0;
-	cnt = 0;
-	for (; left && (cnt < vap->va_vruncnt); cnt++) {
-		ccn = vap->va_vruncn[cnt];
-		ccl = vap->va_vruncl[cnt];
-
-		ddprintf(("ntfs_writentvattr_plain: " \
-			 "left %d, cn: 0x%x, cl: %d, off: %d\n", \
-			 (u_int32_t) left, (u_int32_t) ccn, \
-			 (u_int32_t) ccl, (u_int32_t) off));
-
-		if (ntfs_cntob(ccl) < off) {
-			off -= ntfs_cntob(ccl);
-			cnt++;
-			continue;
-		}
-		if (!ccn && ip->i_number != NTFS_BOOTINO)
-			continue; /* XXX */
-
-		ccl -= ntfs_btocn(off);
-		cn = ccn + ntfs_btocn(off);
-		off = ntfs_btocnoff(off);
-
-		while (left && ccl) {
-			/*
-			 * Always read and write single clusters at a time -
-			 * we need to avoid requesting differently-sized
-			 * blocks at the same disk offsets to avoid
-			 * confusing the buffer cache.
-			 */
-			tocopy = MIN(left, ntfs_cntob(1) - off);
-			cl = ntfs_btocl(tocopy + off);
-			KASSERT(cl == 1 && tocopy <= ntfs_cntob(1),
-			    ("single cluster limit mistake"));
-			ddprintf(("ntfs_writentvattr_plain: write: " \
-				"cn: 0x%x cl: %d, off: %d len: %d, left: %d\n",
-				(u_int32_t) cn, (u_int32_t) cl, 
-				(u_int32_t) off, (u_int32_t) tocopy, 
-				(u_int32_t) left));
-			if ((off == 0) && (tocopy == ntfs_cntob(cl)))
-			{
-				bp = getblk(ntmp->ntm_devvp, ntfs_cntobn(cn)
-					    * ntmp->ntm_multiplier,
-					    ntfs_cntob(cl), 0, 0, 0);
-				clrbuf(bp);
-			} else {
-				error = bread(ntmp->ntm_devvp, ntfs_cntobn(cn)
-					      * ntmp->ntm_multiplier,
-					      ntfs_cntob(cl), NOCRED, &bp);
-				if (error) {
-					brelse(bp);
-					return (error);
-				}
-			}
-			if (uio)
-				uiomove(bp->b_data + off, tocopy, uio);
-			else
-				memcpy(bp->b_data + off, data, tocopy);
-			bawrite(bp);
-			data = data + tocopy;
-			*initp += tocopy;
-			off = 0;
-			left -= tocopy;
-			cn += cl;
-			ccl -= cl;
-		}
-	}
-
-	if (left) {
-		printf("ntfs_writentvattr_plain: POSSIBLE RUN ERROR\n");
-		error = EINVAL;
-	}
-
-	return (error);
-}
-
-/*
  * This is one of read routines.
  *
  * ntnode should be locked.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_subr.h
--- a/head/sys/fs/ntfs/ntfs_subr.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_subr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.h 228864 2011-12-24 15:49:52Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_subr.h 238315 2012-07-10 00:01:00Z attilio $
  */
 
 #define	VA_LOADED		0x0001
@@ -99,8 +99,6 @@
 void ntfs_ntrele(struct ntnode *);
 void ntfs_ntput(struct ntnode *);
 int ntfs_loadntnode( struct ntfsmount *, struct ntnode * );
-int ntfs_writentvattr_plain(struct ntfsmount *, struct ntnode *, struct ntvattr *, off_t, size_t, void *, size_t *, struct uio *);
-int ntfs_writeattr_plain(struct ntfsmount *, struct ntnode *, u_int32_t, char *, off_t, size_t, void *, size_t *, struct uio *);
 void ntfs_toupper_init(void);
 void ntfs_toupper_destroy(void);
 int ntfs_toupper_use(struct mount *, struct ntfsmount *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_vfsops.c
--- a/head/sys/fs/ntfs/ntfs_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_vfsops.c 232483 2012-03-04 09:38:20Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_vfsops.c 238320 2012-07-10 00:23:25Z attilio $
  */
 
 
@@ -152,7 +152,6 @@
 ntfs_mount(struct mount *mp)
 {
 	int err = 0, error;
-	accmode_t accmode;
 	struct vnode *devvp;
 	struct nameidata ndp;
 	struct thread *td;
@@ -162,6 +161,11 @@
 	if (vfs_filteropt(mp->mnt_optnew, ntfs_opts))
 		return (EINVAL);
 
+	/* Force mount as read-only. */
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_RDONLY;
+	MNT_IUNLOCK(mp);
+
 	from = vfs_getopts(mp->mnt_optnew, "from", &error);
 	if (error)	
 		return (error);
@@ -173,11 +177,10 @@
 	if (mp->mnt_flag & MNT_UPDATE) {
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			/* Process export requests in vfs_mount.c */
-			goto success;
+			return (0);
 		} else {
 			printf("ntfs_mount(): MNT_UPDATE not supported\n");
-			err = EINVAL;
-			goto error_1;
+			return (EINVAL);
 		}
 	}
 
@@ -187,10 +190,8 @@
 	 */
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	err = namei(&ndp);
-	if (err) {
-		/* can't get devvp!*/
-		goto error_1;
-	}
+	if (err)
+		return (err);
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 	devvp = ndp.ni_vp;
 
@@ -203,10 +204,7 @@
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
-	accmode = VREAD;
-	if ((mp->mnt_flag & MNT_RDONLY) == 0)
-		accmode |= VWRITE;
-	err = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
+	err = VOP_ACCESS(devvp, VREAD, td->td_ucred, td);
 	if (err)
 		err = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (err) {
@@ -214,52 +212,23 @@
 		return (err);
 	}
 
-	if (mp->mnt_flag & MNT_UPDATE) {
-#if 0
-		/*
-		 ********************
-		 * UPDATE
-		 ********************
-		 */
 
-		if (devvp != ntmp->um_devvp)
-			err = EINVAL;	/* needs translation */
-		vput(devvp);
-		if (err)
-			return (err);
-#endif
-	} else {
-		/*
-		 ********************
-		 * NEW MOUNT
-		 ********************
-		 */
+	/*
+	 * Since this is a new mount, we want the names for the device and
+	 * the mount point copied in.  If an error occurs, the mountpoint is
+	 * discarded by the upper level code.  Note that vfs_mount() handles
+	 * copying the mountpoint f_mntonname for us, so we don't have to do
+	 * it here unless we want to set it to something other than "path"
+	 * for some reason.
+	 */
 
-		/*
-		 * Since this is a new mount, we want the names for
-		 * the device and the mount point copied in.  If an
-		 * error occurs, the mountpoint is discarded by the
-		 * upper level code.  Note that vfs_mount() handles
-		 * copying the mountpoint f_mntonname for us, so we
-		 * don't have to do it here unless we want to set it
-		 * to something other than "path" for some rason.
-		 */
-		/* Save "mounted from" info for mount point (NULL pad)*/
+	err = ntfs_mountfs(devvp, mp, td);
+	if (err == 0) {
+
+		/* Save "mounted from" info for mount point. */
 		vfs_mountedfrom(mp, from);
-
-		err = ntfs_mountfs(devvp, mp, td);
-	}
-	if (err) {
+	} else
 		vrele(devvp);
-		return (err);
-	}
-
-	goto success;
-
-error_1:	/* no state to back out*/
-	/* XXX: missing NDFREE(&ndp, ...) */
-
-success:
 	return (err);
 }
 
@@ -275,13 +244,12 @@
 	struct buf *bp;
 	struct ntfsmount *ntmp;
 	struct cdev *dev = devvp->v_rdev;
-	int error, ronly, i, v;
+	int error, i, v;
 	struct vnode *vp;
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	char *cs_ntfs, *cs_local;
 
-	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 	DROP_GIANT();
 	g_topology_lock();
 
@@ -296,7 +264,7 @@
  	if ((pp != NULL) && ((pp->acr | pp->acw | pp->ace ) != 0)) 
 		error = EPERM;
 	else 
-		error = g_vfs_open(devvp, &cp, "ntfs", ronly ? 0 : 1);
+		error = g_vfs_open(devvp, &cp, "ntfs", 0);
 
 	g_topology_unlock();
 	PICKUP_GIANT();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/ntfs/ntfs_vnops.c
--- a/head/sys/fs/ntfs/ntfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/ntfs/ntfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/ntfs/ntfs_vnops.c 228864 2011-12-24 15:49:52Z kevlo $
+ * $FreeBSD: head/sys/fs/ntfs/ntfs_vnops.c 238315 2012-07-10 00:01:00Z attilio $
  *
  */
 
@@ -67,7 +67,6 @@
 #include <sys/unistd.h> /* for pathconf(2) constants */
 
 static vop_read_t	ntfs_read;
-static vop_write_t	ntfs_write;
 static vop_getattr_t	ntfs_getattr;
 static vop_inactive_t	ntfs_inactive;
 static vop_reclaim_t	ntfs_reclaim;
@@ -78,7 +77,6 @@
 static vop_close_t	ntfs_close;
 static vop_readdir_t	ntfs_readdir;
 static vop_cachedlookup_t	ntfs_lookup;
-static vop_fsync_t	ntfs_fsync;
 static vop_pathconf_t	ntfs_pathconf;
 static vop_vptofh_t	ntfs_vptofh;
 
@@ -272,6 +270,7 @@
 	register struct fnode *fp = VTOF(vp);
 	register struct ntnode *ip = FTONT(fp);
 	struct ntfsmount *ntmp = ip->i_mp;
+	u_int32_t toread;
 	int error;
 
 	dprintf(("ntfs_strategy: offset: %d, blkno: %d, lblkno: %d\n",
@@ -281,99 +280,33 @@
 	dprintf(("strategy: bcount: %d flags: 0x%x\n", 
 		(u_int32_t)bp->b_bcount,bp->b_flags));
 
-	if (bp->b_iocmd == BIO_READ) {
-		u_int32_t toread;
+	KASSERT(bp->b_iocmd == BIO_READ, ("Invalid buffer\n"));
 
-		if (ntfs_cntob(bp->b_blkno) >= fp->f_size) {
-			clrbuf(bp);
-			error = 0;
-		} else {
-			toread = MIN(bp->b_bcount,
-				 fp->f_size-ntfs_cntob(bp->b_blkno));
-			dprintf(("ntfs_strategy: toread: %d, fsize: %d\n",
-				toread,(u_int32_t)fp->f_size));
+	if (ntfs_cntob(bp->b_blkno) >= fp->f_size) {
+		clrbuf(bp);
+		error = 0;
+	} else {
+		toread = MIN(bp->b_bcount,
+			 fp->f_size-ntfs_cntob(bp->b_blkno));
+		dprintf(("ntfs_strategy: toread: %d, fsize: %d\n",
+			toread,(u_int32_t)fp->f_size));
 
-			error = ntfs_readattr(ntmp, ip, fp->f_attrtype,
-				fp->f_attrname, ntfs_cntob(bp->b_blkno),
-				toread, bp->b_data, NULL);
+		error = ntfs_readattr(ntmp, ip, fp->f_attrtype,
+			fp->f_attrname, ntfs_cntob(bp->b_blkno),
+			toread, bp->b_data, NULL);
 
-			if (error) {
-				printf("ntfs_strategy: ntfs_readattr failed\n");
-				bp->b_error = error;
-				bp->b_ioflags |= BIO_ERROR;
-			}
+		if (error) {
+			printf("ntfs_strategy: ntfs_readattr failed\n");
+			bp->b_error = error;
+			bp->b_ioflags |= BIO_ERROR;
+		}
 
-			bzero(bp->b_data + toread, bp->b_bcount - toread);
-		}
-	} else {
-		size_t tmp;
-		u_int32_t towrite;
-
-		if (ntfs_cntob(bp->b_blkno) + bp->b_bcount >= fp->f_size) {
-			printf("ntfs_strategy: CAN'T EXTEND FILE\n");
-			bp->b_error = error = EFBIG;
-			bp->b_ioflags |= BIO_ERROR;
-		} else {
-			towrite = MIN(bp->b_bcount,
-				fp->f_size-ntfs_cntob(bp->b_blkno));
-			dprintf(("ntfs_strategy: towrite: %d, fsize: %d\n",
-				towrite,(u_int32_t)fp->f_size));
-
-			error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,	
-				fp->f_attrname, ntfs_cntob(bp->b_blkno),towrite,
-				bp->b_data, &tmp, NULL);
-
-			if (error) {
-				printf("ntfs_strategy: ntfs_writeattr fail\n");
-				bp->b_error = error;
-				bp->b_ioflags |= BIO_ERROR;
-			}
-		}
+		bzero(bp->b_data + toread, bp->b_bcount - toread);
 	}
 	bufdone(bp);
 	return (0);
 }
 
-static int
-ntfs_write(ap)
-	struct vop_write_args /* {
-		struct vnode *a_vp;
-		struct uio *a_uio;
-		int  a_ioflag;
-		struct ucred *a_cred;
-	} */ *ap;
-{
-	register struct vnode *vp = ap->a_vp;
-	register struct fnode *fp = VTOF(vp);
-	register struct ntnode *ip = FTONT(fp);
-	struct uio *uio = ap->a_uio;
-	struct ntfsmount *ntmp = ip->i_mp;
-	u_int64_t towrite;
-	size_t written;
-	int error;
-
-	dprintf(("ntfs_write: ino: %d, off: %d resid: %d, segflg: %d\n",ip->i_number,(u_int32_t)uio->uio_offset,uio->uio_resid,uio->uio_segflg));
-	dprintf(("ntfs_write: filesize: %d",(u_int32_t)fp->f_size));
-
-	if (uio->uio_resid + uio->uio_offset > fp->f_size) {
-		printf("ntfs_write: CAN'T WRITE BEYOND END OF FILE\n");
-		return (EFBIG);
-	}
-
-	towrite = MIN(uio->uio_resid, fp->f_size - uio->uio_offset);
-
-	dprintf((", towrite: %d\n",(u_int32_t)towrite));
-
-	error = ntfs_writeattr_plain(ntmp, ip, fp->f_attrtype,
-		fp->f_attrname, uio->uio_offset, towrite, NULL, &written, uio);
-#ifdef NTFS_DEBUG
-	if (error)
-		printf("ntfs_write: ntfs_writeattr failed: %d\n", error);
-#endif
-
-	return (error);
-}
-
 int
 ntfs_access(ap)
 	struct vop_access_args /* {
@@ -390,7 +323,7 @@
 	dprintf(("ntfs_access: %d\n",ip->i_number));
 
 	/*
-	 * Disallow write attempts on read-only filesystems;
+	 * Disallow write attempts as we assume read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
@@ -399,8 +332,8 @@
 		case VDIR:
 		case VLNK:
 		case VREG:
-			if (vp->v_mount->mnt_flag & MNT_RDONLY)
-				return (EROFS);
+			return (EROFS);
+		default:
 			break;
 		}
 	}
@@ -493,8 +426,13 @@
 
 	/* Simulate . in every dir except ROOT */
 	if( ip->i_number != NTFS_ROOTINO ) {
-		struct dirent dot = { NTFS_ROOTINO,
-				sizeof(struct dirent), DT_DIR, 1, "." };
+		struct dirent dot = {
+			.d_fileno = NTFS_ROOTINO,
+			.d_reclen = sizeof(struct dirent),
+			.d_type = DT_DIR,
+			.d_namlen = 1,
+			.d_name = "."
+		};
 
 		if( uio->uio_offset < sizeof(struct dirent) ) {
 			dot.d_fileno = ip->i_number;
@@ -508,8 +446,13 @@
 
 	/* Simulate .. in every dir including ROOT */
 	if( uio->uio_offset < 2 * sizeof(struct dirent) ) {
-		struct dirent dotdot = { NTFS_ROOTINO,
-				sizeof(struct dirent), DT_DIR, 2, ".." };
+		struct dirent dotdot = {
+			.d_fileno = NTFS_ROOTINO,
+			.d_reclen = sizeof(struct dirent),
+			.d_type = DT_DIR,
+			.d_namlen = 2,
+			.d_name = ".."
+		};
 
 		error = uiomove((char *)&dotdot,sizeof(struct dirent),uio);
 		if(error)
@@ -620,7 +563,6 @@
 		return (error);
 
 	if ((cnp->cn_flags & ISLASTCN) &&
-	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
@@ -669,24 +611,6 @@
 }
 
 /*
- * Flush the blocks of a file to disk.
- *
- * This function is worthless for vnodes that represent directories. Maybe we
- * could just do a sync if they try an fsync on a directory file.
- */
-static int
-ntfs_fsync(ap)
-	struct vop_fsync_args /* {
-		struct vnode *a_vp;
-		struct ucred *a_cred;
-		int a_waitfor;
-		struct thread *a_td;
-	} */ *ap;
-{
-	return (0);
-}
-
-/*
  * Return POSIX pathconf information applicable to NTFS filesystem
  */
 int
@@ -746,7 +670,6 @@
 	.vop_bmap =		ntfs_bmap,
 	.vop_cachedlookup =	ntfs_lookup,
 	.vop_close =		ntfs_close,
-	.vop_fsync =		ntfs_fsync,
 	.vop_getattr =		ntfs_getattr,
 	.vop_inactive =		ntfs_inactive,
 	.vop_lookup =		vfs_cache_lookup,
@@ -756,6 +679,5 @@
 	.vop_readdir =		ntfs_readdir,
 	.vop_reclaim =		ntfs_reclaim,
 	.vop_strategy =		ntfs_strategy,
-	.vop_write =		ntfs_write,
 	.vop_vptofh =		ntfs_vptofh,
 };
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/nullfs/null_vnops.c
--- a/head/sys/fs/nullfs/null_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/nullfs/null_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  *	...and...
  *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
  *
- * $FreeBSD: head/sys/fs/nullfs/null_vnops.c 232303 2012-02-29 15:15:36Z kib $
+ * $FreeBSD: head/sys/fs/nullfs/null_vnops.c 234607 2012-04-23 14:10:34Z trasz $
  */
 
 /*
@@ -678,7 +678,6 @@
 null_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
-	struct thread *td = ap->a_td;
 
 	vp->v_object = NULL;
 
@@ -686,7 +685,7 @@
 	 * If this is the last reference, then free up the vnode
 	 * so as not to tie up the lower vnodes.
 	 */
-	vrecycle(vp, td);
+	vrecycle(vp);
 
 	return (0);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/portalfs/portal_vnops.c
--- a/head/sys/fs/portalfs/portal_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/portalfs/portal_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  *
  *	@(#)portal_vnops.c	8.14 (Berkeley) 5/21/95
  *
- * $FreeBSD: head/sys/fs/portalfs/portal_vnops.c 226497 2011-10-18 07:31:49Z des $
+ * $FreeBSD: head/sys/fs/portalfs/portal_vnops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 /*
@@ -110,7 +110,7 @@
 	char *pname = cnp->cn_nameptr;
 	struct portalnode *pt;
 	int error;
-	struct vnode *fvp = 0;
+	struct vnode *fvp = NULL;
 	char *path;
 	int size;
 
@@ -217,14 +217,14 @@
 		struct thread *a_td;
 	} */ *ap;
 {
-	struct socket *so = 0;
+	struct socket *so = NULL;
 	struct portalnode *pt;
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct uio auio;
 	struct iovec aiov[2];
 	int res;
-	struct mbuf *cm = 0;
+	struct mbuf *cm = NULL;
 	struct cmsghdr *cmsg;
 	int newfds;
 	int *ip;
@@ -356,7 +356,7 @@
 
 	len = auio.uio_resid = sizeof(int);
 	do {
-		struct mbuf *m = 0;
+		struct mbuf *m = NULL;
 		int flags = MSG_WAITALL;
 		error = soreceive(so, (struct sockaddr **) 0, &auio,
 					&m, &cm, &flags);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/smbfs/smbfs_node.c
--- a/head/sys/fs/smbfs/smbfs_node.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/smbfs/smbfs_node.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/smbfs/smbfs_node.c 227293 2011-11-07 06:44:47Z ed $
+ * $FreeBSD: head/sys/fs/smbfs/smbfs_node.c 238539 2012-07-16 22:07:29Z brueffer $
  */
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -223,19 +223,16 @@
 	if (fap == NULL)
 		return ENOENT;
 
-	np = malloc(sizeof *np, M_SMBNODE, M_WAITOK);
 	error = getnewvnode("smbfs", mp, &smbfs_vnodeops, &vp);
-	if (error) {
-		free(np, M_SMBNODE);
-		return error;
-	}
+	if (error != 0)
+		return (error);
 	error = insmntque(vp, mp);	/* XXX: Too early for mpsafe fs */
-	if (error != 0) {
-		free(np, M_SMBNODE);
+	if (error != 0)
 		return (error);
-	}
+
+	np = malloc(sizeof *np, M_SMBNODE, M_WAITOK | M_ZERO);
+
 	vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG;
-	bzero(np, sizeof(*np));
 	vp->v_data = np;
 	np->n_vnode = vp;
 	np->n_mount = VFSTOSMBFS(mp);
@@ -373,7 +370,7 @@
 		smbfs_attr_cacheremove(vp);
 	}
 	if (np->n_flag & NGONE)
-		vrecycle(vp, td);
+		vrecycle(vp);
 	return (0);
 }
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/tmpfs/tmpfs_vnops.c
--- a/head/sys/fs/tmpfs/tmpfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/tmpfs/tmpfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -34,7 +34,7 @@
  * tmpfs vnode interface.
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/fs/tmpfs/tmpfs_vnops.c 234064 2012-04-09 17:05:18Z attilio $");
+__FBSDID("$FreeBSD: head/sys/fs/tmpfs/tmpfs_vnops.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include <sys/param.h>
 #include <sys/fcntl.h>
@@ -1577,7 +1577,6 @@
 tmpfs_inactive(struct vop_inactive_args *v)
 {
 	struct vnode *vp = v->a_vp;
-	struct thread *l = v->a_td;
 
 	struct tmpfs_node *node;
 
@@ -1586,7 +1585,7 @@
 	node = VP_TO_TMPFS_NODE(vp);
 
 	if (node->tn_links == 0)
-		vrecycle(vp, l);
+		vrecycle(vp);
 
 	return 0;
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/udf/udf_vfsops.c
--- a/head/sys/fs/udf/udf_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/udf/udf_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/fs/udf/udf_vfsops.c 222167 2011-05-22 01:07:54Z rmacklem $
+ * $FreeBSD: head/sys/fs/udf/udf_vfsops.c 238697 2012-07-22 15:40:31Z kevlo $
  */
 
 /* udf_vfsops.c */
@@ -190,7 +190,7 @@
 {
 	struct vnode *devvp;	/* vnode of the mount device */
 	struct thread *td;
-	struct udf_mnt *imp = 0;
+	struct udf_mnt *imp = NULL;
 	struct vfsoptlist *opts;
 	char *fspec, *cs_disk, *cs_local;
 	int error, len, *udf_flags;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_subr.c
--- a/head/sys/fs/unionfs/union_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * Copyright (c) 1994 Jan-Simon Pendry
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
- * $FreeBSD: head/sys/fs/unionfs/union_subr.c 232701 2012-03-08 20:27:20Z jhb $
+ * $FreeBSD: head/sys/fs/unionfs/union_subr.c 235503 2012-05-16 10:44:09Z gleb $
  */
 
 #include <sys/param.h>
@@ -350,19 +350,22 @@
 	uvp = unp->un_uppervp;
 	dvp = unp->un_dvp;
 	unp->un_lowervp = unp->un_uppervp = NULLVP;
-
 	vp->v_vnlock = &(vp->v_lock);
 	vp->v_data = NULL;
-	lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(vp));
+	vp->v_object = NULL;
+	VI_UNLOCK(vp);
+
 	if (lvp != NULLVP)
-		VOP_UNLOCK(lvp, 0);
+		VOP_UNLOCK(lvp, LK_RELEASE);
 	if (uvp != NULLVP)
-		VOP_UNLOCK(uvp, 0);
-	vp->v_object = NULL;
+		VOP_UNLOCK(uvp, LK_RELEASE);
 
 	if (dvp != NULLVP && unp->un_hash.le_prev != NULL)
 		unionfs_rem_cached_vnode(unp, dvp);
 
+	if (lockmgr(vp->v_vnlock, LK_EXCLUSIVE, VI_MTX(vp)) != 0)
+		panic("the lock for deletion is unacquirable.");
+
 	if (lvp != NULLVP) {
 		vfslocked = VFS_LOCK_GIANT(lvp->v_mount);
 		vrele(lvp);
@@ -550,7 +553,7 @@
 		cn->cn_flags |= (cnp->cn_flags & SAVESTART);
 
 	vref(dvp);
-	VOP_UNLOCK(dvp, 0);
+	VOP_UNLOCK(dvp, LK_RELEASE);
 
 	if ((error = relookup(dvp, vpp, cn))) {
 		uma_zfree(namei_zone, cn->cn_pnbuf);
@@ -957,7 +960,7 @@
 	*vpp = vp;
 
 unionfs_vn_create_on_upper_free_out1:
-	VOP_UNLOCK(udvp, 0);
+	VOP_UNLOCK(udvp, LK_RELEASE);
 
 unionfs_vn_create_on_upper_free_out2:
 	if (cn.cn_flags & HASBUF) {
@@ -1181,7 +1184,7 @@
 		edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
 		for (dp = (struct dirent*)buf; !error && dp < edp;
 		     dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
-			if (dp->d_type == DT_WHT ||
+			if (dp->d_type == DT_WHT || dp->d_fileno == 0 ||
 			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
 			    (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
 				continue;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_vfsops.c
--- a/head/sys/fs/unionfs/union_vfsops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_vfsops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,8 +1,8 @@
 /*-
  * Copyright (c) 1994, 1995 The Regents of the University of California.
  * Copyright (c) 1994, 1995 Jan-Simon Pendry.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_vfsops.c	8.20 (Berkeley) 5/20/95
- * $FreeBSD: head/sys/fs/unionfs/union_vfsops.c 232918 2012-03-13 10:04:13Z kevlo $
+ * $FreeBSD: head/sys/fs/unionfs/union_vfsops.c 234867 2012-05-01 07:46:30Z daichi $
  */
 
 #include <sys/param.h>
@@ -165,7 +165,7 @@
 		uid = va.va_uid;
 		gid = va.va_gid;
 	}
-	VOP_UNLOCK(mp->mnt_vnodecovered, 0);
+	VOP_UNLOCK(mp->mnt_vnodecovered, LK_RELEASE);
 	if (error)
 		return (error);
 
@@ -250,7 +250,7 @@
 	 * Save reference
 	 */
 	if (below) {
-		VOP_UNLOCK(upperrootvp, 0);
+		VOP_UNLOCK(upperrootvp, LK_RELEASE);
 		vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY);
 		ump->um_lowervp = upperrootvp;
 		ump->um_uppervp = lowerrootvp;
@@ -281,7 +281,7 @@
 	/*
 	 * Unlock the node
 	 */
-	VOP_UNLOCK(ump->um_uppervp, 0);
+	VOP_UNLOCK(ump->um_uppervp, LK_RELEASE);
 
 	/*
 	 * Get the unionfs root vnode.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/fs/unionfs/union_vnops.c
--- a/head/sys/fs/unionfs/union_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/fs/unionfs/union_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry.
  * Copyright (c) 1992, 1993, 1994, 1995
  *      The Regents of the University of California.
- * Copyright (c) 2005, 2006 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
- * Copyright (c) 2006 Daichi Goto <daichi at freebsd.org>
+ * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa at ongs.co.jp>, ONGS Inc.
+ * Copyright (c) 2006, 2012 Daichi Goto <daichi at freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
@@ -34,7 +34,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)union_vnops.c	8.32 (Berkeley) 6/23/95
- * $FreeBSD: head/sys/fs/unionfs/union_vnops.c 226234 2011-10-10 21:32:08Z trasz $
+ * $FreeBSD: head/sys/fs/unionfs/union_vnops.c 234944 2012-05-03 07:22:29Z daichi $
  *
  */
 
@@ -75,21 +75,6 @@
 	KASSERT(((vp)->v_op == &unionfs_vnodeops), \
 	    ("unionfs: it is not unionfs-vnode"))
 
-/* lockmgr lock <-> reverse table */
-struct lk_lr_table {
-	int	lock;
-	int	revlock;
-};
-
-static struct lk_lr_table un_llt[] = {
-	{LK_SHARED, LK_RELEASE},
-	{LK_EXCLUSIVE, LK_RELEASE},
-	{LK_UPGRADE, LK_DOWNGRADE},
-	{LK_DOWNGRADE, LK_UPGRADE},
-	{0, 0}
-};
-
-
 static int
 unionfs_lookup(struct vop_cachedlookup_args *ap)
 {
@@ -141,7 +126,7 @@
 		if (udvp != NULLVP) {
 			dtmpvp = udvp;
 			if (ldvp != NULLVP)
-				VOP_UNLOCK(ldvp, 0);
+				VOP_UNLOCK(ldvp, LK_RELEASE);
 		}
 		else
 			dtmpvp = ldvp;
@@ -149,7 +134,7 @@
 		error = VOP_LOOKUP(dtmpvp, &vp, cnp);
 
 		if (dtmpvp == udvp && ldvp != NULLVP) {
-			VOP_UNLOCK(udvp, 0);
+			VOP_UNLOCK(udvp, LK_RELEASE);
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		}
 
@@ -161,10 +146,10 @@
 			 */
 			if (nameiop == DELETE  || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
-				VOP_UNLOCK(vp, 0);
+				VOP_UNLOCK(vp, LK_RELEASE);
 			vrele(vp);
 
-			VOP_UNLOCK(dvp, 0);
+			VOP_UNLOCK(dvp, LK_RELEASE);
 			*(ap->a_vpp) = dunp->un_dvp;
 			vref(dunp->un_dvp);
 
@@ -202,7 +187,7 @@
 			}
 			if (nameiop == DELETE || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
-				VOP_UNLOCK(uvp, 0);
+				VOP_UNLOCK(uvp, LK_RELEASE);
 		}
 
 		/* check whiteout */
@@ -246,7 +231,7 @@
 				return (lerror);
 			}
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
-				VOP_UNLOCK(lvp, 0);
+				VOP_UNLOCK(lvp, LK_RELEASE);
 		}
 	}
 
@@ -281,7 +266,7 @@
 			goto unionfs_lookup_out;
 
 		if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK))
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 		if (LK_EXCLUSIVE != VOP_ISLOCKED(vp)) {
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			lockflag = 1;
@@ -289,7 +274,7 @@
 		error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount),
 		    udvp, VTOUNIONFS(vp), cnp, td);
 		if (lockflag != 0)
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir.");
 			if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE)
@@ -386,7 +371,7 @@
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
@@ -460,7 +445,7 @@
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
-			VOP_UNLOCK(vp, 0);
+			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
@@ -564,6 +549,7 @@
 	struct unionfs_node_status *unsp;
 	struct ucred   *cred;
 	struct thread  *td;
+	struct vnode   *vp;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n");
@@ -571,12 +557,14 @@
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	locked = 0;
-	unp = VTOUNIONFS(ap->a_vp);
+	vp = ap->a_vp;
+	unp = VTOUNIONFS(vp);
 	cred = ap->a_cred;
 	td = ap->a_td;
 
-	if (VOP_ISLOCKED(ap->a_vp) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
+	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		locked = 1;
 	}
 	unionfs_get_node_status(unp, td, &unsp);
@@ -599,7 +587,7 @@
 	if (error != 0)
 		goto unionfs_close_abort;
 
-	ap->a_vp->v_object = ovp->v_object;
+	vp->v_object = ovp->v_object;
 
 	if (ovp == unp->un_uppervp) {
 		unsp->uns_upper_opencnt--;
@@ -610,7 +598,7 @@
 				unsp->uns_lower_opencnt--;
 			}
 			if (unsp->uns_lower_opencnt > 0)
-				ap->a_vp->v_object = unp->un_lowervp->v_object;
+				vp->v_object = unp->un_lowervp->v_object;
 		}
 	} else
 		unsp->uns_lower_opencnt--;
@@ -619,7 +607,7 @@
 	unionfs_tryrem_node_status(unp, unsp);
 
 	if (locked != 0)
-		VOP_UNLOCK(ap->a_vp, 0);
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error);
 
@@ -914,7 +902,7 @@
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
-	VOP_UNLOCK(ap->a_vp, 0);
+	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
@@ -941,7 +929,7 @@
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
-	VOP_UNLOCK(ap->a_vp, 0);
+	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
@@ -1001,7 +989,7 @@
 		ump = NULL;
 		vp = uvp = lvp = NULLVP;
 		/* search vnode */
-		VOP_UNLOCK(ap->a_vp, 0);
+		VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 		error = unionfs_relookup(udvp, &vp, cnp, &cn, td,
 		    cnp->cn_nameptr, strlen(cnp->cn_nameptr), DELETE);
 		if (error != 0 && error != ENOENT) {
@@ -1204,7 +1192,7 @@
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td);
-			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
@@ -1212,7 +1200,7 @@
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td);
-			VOP_UNLOCK(fvp, 0);
+			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
@@ -1269,13 +1257,13 @@
 		if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0)
 			goto unionfs_rename_abort;
 		error = unionfs_relookup_for_delete(fdvp, fcnp, td);
-		VOP_UNLOCK(fdvp, 0);
+		VOP_UNLOCK(fdvp, LK_RELEASE);
 		if (error != 0)
 			goto unionfs_rename_abort;
 
 		/* Locke of tvp is canceled in order to avoid recursive lock. */
 		if (tvp != NULLVP && tvp != tdvp)
-			VOP_UNLOCK(tvp, 0);
+			VOP_UNLOCK(tvp, LK_RELEASE);
 		error = unionfs_relookup_for_rename(tdvp, tcnp, td);
 		if (tvp != NULLVP && tvp != tdvp)
 			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
@@ -1293,11 +1281,11 @@
 	}
 
 	if (ltdvp != NULLVP)
-		VOP_UNLOCK(ltdvp, 0);
+		VOP_UNLOCK(ltdvp, LK_RELEASE);
 	if (tdvp != rtdvp)
 		vrele(tdvp);
 	if (ltvp != NULLVP)
-		VOP_UNLOCK(ltvp, 0);
+		VOP_UNLOCK(ltvp, LK_RELEASE);
 	if (tvp != rtvp && tvp != NULLVP) {
 		if (rtvp == NULLVP)
 			vput(tvp);
@@ -1371,7 +1359,7 @@
 		}
 
 		if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) {
-			VOP_UNLOCK(uvp, 0);
+			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
@@ -1427,7 +1415,9 @@
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
-		error = VOP_RMDIR(udvp, uvp, cnp);
+		error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td);
+		if (!error)
+			error = VOP_RMDIR(udvp, uvp, cnp);
 	}
 	else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path);
@@ -1467,7 +1457,7 @@
 	if (udvp != NULLVP) {
 		error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target);
 		if (error == 0) {
-			VOP_UNLOCK(uvp, 0);
+			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
@@ -1487,9 +1477,11 @@
 	int		error;
 	int		eofflag;
 	int		locked;
+	int		uio_offset_bk;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct uio     *uio;
+	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
@@ -1505,17 +1497,42 @@
 	error = 0;
 	eofflag = 0;
 	locked = 0;
-	unp = VTOUNIONFS(ap->a_vp);
+	uio_offset_bk = 0;
 	uio = ap->a_uio;
-	uvp = unp->un_uppervp;
-	lvp = unp->un_lowervp;
+	uvp = NULLVP;
+	lvp = NULLVP;
 	td = uio->uio_td;
 	ncookies_bk = 0;
 	cookies_bk = NULL;
 
-	if (ap->a_vp->v_type != VDIR)
+	vp = ap->a_vp;
+	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
+	/* check the open count. unionfs needs to open before readdir. */
+	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		locked = 1;
+	}
+	unp = VTOUNIONFS(vp);
+	if (unp == NULL)
+		error = EBADF;
+	else {
+		uvp = unp->un_uppervp;
+		lvp = unp->un_lowervp;
+		unionfs_get_node_status(unp, td, &unsp);
+		if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
+			(lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
+			unionfs_tryrem_node_status(unp, unsp);
+			error = EBADF;
+		}
+	}
+	if (locked)
+		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
+	if (error != 0)
+		goto unionfs_readdir_exit;
+
 	/* check opaque */
 	if (uvp != NULLVP && lvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, &va, ap->a_cred)) != 0)
@@ -1524,22 +1541,6 @@
 			lvp = NULLVP;
 	}
 
-	/* check the open count. unionfs needs to open before readdir. */
-	if (VOP_ISLOCKED(ap->a_vp) != LK_EXCLUSIVE) {
-		vn_lock(ap->a_vp, LK_UPGRADE | LK_RETRY);
-		locked = 1;
-	}
-	unionfs_get_node_status(unp, td, &unsp);
-	if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
-	    (lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
-		unionfs_tryrem_node_status(unp, unsp);
-		error = EBADF;
-	}
-	if (locked == 1)
-		vn_lock(ap->a_vp, LK_DOWNGRADE | LK_RETRY);
-	if (error != 0)
-		goto unionfs_readdir_exit;
-
 	/* upper only */
 	if (uvp != NULLVP && lvp == NULLVP) {
 		error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag,
@@ -1576,7 +1577,7 @@
 		unsp->uns_readdir_status = 1;
 
 		/*
-		 * ufs(and other fs) needs size of uio_resid larger than
+		 * UFS(and other FS) needs size of uio_resid larger than
 		 * DIRBLKSIZ.
 		 * size of DIRBLKSIZ equals DEV_BSIZE.
 		 * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h)
@@ -1585,7 +1586,7 @@
 			goto unionfs_readdir_exit;
 
 		/*
-		 * backup cookies
+		 * Backup cookies.
 		 * It prepares to readdir in lower.
 		 */
 		if (ap->a_ncookies != NULL) {
@@ -1601,6 +1602,11 @@
 	/* initialize for readdir in lower */
 	if (unsp->uns_readdir_status == 1) {
 		unsp->uns_readdir_status = 2;
+		/*
+		 * Backup uio_offset. See the comment after the
+		 * VOP_READDIR call on the lower layer.
+		 */
+		uio_offset_bk = uio->uio_offset;
 		uio->uio_offset = 0;
 	}
 
@@ -1612,6 +1618,19 @@
 	error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 			    ap->a_ncookies, ap->a_cookies);
 
+	/*
+	 * We can't return an uio_offset of 0: this would trigger an
+	 * infinite loop, because the next call to unionfs_readdir would
+	 * always restart with the upper layer (uio_offset == 0) and
+	 * always return some data.
+	 *
+	 * This happens when the lower layer root directory is removed.
+	 * (A root directory deleting of unionfs should not be permitted.
+	 *  But current VFS can not do it.)
+	 */
+	if (uio->uio_offset == 0)
+		uio->uio_offset = uio_offset_bk;
+
 	if (cookies_bk != NULL) {
 		/* merge cookies */
 		int		size;
@@ -1623,7 +1642,7 @@
 		pos = newcookies;
 
 		memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long));
-		pos += ncookies_bk * sizeof(u_long);
+		pos += ncookies_bk;
 		memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long));
 		free(cookies_bk, M_TEMP);
 		free(*(ap->a_cookies), M_TEMP);
@@ -1702,7 +1721,7 @@
 unionfs_inactive(struct vop_inactive_args *ap)
 {
 	ap->a_vp->v_object = NULL;
-	vrecycle(ap->a_vp, ap->a_td);
+	vrecycle(ap->a_vp);
 	return (0);
 }
 
@@ -1743,18 +1762,66 @@
 }
 
 static int
-unionfs_get_llt_revlock(int flags)
+unionfs_islocked(struct vop_islocked_args *ap)
 {
-	int count;
-
-	flags &= LK_TYPE_MASK;
-	for (count = 0; un_llt[count].lock != 0; count++) {
-		if (flags == un_llt[count].lock) {
-			return un_llt[count].revlock;
-		}
+	struct unionfs_node *unp;
+
+	KASSERT_UNIONFS_VNODE(ap->a_vp);
+
+	unp = VTOUNIONFS(ap->a_vp);
+	if (unp == NULL)
+		return (vop_stdislocked(ap));
+
+	if (unp->un_uppervp != NULLVP)
+		return (VOP_ISLOCKED(unp->un_uppervp));
+	if (unp->un_lowervp != NULLVP)
+		return (VOP_ISLOCKED(unp->un_lowervp));
+	return (vop_stdislocked(ap));
+}
+
+static int
+unionfs_get_llt_revlock(struct vnode *vp, int flags)
+{
+	int revlock;
+
+	revlock = 0;
+
+	switch (flags & LK_TYPE_MASK) {
+	case LK_SHARED:
+		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+			revlock = LK_UPGRADE;
+		else
+			revlock = LK_RELEASE;
+		break;
+	case LK_EXCLUSIVE:
+	case LK_UPGRADE:
+		revlock = LK_RELEASE;
+		break;
+	case LK_DOWNGRADE:
+		revlock = LK_UPGRADE;
+		break;
+	default:
+		break;
 	}
 
-	return 0;
+	return (revlock);
+}
+
+/*
+ * The state of an acquired lock is adjusted similarly to
+ * the time of error generating. 
+ * flags: LK_RELEASE or LK_UPGRADE
+ */
+static void
+unionfs_revlock(struct vnode *vp, int flags)
+{
+	if (flags & LK_RELEASE)
+		VOP_UNLOCK(vp, flags);
+	else {
+		/* UPGRADE */
+		if (vn_lock(vp, flags) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	}
 }
 
 static int
@@ -1763,6 +1830,7 @@
 	int		error;
 	int		flags;
 	int		revlock;
+	int		interlock;
 	int		uhold;
 	struct mount   *mp;
 	struct unionfs_mount *ump;
@@ -1774,15 +1842,13 @@
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
+	interlock = 1;
 	uhold = 0;
 	flags = ap->a_flags;
 	vp = ap->a_vp;
 
 	if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK))
-		return (VOP_UNLOCK(vp, flags));
-
-	if ((revlock = unionfs_get_llt_revlock(flags)) == 0)
-		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
+		return (VOP_UNLOCK(vp, flags | LK_RELEASE));
 
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
@@ -1798,6 +1864,9 @@
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
+	if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0)
+		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
+
 	if ((mp->mnt_kern_flag & MNTK_MPSAFE) != 0 &&
 	    (vp->v_iflag & VI_OWEINACT) != 0)
 		flags |= LK_NOWAIT;
@@ -1811,6 +1880,23 @@
 		flags |= LK_CANRECURSE;
 
 	if (lvp != NULLVP) {
+		if (uvp != NULLVP && flags & LK_UPGRADE) {
+			/* Share Lock is once released and a deadlock is avoided.  */
+			VI_LOCK_FLAGS(uvp, MTX_DUPOK);
+			vholdl(uvp);
+			uhold = 1;
+			VI_UNLOCK(vp);
+			VOP_UNLOCK(uvp, LK_RELEASE | LK_INTERLOCK);
+			VI_LOCK(vp);
+			unp = VTOUNIONFS(vp);
+			if (unp == NULL) {
+				/* vnode is released. */
+				VI_UNLOCK(vp);
+				VOP_UNLOCK(lvp, LK_RELEASE);
+				vdrop(uvp);
+				return (EBUSY);
+			}
+		}
 		VI_LOCK_FLAGS(lvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(lvp);
@@ -1823,19 +1909,28 @@
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
+			/* vnode is released. */
 			VI_UNLOCK(vp);
 			if (error == 0)
-				VOP_UNLOCK(lvp, 0);
+				VOP_UNLOCK(lvp, LK_RELEASE);
 			vdrop(lvp);
+			if (uhold != 0)
+				vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
 	}
 
 	if (error == 0 && uvp != NULLVP) {
+		if (uhold && flags & LK_UPGRADE) {
+			flags &= ~LK_TYPE_MASK;
+			flags |= LK_EXCLUSIVE;
+		}
 		VI_LOCK_FLAGS(uvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
-		vholdl(uvp);
-		uhold = 1;
+		if (uhold == 0) {
+			vholdl(uvp);
+			uhold = 1;
+		}
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
@@ -1845,30 +1940,27 @@
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
+			/* vnode is released. */
 			VI_UNLOCK(vp);
-			if (error == 0) {
-				VOP_UNLOCK(uvp, 0);
-				if (lvp != NULLVP)
-					VOP_UNLOCK(lvp, 0);
+			if (error == 0)
+				VOP_UNLOCK(uvp, LK_RELEASE);
+			vdrop(uvp);
+			if (lvp != NULLVP) {
+				VOP_UNLOCK(lvp, LK_RELEASE);
+				vdrop(lvp);
 			}
-			if (lvp != NULLVP)
-				vdrop(lvp);
-			vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
-
 		if (error != 0 && lvp != NULLVP) {
+			/* rollback */
 			VI_UNLOCK(vp);
-			if ((revlock & LK_TYPE_MASK) == LK_RELEASE)
-				VOP_UNLOCK(lvp, revlock);
-			else
-				vn_lock(lvp, revlock | LK_RETRY);
-			goto unionfs_lock_abort;
+			unionfs_revlock(lvp, revlock);
+			interlock = 0;
 		}
 	}
 
-	VI_UNLOCK(vp);
-unionfs_lock_abort:
+	if (interlock)
+		VI_UNLOCK(vp);
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
@@ -2013,7 +2105,7 @@
 			unionfs_tryrem_node_status(unp, unsp);
 	}
 
-	VOP_UNLOCK(vp, 0);
+	VOP_UNLOCK(vp, LK_RELEASE);
 
 	error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags);
 
@@ -2022,7 +2114,7 @@
 	return error;
 
 unionfs_advlock_abort:
-	VOP_UNLOCK(vp, 0);
+	VOP_UNLOCK(vp, LK_RELEASE);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
@@ -2150,7 +2242,8 @@
 	error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
-		vn_lock(vp, LK_UPGRADE | LK_RETRY);
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag |= UNIONFS_OPENEXTU;
 		else
@@ -2186,7 +2279,8 @@
 	error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
-		vn_lock(vp, LK_UPGRADE | LK_RETRY);
+		if (vn_lock(vp, LK_UPGRADE) != 0)
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag &= ~UNIONFS_OPENEXTU;
 		else
@@ -2435,6 +2529,7 @@
 	.vop_getextattr =	unionfs_getextattr,
 	.vop_getwritemount =	unionfs_getwritemount,
 	.vop_inactive =		unionfs_inactive,
+	.vop_islocked =		unionfs_islocked,
 	.vop_ioctl =		unionfs_ioctl,
 	.vop_link =		unionfs_link,
 	.vop_listextattr =	unionfs_listextattr,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_machdep.c
--- a/head/sys/i386/acpica/acpi_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/acpica/acpi_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/acpica/acpi_machdep.c 235556 2012-05-17 17:58:53Z jhb $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -44,8 +44,6 @@
 
 #include <machine/nexusvar.h>
 
-SYSCTL_DECL(_debug_acpi);
-
 uint32_t acpi_resume_beep;
 TUNABLE_INT("debug.acpi.resume_beep", &acpi_resume_beep);
 SYSCTL_UINT(_debug_acpi, OID_AUTO, resume_beep, CTLFLAG_RW, &acpi_resume_beep,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_wakecode.S
--- a/head/sys/i386/acpica/acpi_wakecode.S	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/acpica/acpi_wakecode.S	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,8 @@
 /*-
  * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2001-2012 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -24,11 +26,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/acpica/acpi_wakecode.S 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #include <machine/asmacros.h>
+#include <machine/ppireg.h>
 #include <machine/specialreg.h>
+#include <machine/timerreg.h>
 
 #include "assym.s"
 
@@ -39,221 +43,166 @@
  * Depending on the previous sleep state, we may need to initialize more
  * of the system (i.e., S3 suspend-to-RAM vs. S4 suspend-to-disk).
  */
-	.align 4
+
+	.data				/* So we can modify it */
+
+	ALIGN_TEXT
 	.code16
-wakeup_16:
-	nop
-	cli
-	cld
-
+wakeup_start:
 	/*
 	 * Set up segment registers for real mode, a small stack for
 	 * any calls we make, and clear any flags.
 	 */
-	movw	%cs,%ax
-	movw	%ax,%ds
-	movw	%ax,%ss
-	movw	$PAGE_SIZE,%sp
-	pushl	$0
-	popfl
+	cli				/* make sure no interrupts */
+	mov	%cs, %ax		/* copy %cs to %ds.  Remember these */
+	mov	%ax, %ds		/* are offsets rather than selectors */
+	mov	%ax, %ss
+	movw	$PAGE_SIZE, %sp
+	xorw	%ax, %ax
+	pushw	%ax
+	popfw
 
 	/* To debug resume hangs, beep the speaker if the user requested. */
-	cmpl	$1,resume_beep
-	jne	nobeep
-	movb	$0xc0,%al
-	outb	%al,$0x42
-	movb	$0x04,%al
-	outb	%al,$0x42
-	inb	$0x61,%al
-	orb	$0x3,%al
-	outb	%al,$0x61
-nobeep:
+	testb	$~0, resume_beep - wakeup_start
+	jz	1f
+	movb	$0, resume_beep - wakeup_start
+
+	/* Set PIC timer2 to beep. */
+	movb	$(TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT), %al
+	outb	%al, $TIMER_MODE
+
+	/* Turn on speaker. */
+	inb	$IO_PPI, %al
+	orb	$PIT_SPKR, %al
+	outb	%al, $IO_PPI
+
+	/* Set frequency. */
+	movw	$0x4c0, %ax
+	outb	%al, $TIMER_CNTR2
+	shrw	$8, %ax
+	outb	%al, $TIMER_CNTR2
+1:
 
 	/* Re-initialize video BIOS if the reset_video tunable is set. */
-	cmpl	$1,reset_video
-	jne	nobiosreset
-	lcall	$0xc000,$3
+	testb	$~0, reset_video - wakeup_start
+	jz	1f
+	movb	$0, reset_video - wakeup_start
+	lcall	$0xc000, $3
 
-	/*
-	 * Set up segment registers for real mode again in case the
-	 * previous BIOS call clobbers them.
-	 */
-	movw	%cs,%ax
-	movw	%ax,%ds
-	movw	%ax,%ss
-nobiosreset:
+	/* When we reach here, int 0x10 should be ready.  Hide cursor. */
+	movb	$0x01, %ah
+	movb	$0x20, %ch
+	int	$0x10
 
-	/* Load GDT for real mode.  Use 32 bit prefix for addresses >16 MB. */
-	lgdtl	physical_gdt
-
-	/* Restore CR2, CR3 and CR4 */
-	movl	previous_cr2,%eax
-	movl	%eax,%cr2
-	movl	previous_cr3,%eax
-	movl	%eax,%cr3
-	movl	previous_cr4,%eax
-	movl	%eax,%cr4
-
-	/* Transfer some values to protected mode with an inline stack */
-#define NVALUES	9
-#define TRANSFER_STACK32(val, idx)	\
-	movl	val,%eax;		\
-	movl	%eax,wakeup_32stack+(idx+1)+(idx*4)
-
-	TRANSFER_STACK32(previous_ss,		(NVALUES - 9))
-	TRANSFER_STACK32(previous_fs,		(NVALUES - 8))
-	TRANSFER_STACK32(previous_ds,		(NVALUES - 7))
-	TRANSFER_STACK32(physical_gdt+2,	(NVALUES - 6))
-	TRANSFER_STACK32(where_to_recover,	(NVALUES - 5))
-	TRANSFER_STACK32(previous_idt+2,	(NVALUES - 4))
-	TRANSFER_STACK32(previous_ldt,		(NVALUES - 3))
-	TRANSFER_STACK32(previous_gdt+2,	(NVALUES - 2))
-	TRANSFER_STACK32(previous_tr,		(NVALUES - 1))
-	TRANSFER_STACK32(previous_cr0,		(NVALUES - 0))
-
-	mov	physical_esp,%esi	/* to be used in 32bit code */
-
-	/* Enable protected mode */
-	movl	%cr0,%eax
-	orl	$(CR0_PE),%eax
-	movl	%eax,%cr0
-
-wakeup_sw32:
-	/* Switch to protected mode by intersegmental jump */
-	ljmpl	$KCSEL,$0x12345678	/* Code location, to be replaced */
-
-	/*
-	 * Now switched to protected mode without paging enabled.
-	 *	%esi: KERNEL stack pointer (physical address)
-	 */
-	.code32
-wakeup_32:
-	nop
-
-	/* Set up segment registers for protected mode */
-	movw	$KDSEL,%ax		/* KDSEL to segment registers */
-	movw	%ax,%ds
-	movw	%ax,%es
-	movw	%ax,%gs
-	movw	%ax,%ss
-	movw	$KPSEL,%ax		/* KPSEL to %fs */
-	movw	%ax,%fs
-	movl	%esi,%esp		/* physical address stack pointer */
-
-wakeup_32stack:
-	/* Operands are overwritten in 16 bit code by TRANSFER_STACK32 macro */
-	pushl	$0xabcdef09		/* ss + dummy */
-	pushl	$0xabcdef08		/* fs + gs */
-	pushl	$0xabcdef07		/* ds + es */
-	pushl	$0xabcdef06		/* gdt:base (physical address) */
-	pushl	$0xabcdef05		/* recover address */
-	pushl	$0xabcdef04		/* idt:base */
-	pushl	$0xabcdef03		/* ldt + idt:limit */
-	pushl	$0xabcdef02		/* gdt:base */
-	pushl	$0xabcdef01		/* TR + gdt:limit */
-	pushl	$0xabcdef00		/* CR0 */
-
-	movl	%esp,%ebp
-#define CR0_REGISTER		0(%ebp)
-#define TASK_REGISTER		4(%ebp)
-#define PREVIOUS_GDT		6(%ebp)
-#define PREVIOUS_LDT		12(%ebp)
-#define PREVIOUS_IDT		14(%ebp)
-#define RECOVER_ADDR		20(%ebp)
-#define PHYSICAL_GDT_BASE	24(%ebp)
-#define PREVIOUS_DS		28(%ebp)
-#define PREVIOUS_ES		30(%ebp)
-#define PREVIOUS_FS		32(%ebp)
-#define PREVIOUS_GS		34(%ebp)
-#define PREVIOUS_SS		36(%ebp)
-
-	/* Fixup TSS type field */
-#define TSS_TYPEFIX_MASK	0xf9
-	xorl	%esi,%esi
-	movl	PHYSICAL_GDT_BASE,%ebx
-	movw	TASK_REGISTER,%si
-	leal	(%ebx,%esi),%eax	/* get TSS segment descriptor */
-	andb	$TSS_TYPEFIX_MASK,5(%eax)
-
-	/* Prepare to return to sleep/wakeup code point */
-	lgdtl	PREVIOUS_GDT
-	lidtl	PREVIOUS_IDT
-
-	/* Pack values from the GDT to be loaded into segment registers. */
-	movl	PREVIOUS_DS,%ebx
-	movl	PREVIOUS_FS,%ecx
-	movl	PREVIOUS_SS,%edx
-	movw	TASK_REGISTER,%si
-	shll	$16,%esi
-	movw	PREVIOUS_LDT,%si
-	movl	RECOVER_ADDR,%edi
-
-	/* Enable paging and etc. */
-	movl	CR0_REGISTER,%eax
-	movl	%eax,%cr0
-
-	/* Flush the prefetch queue */
-	jmp	1f
-1:	jmp	1f
+	/* Re-start in case the previous BIOS call clobbers them. */
+	jmp	wakeup_start
 1:
 
 	/*
-	 * Now we are in kernel virtual memory addressing with the following
-	 * original register values:
-	 *	%ebx: ds + es
-	 *	%ecx: fs + gs
-	 *	%edx: ss + dummy
-	 *	%esi: LDTR + TR
-	 *	%edi: recover address
-	 * We'll load these back into the segment registers now.
+	 * Find relocation base and patch the gdt descript and ljmp targets
 	 */
-	nop
+	xorl	%ebx, %ebx
+	mov	%cs, %bx
+	sall	$4, %ebx		/* %ebx is now our relocation base */
 
-	movl	%esi,%eax		/* LDTR + TR */
-	lldt	%ax			/* load LDT register */
-	shrl	$16,%eax
-	ltr	%ax			/* load task register */
+	/*
+	 * Load the descriptor table pointer.  We'll need it when running
+	 * in 16-bit protected mode.
+	 */
+	lgdtl	bootgdtdesc - wakeup_start
 
-	/* Restore segment registers */
-	movl	%ebx,%eax		/* ds + es */
-	movw	%ax,%ds
-	shrl	$16,%eax
-	movw	%ax,%es
-	movl	%ecx,%eax		/* fs + gs */
-	movw	%ax,%fs
-	shrl	$16,%eax
-	movw	%ax,%gs
-	movl	%edx,%eax		/* ss */
-	movw	%ax,%ss
+	/* Enable protected mode */
+	movl	$CR0_PE, %eax
+	mov	%eax, %cr0
 
-	/* Jump to acpi_restorecpu() */
-	jmp	*%edi
+	/*
+	 * Now execute a far jump to turn on protected mode.  This
+	 * causes the segment registers to turn into selectors and causes
+	 * %cs to be loaded from the gdt.
+	 *
+	 * The following instruction is:
+	 * ljmpl $bootcode32 - bootgdt, $wakeup_32 - wakeup_start
+	 * but gas cannot assemble that.  And besides, we patch the targets
+	 * in early startup and its a little clearer what we are patching.
+	 */
+wakeup_sw32:
+	.byte	0x66			/* size override to 32 bits */
+	.byte	0xea			/* opcode for far jump */
+	.long	wakeup_32 - wakeup_start /* offset in segment */
+	.word	bootcode32 - bootgdt	/* index in gdt for 32 bit code */
 
-/* used in real mode */
-physical_gdt:		.word 0
-			.long 0
-physical_esp:		.long 0
-previous_cr2:		.long 0
-previous_cr3:		.long 0
-previous_cr4:		.long 0
-resume_beep:		.long 0
-reset_video:		.long 0
+	/*
+	 * At this point, we are running in 32 bit legacy protected mode.
+	 */
+	ALIGN_TEXT
+	.code32
+wakeup_32:
 
-/*
- * Transfer from real mode to protected mode.  The order of these variables
- * is very important, DO NOT INSERT OR CHANGE unless you know why.
- */
-previous_cr0:		.long 0
-previous_tr:		.word 0
-previous_gdt:		.word 0
-			.long 0
-previous_ldt:		.word 0
-previous_idt:		.word 0
-			.long 0
-where_to_recover:	.long 0
-previous_ds:		.word 0
-previous_es:		.word 0
-previous_fs:		.word 0
-previous_gs:		.word 0
-previous_ss:		.word 0
-dummy:			.word 0
+	mov	$bootdata32 - bootgdt, %eax
+	mov	%ax, %ds
+
+	/* Get PCB and return address. */
+	movl	wakeup_pcb - wakeup_start(%ebx), %ecx
+	movl	wakeup_ret - wakeup_start(%ebx), %edx
+
+	/* Restore CR4 and CR3. */
+	movl	wakeup_cr4 - wakeup_start(%ebx), %eax
+	mov	%eax, %cr4
+	movl	wakeup_cr3 - wakeup_start(%ebx), %eax
+	mov	%eax, %cr3
+
+	/*
+	 * Finally, switch to long bit mode by enabling paging.  We have
+	 * to be very careful here because all the segmentation disappears
+	 * out from underneath us.  The spec says we can depend on the
+	 * subsequent pipelined branch to execute, but *only if* everthing
+	 * is still identity mapped.  If any mappings change, the pipeline
+	 * will flush.
+	 */
+	mov	%cr0, %eax
+	orl	$CR0_PG, %eax
+	mov	%eax, %cr0
+
+	jmp	1f
+1:
+	/* Jump to return address. */
+	jmp	*%edx
+
+	.data
+
+resume_beep:
+	.byte	0
+reset_video:
+	.byte	0
+
+	ALIGN_DATA
+bootgdt:
+	.long	0x00000000
+	.long	0x00000000
+
+bootcode32:
+	.long	0x0000ffff
+	.long	0x00cf9b00
+
+bootdata32:
+	.long	0x0000ffff
+	.long	0x00cf9300
+bootgdtend:
+
+bootgdtdesc:
+	.word	bootgdtend - bootgdt	/* Length */
+	.long	bootgdt - wakeup_start	/* Offset plus %ds << 4 */
+
+	ALIGN_DATA
+wakeup_cr4:
+	.long	0
+wakeup_cr3:
+	.long	0
+wakeup_pcb:
+	.long	0
+wakeup_ret:
+	.long	0
+wakeup_gdt:		/* not used */
+	.word	0
+	.long	0
+dummy:
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/acpica/acpi_wakeup.c
--- a/head/sys/i386/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-/*-
- * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
- * Copyright (c) 2001 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/acpica/acpi_wakeup.c 233250 2012-03-20 21:37:52Z jkim $");
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/kernel.h>
-#include <sys/bus.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/memrange.h>
-#include <sys/proc.h>
-#include <sys/sysctl.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-#include <vm/vm_object.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-
-#include <machine/bus.h>
-#include <machine/cpufunc.h>
-#include <machine/intr_machdep.h>
-#include <x86/mca.h>
-#include <machine/segments.h>
-
-#include <contrib/dev/acpica/include/acpi.h>
-
-#include <dev/acpica/acpivar.h>
-
-#include "acpi_wakecode.h"
-#include "acpi_wakedata.h"
-
-/* Make sure the code is less than one page and leave room for the stack. */
-CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
-
-#ifndef _SYS_CDEFS_H_
-#error this file needs sys/cdefs.h as a prerequisite
-#endif
-
-extern uint32_t	acpi_resume_beep;
-extern uint32_t	acpi_reset_video;
-extern void	initializecpu(void);
-
-static struct region_descriptor __used	saved_idt, saved_gdt;
-static struct region_descriptor	*p_gdt;
-static uint16_t __used 	saved_ldt;
-
-static uint32_t	__used	r_eax, r_ebx, r_ecx, r_edx, r_ebp, r_esi, r_edi,
-			r_efl, r_cr0, r_cr2, r_cr3, r_cr4, ret_addr;
-
-static uint16_t	__used	r_cs, r_ds, r_es, r_fs, r_gs, r_ss, r_tr;
-static uint32_t	__used	r_esp;
-
-static void		acpi_printcpu(void);
-static void		acpi_realmodeinst(void *arg, bus_dma_segment_t *segs,
-					  int nsegs, int error);
-static void		acpi_alloc_wakeup_handler(void);
-
-/* XXX shut gcc up */
-extern int		acpi_savecpu(void);
-extern int		acpi_restorecpu(void);
-
-#ifdef __GNUCLIKE_ASM
-__asm__("				\n\
-	.text				\n\
-	.p2align 2, 0x90		\n\
-	.type acpi_restorecpu, @function\n\
-acpi_restorecpu:			\n\
-	.align 4			\n\
-	movl	r_eax,%eax		\n\
-	movl	r_ebx,%ebx		\n\
-	movl	r_ecx,%ecx		\n\
-	movl	r_edx,%edx		\n\
-	movl	r_ebp,%ebp		\n\
-	movl	r_esi,%esi		\n\
-	movl	r_edi,%edi		\n\
-	movl	r_esp,%esp		\n\
-					\n\
-	pushl	r_efl			\n\
-	popfl				\n\
-					\n\
-	movl	ret_addr,%eax		\n\
-	movl	%eax,(%esp)		\n\
-	xorl	%eax,%eax		\n\
-	ret				\n\
-					\n\
-	.text				\n\
-	.p2align 2, 0x90		\n\
-	.type acpi_savecpu, @function	\n\
-acpi_savecpu:				\n\
-	movw	%cs,r_cs		\n\
-	movw	%ds,r_ds		\n\
-	movw	%es,r_es		\n\
-	movw	%fs,r_fs		\n\
-	movw	%gs,r_gs		\n\
-	movw	%ss,r_ss		\n\
-					\n\
-	movl	%eax,r_eax		\n\
-	movl	%ebx,r_ebx		\n\
-	movl	%ecx,r_ecx		\n\
-	movl	%edx,r_edx		\n\
-	movl	%ebp,r_ebp		\n\
-	movl	%esi,r_esi		\n\
-	movl	%edi,r_edi		\n\
-					\n\
-	movl	%cr0,%eax		\n\
-	movl	%eax,r_cr0		\n\
-	movl	%cr2,%eax		\n\
-	movl	%eax,r_cr2		\n\
-	movl	%cr3,%eax		\n\
-	movl	%eax,r_cr3		\n\
-	movl	%cr4,%eax		\n\
-	movl	%eax,r_cr4		\n\
-					\n\
-	pushfl				\n\
-	popl	r_efl			\n\
-					\n\
-	movl	%esp,r_esp		\n\
-					\n\
-	sgdt	saved_gdt		\n\
-	sidt	saved_idt		\n\
-	sldt	saved_ldt		\n\
-	str	r_tr			\n\
-					\n\
-	movl	(%esp),%eax		\n\
-	movl	%eax,ret_addr		\n\
-	movl	$1,%eax			\n\
-	ret				\n\
-");
-#endif /* __GNUCLIKE_ASM */
-
-static void
-acpi_printcpu(void)
-{
-	printf("======== acpi_printcpu() debug dump ========\n");
-	printf("gdt[%04x:%08x] idt[%04x:%08x] ldt[%04x] tr[%04x] efl[%08x]\n",
-		saved_gdt.rd_limit, saved_gdt.rd_base,
-		saved_idt.rd_limit, saved_idt.rd_base,
-		saved_ldt, r_tr, r_efl);
-	printf("eax[%08x] ebx[%08x] ecx[%08x] edx[%08x]\n",
-		r_eax, r_ebx, r_ecx, r_edx);
-	printf("esi[%08x] edi[%08x] ebp[%08x] esp[%08x]\n",
-		r_esi, r_edi, r_ebp, r_esp);
-	printf("cr0[%08x] cr2[%08x] cr3[%08x] cr4[%08x]\n",
-		r_cr0, r_cr2, r_cr3, r_cr4);
-	printf("cs[%04x] ds[%04x] es[%04x] fs[%04x] gs[%04x] ss[%04x]\n",
-		r_cs, r_ds, r_es, r_fs, r_gs, r_ss);
-}
-
-#define WAKECODE_FIXUP(offset, type, val) do	{		\
-	type	*addr;						\
-	addr = (type *)(sc->acpi_wakeaddr + offset);		\
-	*addr = val;						\
-} while (0)
-
-#define WAKECODE_BCOPY(offset, type, val) do	{		\
-	void	*addr;						\
-	addr = (void *)(sc->acpi_wakeaddr + offset);		\
-	bcopy(&(val), addr, sizeof(type));			\
-} while (0)
-
-/* Turn off bits 1&2 of the PIT, stopping the beep. */
-static void
-acpi_stop_beep(void *arg)
-{
-	outb(0x61, inb(0x61) & ~0x3);
-}
-
-int
-acpi_sleep_machdep(struct acpi_softc *sc, int state)
-{
-	ACPI_STATUS		status;
-	struct pmap		*pm;
-	int			ret;
-	uint32_t		cr3;
-	u_long			ef;
-
-	ret = -1;
-	if (sc->acpi_wakeaddr == 0)
-		return (ret);
-
-	AcpiSetFirmwareWakingVector(sc->acpi_wakephys);
-
-	ef = intr_disable();
-	intr_suspend();
-
-	/*
-	 * Temporarily switch to the kernel pmap because it provides an
-	 * identity mapping (setup at boot) for the low physical memory
-	 * region containing the wakeup code.
-	 */
-	pm = kernel_pmap;
-	cr3 = rcr3();
-#ifdef PAE
-	load_cr3(vtophys(pm->pm_pdpt));
-#else
-	load_cr3(vtophys(pm->pm_pdir));
-#endif
-
-	ret_addr = 0;
-	if (acpi_savecpu()) {
-		/* Execute Sleep */
-
-		p_gdt = (struct region_descriptor *)
-				(sc->acpi_wakeaddr + physical_gdt);
-		p_gdt->rd_limit = saved_gdt.rd_limit;
-		p_gdt->rd_base = vtophys(saved_gdt.rd_base);
-
-		WAKECODE_FIXUP(physical_esp, uint32_t, vtophys(r_esp));
-		WAKECODE_FIXUP(previous_cr0, uint32_t, r_cr0);
-		WAKECODE_FIXUP(previous_cr2, uint32_t, r_cr2);
-		WAKECODE_FIXUP(previous_cr3, uint32_t, r_cr3);
-		WAKECODE_FIXUP(previous_cr4, uint32_t, r_cr4);
-
-		WAKECODE_FIXUP(resume_beep, uint32_t, acpi_resume_beep);
-		WAKECODE_FIXUP(reset_video, uint32_t, acpi_reset_video);
-
-		WAKECODE_FIXUP(previous_tr,  uint16_t, r_tr);
-		WAKECODE_BCOPY(previous_gdt, struct region_descriptor, saved_gdt);
-		WAKECODE_FIXUP(previous_ldt, uint16_t, saved_ldt);
-		WAKECODE_BCOPY(previous_idt, struct region_descriptor, saved_idt);
-
-		WAKECODE_FIXUP(where_to_recover, void *, acpi_restorecpu);
-
-		WAKECODE_FIXUP(previous_ds,  uint16_t, r_ds);
-		WAKECODE_FIXUP(previous_es,  uint16_t, r_es);
-		WAKECODE_FIXUP(previous_fs,  uint16_t, r_fs);
-		WAKECODE_FIXUP(previous_gs,  uint16_t, r_gs);
-		WAKECODE_FIXUP(previous_ss,  uint16_t, r_ss);
-
-		if (bootverbose)
-			acpi_printcpu();
-
-		/* Call ACPICA to enter the desired sleep state */
-		if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
-			status = AcpiEnterSleepStateS4bios();
-		else
-			status = AcpiEnterSleepState(state, acpi_sleep_flags);
-
-		if (status != AE_OK) {
-			device_printf(sc->acpi_dev,
-				"AcpiEnterSleepState failed - %s\n",
-				AcpiFormatException(status));
-			goto out;
-		}
-
-		for (;;)
-			ia32_pause();
-	} else {
-		pmap_init_pat();
-		PCPU_SET(switchtime, 0);
-		PCPU_SET(switchticks, ticks);
-		if (bootverbose) {
-			acpi_savecpu();
-			acpi_printcpu();
-		}
-		ret = 0;
-	}
-
-out:
-	load_cr3(cr3);
-	mca_resume();
-	intr_resume();
-	intr_restore(ef);
-
-	if (ret == 0 && mem_range_softc.mr_op != NULL &&
-	    mem_range_softc.mr_op->reinit != NULL)
-		mem_range_softc.mr_op->reinit(&mem_range_softc);
-
-	/* If we beeped, turn it off after a delay. */
-	if (acpi_resume_beep)
-		timeout(acpi_stop_beep, NULL, 3 * hz);
-
-	return (ret);
-}
-
-static bus_dma_tag_t	acpi_waketag;
-static bus_dmamap_t	acpi_wakemap;
-static vm_offset_t	acpi_wakeaddr;
-
-static void
-acpi_alloc_wakeup_handler(void)
-{
-	void *wakeaddr;
-
-	if (!cold)
-		return;
-
-	/*
-	 * Specify the region for our wakeup code.  We want it in the low 1 MB
-	 * region, excluding video memory and above (0xa0000).  We ask for
-	 * it to be page-aligned, just to be safe.
-	 */
-	if (bus_dma_tag_create(/*parent*/ NULL,
-	    /*alignment*/ PAGE_SIZE, /*no boundary*/ 0,
-	    /*lowaddr*/ 0x9ffff, /*highaddr*/ BUS_SPACE_MAXADDR, NULL, NULL,
-	    /*maxsize*/ PAGE_SIZE, /*segments*/ 1, /*maxsegsize*/ PAGE_SIZE,
-	    0, busdma_lock_mutex, &Giant, &acpi_waketag) != 0) {
-		printf("acpi_alloc_wakeup_handler: can't create wake tag\n");
-		return;
-	}
-	if (bus_dmamem_alloc(acpi_waketag, &wakeaddr, BUS_DMA_NOWAIT,
-	    &acpi_wakemap) != 0) {
-		printf("acpi_alloc_wakeup_handler: can't alloc wake memory\n");
-		return;
-	}
-	acpi_wakeaddr = (vm_offset_t)wakeaddr;
-}
-
-SYSINIT(acpiwakeup, SI_SUB_KMEM, SI_ORDER_ANY, acpi_alloc_wakeup_handler, 0);
-
-static void
-acpi_realmodeinst(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
-{
-	struct acpi_softc *sc;
-	uint32_t *addr;
-
-	/* Overwrite the ljmp target with the real address */
-	sc = arg;
-	sc->acpi_wakephys = segs[0].ds_addr;
-	addr = (uint32_t *)&wakecode[wakeup_sw32 + 2];
-	*addr = sc->acpi_wakephys + wakeup_32;
-
-	/* Copy the wake code into our low page and save its physical addr. */
-	bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode));
-	if (bootverbose) {
-		device_printf(sc->acpi_dev, "wakeup code va %#x pa %#jx\n",
-		    acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys);
-	}
-}
-
-void
-acpi_install_wakeup_handler(struct acpi_softc *sc)
-{
-	if (acpi_wakeaddr == 0)
-		return;
-
-	sc->acpi_waketag = acpi_waketag;
-	sc->acpi_wakeaddr = acpi_wakeaddr;
-	sc->acpi_wakemap = acpi_wakemap;
-
-	bus_dmamap_load(sc->acpi_waketag, sc->acpi_wakemap,
-	    (void *)sc->acpi_wakeaddr, PAGE_SIZE, acpi_realmodeinst, sc, 0);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/conf/GENERIC
--- a/head/sys/i386/conf/GENERIC	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/conf/GENERIC	Wed Jul 25 16:40:53 2012 +0300
@@ -16,7 +16,7 @@
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
-# $FreeBSD: head/sys/i386/conf/GENERIC 234504 2012-04-20 21:37:42Z brooks $
+# $FreeBSD: head/sys/i386/conf/GENERIC 237263 2012-06-19 07:34:13Z np $
 
 cpu		I486_CPU
 cpu		I586_CPU
@@ -30,6 +30,7 @@
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
+options 	TCP_OFFLOAD		# TCP offload
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
@@ -46,6 +47,7 @@
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_PART_GPT		# GUID Partition Tables.
+options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD4		# Compatible with FreeBSD4
 options 	COMPAT_FREEBSD5		# Compatible with FreeBSD5
@@ -66,6 +68,7 @@
 options 	CAPABILITIES		# Capsicum capabilities
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
+options 	DDB_CTF			# Kernel ELF linker loads CTF data
 options 	INCLUDE_CONFIG_FILE     # Include this file in kernel
 
 # Debugging support.  Always need this:
@@ -75,7 +78,6 @@
 # For full debugger support use this instead:
 options 	DDB			# Support DDB.
 options 	GDB			# Support remote GDB.
-options 	DDB_CTF			# kernel ELF linker loads CTF data
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
@@ -284,6 +286,8 @@
 device		ath_pci		# Atheros pci/cardbus glue
 device		ath_hal		# pci/cardbus chip support
 options 	AH_SUPPORT_AR5416	# enable AR5416 tx/rx descriptors
+options 	AH_AR5416_INTERRUPT_MITIGATION	# AR5416 interrupt mitigation
+options 	ATH_ENABLE_11N	# Enable 802.11n support for AR5416 and later
 device		ath_rate_sample	# SampleRate tx rate control for ath
 #device		bwi		# Broadcom BCM430x/BCM431x wireless NICs.
 #device		bwn		# Broadcom BCM43xx wireless NICs.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/conf/XEN
--- a/head/sys/i386/conf/XEN	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/conf/XEN	Wed Jul 25 16:40:53 2012 +0300
@@ -1,13 +1,13 @@
 #
 # XEN -- Kernel configuration for i386 XEN DomU
 #
-# $FreeBSD: head/sys/i386/conf/XEN 233271 2012-03-21 08:38:42Z ed $
+# $FreeBSD: head/sys/i386/conf/XEN 237263 2012-06-19 07:34:13Z np $
 
 cpu		I686_CPU
 ident		XEN
 
 makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
-makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm hptmv ida malo mps mwl nve sound sym trm xfs"
+makeoptions	WITHOUT_MODULES="aha ahb amd cxgb dpt drm drm2 hptmv ida malo mps mwl nve rdma sound sym trm xfs"
 
 options 	SCHED_ULE		# ULE scheduler
 options 	PREEMPTION		# Enable kernel thread preemption
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/apic_vector.s
--- a/head/sys/i386/i386/apic_vector.s	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/apic_vector.s	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/i386/apic_vector.s 235683 2012-05-20 08:17:20Z iwasaki $
  */
 
 /*
@@ -334,6 +334,26 @@
 	iret
 
 /*
+ * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
+ */
+#ifndef XEN
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(cpususpend)
+	PUSH_FRAME
+	SET_KERNEL_SREGS
+	cld
+
+	movl	lapic, %eax
+	movl	$0, LA_EOI(%eax)	/* End Of Interrupt to APIC */
+
+	call	cpususpend_handler
+
+	POP_FRAME
+	jmp	doreti_iret
+#endif
+
+/*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/bios.c
--- a/head/sys/i386/i386/bios.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/bios.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/bios.c 236213 2012-05-29 01:48:06Z kevlo $");
 
 /*
  * Code for dealing with the BIOS in x86 PC systems.
@@ -372,9 +372,11 @@
 	    break;
 
 	default:
+	    va_end(ap);
 	    return (EINVAL);
 	}
     }
+    va_end(ap);
 
     if (flags & BIOSARGS_FLAG) {
 	if (arg_end - arg_start > ctob(16))
@@ -448,9 +450,11 @@
 	    break;
 
 	default:
+	    va_end(ap);
 	    return (EINVAL);
 	}
     }
+    va_end(ap);
 
     set_bios_selectors(&args->seg, flags);
     bioscall_vector.vec16.offset = (u_short)args->entry;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/elf_machdep.c
--- a/head/sys/i386/i386/elf_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/elf_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/elf_machdep.c 237435 2012-06-22 07:16:29Z kib $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -74,12 +74,15 @@
 	.sv_setregs	= exec_setregs,
 	.sv_fixlimit	= NULL,
 	.sv_maxssiz	= NULL,
-	.sv_flags	= SV_ABI_FREEBSD | SV_IA32 | SV_ILP32,
+	.sv_flags	= SV_ABI_FREEBSD | SV_IA32 | SV_ILP32 | SV_SHP,
 	.sv_set_syscall_retval = cpu_set_syscall_retval,
 	.sv_fetch_syscall_args = cpu_fetch_syscall_args,
 	.sv_syscallnames = syscallnames,
+	.sv_shared_page_base = SHAREDPAGE,
+	.sv_shared_page_len = PAGE_SIZE,
 	.sv_schedtail	= NULL,
 };
+INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec);
 
 static Elf32_Brandinfo freebsd_brand_info = {
 	.brand		= ELFOSABI_FREEBSD,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/genassym.c
--- a/head/sys/i386/i386/genassym.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/genassym.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/genassym.c 224187 2011-07-18 15:19:40Z attilio $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/genassym.c 235622 2012-05-18 18:55:58Z iwasaki $");
 
 #include "opt_apic.h"
 #include "opt_compat.h"
@@ -121,7 +121,10 @@
 ASSYM(KERNBASE, KERNBASE);
 ASSYM(KERNLOAD, KERNLOAD);
 ASSYM(MCLBYTES, MCLBYTES);
+ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0));
+ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
+ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4));
 ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi));
 ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi));
 ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp));
@@ -130,7 +133,11 @@
 ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip));
 ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0));
 
+ASSYM(PCB_DS, offsetof(struct pcb, pcb_ds));
+ASSYM(PCB_ES, offsetof(struct pcb, pcb_es));
+ASSYM(PCB_FS, offsetof(struct pcb, pcb_fs));
 ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs));
+ASSYM(PCB_SS, offsetof(struct pcb, pcb_ss));
 ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0));
 ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1));
 ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
@@ -143,6 +150,7 @@
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 
 ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd));
+ASSYM(PCB_GSD, offsetof(struct pcb, pcb_gsd));
 ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
@@ -152,6 +160,11 @@
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 ASSYM(PCB_VM86CALL, PCB_VM86CALL);
 
+ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt));
+ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt));
+ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt));
+ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr));
+
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/initcpu.c
--- a/head/sys/i386/i386/initcpu.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/initcpu.c	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/initcpu.c 230767 2012-01-30 07:56:00Z kib $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/initcpu.c 235622 2012-05-18 18:55:58Z iwasaki $");
 
 #include "opt_cpu.h"
 
@@ -48,7 +48,6 @@
 #define CPU_ENABLE_SSE
 #endif
 
-void initializecpu(void);
 #if defined(I586_CPU) && defined(CPU_WT_ALLOC)
 void	enable_K5_wt_alloc(void);
 void	enable_K6_wt_alloc(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/machdep.c
--- a/head/sys/i386/i386/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 234105 2012-04-10 16:08:46Z marius $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/machdep.c 238310 2012-07-09 20:42:08Z jhb $");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
@@ -75,6 +75,7 @@
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
@@ -180,7 +181,6 @@
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
-extern void initializecpu(void);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
@@ -248,6 +248,8 @@
 
 struct mtx icu_lock;
 
+struct mem_range_softc mem_range_softc;
+
 static void
 cpu_startup(dummy)
 	void *dummy;
@@ -337,12 +339,10 @@
 	cpu_setregs();
 #endif
 
-#ifdef SMP
 	/*
 	 * Add BSP as an interrupt target.
 	 */
 	intr_add_cpu(0);
-#endif
 }
 
 /*
@@ -472,7 +472,13 @@
 	}
 
 	regs->tf_esp = (int)fp;
-	regs->tf_eip = PS_STRINGS - szosigcode;
+	if (p->p_sysent->sv_sigcode_base != 0) {
+		regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
+		    szosigcode;
+	} else {
+		/* a.out sysentvec does not use shared page */
+		regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
+	}
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -599,7 +605,8 @@
 	}
 
 	regs->tf_esp = (int)sfp;
-	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
+	regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
+	    szfreebsd4_sigcode;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -750,7 +757,7 @@
 	}
 
 	regs->tf_esp = (int)sfp;
-	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
+	regs->tf_eip = p->p_sysent->sv_sigcode_base;
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
@@ -2178,7 +2185,7 @@
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 #ifndef XEN
-	int hasbrokenint12, i;
+	int hasbrokenint12, i, res;
 	u_int extmem;
 	struct vm86frame vmf;
 	struct vm86context vmc;
@@ -2263,7 +2270,8 @@
 	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 	vmc.npages = 0;
 	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
-	vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
+	res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
+	KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
 
 	vmf.vmf_ebx = 0;
 	do {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/mem.c
--- a/head/sys/i386/i386/mem.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/mem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/mem.c 238310 2012-07-09 20:42:08Z jhb $");
 
 /*
  * Memory special file
@@ -72,8 +72,6 @@
  */
 MALLOC_DEFINE(M_MEMDESC, "memdesc", "memory range descriptors");
 
-struct mem_range_softc mem_range_softc;
-
 static struct sx memsxlock;
 SX_SYSINIT(memsxlockinit, &memsxlock, "/dev/mem lock");
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/minidump_machdep.c
--- a/head/sys/i386/i386/minidump_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/minidump_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/minidump_machdep.c 221173 2011-04-28 16:02:05Z attilio $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/minidump_machdep.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_watchdog.h"
 
@@ -36,9 +36,7 @@
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/msgbuf.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <machine/atomic.h>
@@ -143,9 +141,9 @@
 			printf(" %lld", PG2MB(progress >> PAGE_SHIFT));
 			counter &= (1<<24) - 1;
 		}
-#ifdef SW_WATCHDOG
+
 		wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		if (ptr) {
 			error = dump_write(di, ptr, 0, dumplo, len);
 			if (error)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/mp_machdep.c
--- a/head/sys/i386/i386/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 234208 2012-04-13 07:18:19Z avg $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/mp_machdep.c 236938 2012-06-12 00:14:54Z iwasaki $");
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
@@ -146,6 +146,7 @@
 static void *dpcpu;
 
 struct pcb stoppcbs[MAXCPU];
+struct pcb **susppcbs = NULL;
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1;
@@ -587,6 +588,9 @@
 	setidt(IPI_STOP, IDTVEC(cpustop),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
+	/* Install an inter-CPU IPI for CPU suspend/resume */
+	setidt(IPI_SUSPEND, IDTVEC(cpususpend),
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
@@ -1077,6 +1081,60 @@
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
+	ipi_startup(apic_id, vector);
+
+	/* Wait up to 5 seconds for it to start. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (mp_naps > cpus)
+			return 1;	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return 0;		/* return FAILURE */
+}
+
+#ifdef COUNT_XINVLTLB_HITS
+u_int xhits_gbl[MAXCPU];
+u_int xhits_pg[MAXCPU];
+u_int xhits_rng[MAXCPU];
+static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
+    sizeof(xhits_gbl), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
+    sizeof(xhits_pg), "IU", "");
+SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
+    sizeof(xhits_rng), "IU", "");
+
+u_int ipi_global;
+u_int ipi_page;
+u_int ipi_range;
+u_int ipi_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
+    0, "");
+
+u_int ipi_masked_global;
+u_int ipi_masked_page;
+u_int ipi_masked_range;
+u_int ipi_masked_range_size;
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
+    &ipi_masked_global, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
+    &ipi_masked_page, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
+    &ipi_masked_range, 0, "");
+SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
+    &ipi_masked_range_size, 0, "");
+#endif /* COUNT_XINVLTLB_HITS */
+
+/*
+ * Init and startup IPI.
+ */
+void
+ipi_startup(int apic_id, int vector)
+{
+
 	/*
 	 * first we do an INIT/RESET IPI this INIT IPI might be run, reseting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
@@ -1127,52 +1185,8 @@
 	    vector, apic_id);
 	lapic_ipi_wait(-1);
 	DELAY(200);		/* wait ~200uS */
-
-	/* Wait up to 5 seconds for it to start. */
-	for (ms = 0; ms < 5000; ms++) {
-		if (mp_naps > cpus)
-			return 1;	/* return SUCCESS */
-		DELAY(1000);
-	}
-	return 0;		/* return FAILURE */
 }
 
-#ifdef COUNT_XINVLTLB_HITS
-u_int xhits_gbl[MAXCPU];
-u_int xhits_pg[MAXCPU];
-u_int xhits_rng[MAXCPU];
-static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
-    sizeof(xhits_gbl), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
-    sizeof(xhits_pg), "IU", "");
-SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
-    sizeof(xhits_rng), "IU", "");
-
-u_int ipi_global;
-u_int ipi_page;
-u_int ipi_range;
-u_int ipi_range_size;
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
-    0, "");
-
-u_int ipi_masked_global;
-u_int ipi_masked_page;
-u_int ipi_masked_range;
-u_int ipi_masked_range_size;
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW,
-    &ipi_masked_global, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW,
-    &ipi_masked_page, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW,
-    &ipi_masked_range, 0, "");
-SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW,
-    &ipi_masked_range_size, 0, "");
-#endif /* COUNT_XINVLTLB_HITS */
-
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  */
@@ -1498,6 +1512,39 @@
 }
 
 /*
+ * Handle an IPI_SUSPEND by saving our current context and spinning until we
+ * are resumed.
+ */
+void
+cpususpend_handler(void)
+{
+	u_int cpu;
+
+	cpu = PCPU_GET(cpuid);
+
+	if (savectx(susppcbs[cpu])) {
+		wbinvd();
+		CPU_SET_ATOMIC(cpu, &suspended_cpus);
+	} else {
+		pmap_init_pat();
+		PCPU_SET(switchtime, 0);
+		PCPU_SET(switchticks, ticks);
+
+		/* Indicate that we are resumed */
+		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
+	}
+
+	/* Wait for resume */
+	while (!CPU_ISSET(cpu, &started_cpus))
+		ia32_pause();
+
+	CPU_CLR_ATOMIC(cpu, &started_cpus);
+
+	/* Resume MCA and local APIC */
+	mca_resume();
+	lapic_setup(0);
+}
+/*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
  */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/pmap.c
--- a/head/sys/i386/i386/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -75,7 +75,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 233433 2012-03-24 19:43:49Z alc $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 237623 2012-06-27 03:45:25Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -118,6 +118,7 @@
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
@@ -231,8 +232,20 @@
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 /*
+ * Isolate the global pv list lock from data and other locks to prevent false
+ * sharing within the cache.
+ */
+static struct {
+	struct rwlock	lock;
+	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
+} pvh_global __aligned(CACHE_LINE_SIZE);
+
+#define	pvh_global_lock	pvh_global.lock
+
+/*
  * Data for the pv entry allocation mechanism
  */
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static struct md_page *pv_table;
 static int shpgperproc = PMAP_SHPGPERPROC;
@@ -283,8 +296,9 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
+static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
@@ -391,6 +405,12 @@
 	kernel_pmap->pm_root = NULL;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
+
+ 	/*
+	 * Initialize the global pv list lock.
+	 */
+	rw_init(&pvh_global_lock, "pmap pv global");
+
 	LIST_INIT(&allpmaps);
 
 	/*
@@ -1275,7 +1295,7 @@
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
- * If the given pmap is not the current pmap, vm_page_queue_mtx
+ * If the given pmap is not the current pmap, pvh_global_lock
  * must be held and curthread pinned to a CPU.
  */
 static pt_entry_t *
@@ -1291,7 +1311,7 @@
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
-		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+		rw_assert(&pvh_global_lock, RA_WLOCKED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
@@ -1840,9 +1860,9 @@
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
-			vm_page_unlock_queues();
+			rw_wunlock(&pvh_global_lock);
 			VM_WAIT;
-			vm_page_lock_queues();
+			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
@@ -2143,6 +2163,7 @@
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
@@ -2156,7 +2177,7 @@
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
-static uint32_t pc_freemask[11] = {
+static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
@@ -2187,83 +2208,155 @@
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+pmap_pv_reclaim(pmap_t locked_pmap)
 {
+	struct pch newtail;
+	struct pv_chunk *pc;
+	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
+	vm_page_t free, m, m_pc;
+	uint32_t inuse;
+	int bit, field, freed;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&newtail);
 	sched_pin();
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
-			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
+	    free == NULL)) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
-			pmap->pm_stats.resident_count--;
-			pde = pmap_pde(pmap, va);
-			KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
-			    " a 4mpage in page %p's pv list", m));
-			pte = pmap_pte_quick(pmap, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
+			}
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list) &&
-		    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfl(inuse);
+				pv = &pc->pc_pventry[field * 32 + bit];
+				va = pv->pv_va;
+				pde = pmap_pde(pmap, va);
+				if ((*pde & PG_PS) != 0)
+					continue;
+				pte = pmap_pte_quick(pmap, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list) &&
+				    (m->flags & PG_FICTITIOUS) == 0) {
+					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
+					if (TAILQ_EMPTY(&pvh->pv_list)) {
+						vm_page_aflag_clear(m,
+						    PGA_WRITEABLE);
+					}
+				}
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, &free);
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+			continue;
+		}
+		/* Every freed mapping is for a 4 KB page. */
+		pmap->pm_stats.resident_count -= freed;
+		PV_STAT(pv_entry_frees += freed);
+		PV_STAT(pv_entry_spare += freed);
+		pv_entry_count -= freed;
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		for (field = 0; field < _NPCM; field++)
+			if (pc->pc_map[field] != pc_freemask[field]) {
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+
+				/*
+				 * One freed pv entry in locked_pmap is
+				 * sufficient.
+				 */
+				if (pmap == locked_pmap)
+					goto out;
+				break;
+			}
+		if (field == _NPCM) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+			pmap_qremove((vm_offset_t)pc, 1);
+			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			break;
+		}
 	}
+out:
 	sched_unpin();
+	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
-
 /*
  * free the pv_entry back to the free list
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
@@ -2273,13 +2366,30 @@
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
-	/* move to head of list */
-	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx]) {
-			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+			/*
+			 * 98% of the time, pc is already at the head of the
+			 * list.  If it isn't already, move it to the head.
+			 */
+			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
+			    pc)) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
 			return;
 		}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+ 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
@@ -2296,18 +2406,17 @@
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
@@ -2315,7 +2424,6 @@
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
-	pq = NULL;
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2341,33 +2449,20 @@
 		}
 	}
 	/*
-	 * Access to the ptelist "pv_vafree" is synchronized by the page
-	 * queues lock.  If "pv_vafree" is currently non-empty, it will
+	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
+	 * global lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
-	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, (pq ==
-	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
+	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to
-		 * inactive pages.  After that, if a pv chunk entry
-		 * is still needed, destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = pmap_pv_reclaim(pmap);
+		if (m == NULL)
+			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2377,6 +2472,7 @@
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
@@ -2388,7 +2484,7 @@
 {
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
@@ -2406,7 +2502,7 @@
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
 
@@ -2439,7 +2535,7 @@
 	vm_offset_t va_last;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
 
@@ -2480,7 +2576,7 @@
 {
 	struct md_page *pvh;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	pmap_pvh_free(&m->md, pmap, va);
 	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
@@ -2498,8 +2594,8 @@
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	pv = get_pv_entry(pmap, FALSE);
 	pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
@@ -2513,8 +2609,8 @@
 {
 	pv_entry_t pv;
 
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
@@ -2533,7 +2629,7 @@
 	struct md_page *pvh;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	if (pv_entry_count < pv_entry_high_water && 
 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
 		pv->pv_va = va;
@@ -2611,7 +2707,7 @@
 	 */
 	if (va >= KERNBASE)
 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
-	else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) {
+	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
 		if ((*PMAP1 & PG_FRAME) != mptepa) {
 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
@@ -2770,7 +2866,7 @@
 	pt_entry_t oldpte;
 	vm_page_t m;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	if (oldpte & PG_W)
@@ -2801,7 +2897,7 @@
 {
 	pt_entry_t *pte;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
@@ -2833,7 +2929,7 @@
 
 	anyvalid = 0;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	PMAP_LOCK(pmap);
 
@@ -2922,7 +3018,7 @@
 	sched_unpin();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -2954,7 +3050,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	free = NULL;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -2995,7 +3091,7 @@
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	pmap_free_zero_pages(free);
 }
 
@@ -3050,7 +3146,7 @@
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
-	int anychanged;
+	boolean_t anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
@@ -3066,10 +3162,16 @@
 		return;
 #endif
 
-	anychanged = 0;
-
-	vm_page_lock_queues();
-	sched_pin();
+	if (pmap_is_current(pmap))
+		pv_lists_locked = FALSE;
+	else {
+		pv_lists_locked = TRUE;
+resume:
+		rw_wlock(&pvh_global_lock);
+		sched_pin();
+	}
+	anychanged = FALSE;
+
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pt_entry_t obits, pbits;
@@ -3104,12 +3206,27 @@
 				 */
 				if (pmap_protect_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, prot))
-					anychanged = 1;
+					anychanged = TRUE;
 				continue;
-			} else if (!pmap_demote_pde(pmap,
-			    &pmap->pm_pdir[pdirindex], sva)) {
-				/* The large page mapping was destroyed. */
-				continue;
+			} else {
+				if (!pv_lists_locked) {
+					pv_lists_locked = TRUE;
+					if (!rw_try_wlock(&pvh_global_lock)) {
+						if (anychanged)
+							pmap_invalidate_all(
+							    pmap);
+						PMAP_UNLOCK(pmap);
+						goto resume;
+					}
+				}
+				if (!pmap_demote_pde(pmap,
+				    &pmap->pm_pdir[pdirindex], sva)) {
+					/*
+					 * The large page mapping was
+					 * destroyed.
+					 */
+					continue;
+				}
 			}
 		}
 
@@ -3155,14 +3272,16 @@
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
-					anychanged = 1;
+					anychanged = TRUE;
 			}
 		}
 	}
-	sched_unpin();
 	if (anychanged)
 		pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	if (pv_lists_locked) {
+		sched_unpin();
+		rw_wunlock(&pvh_global_lock);
+	}
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3332,7 +3451,7 @@
 
 	mpte = NULL;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 
@@ -3502,7 +3621,7 @@
 		pmap_promote_pde(pmap, pde, va);
 
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3517,7 +3636,7 @@
 {
 	pd_entry_t *pde, newpde;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pde = pmap_pde(pmap, va);
 	if (*pde != 0) {
@@ -3586,7 +3705,7 @@
 	psize = atop(end - start);
 	mpte = NULL;
 	m = m_start;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		va = start + ptoa(diff);
@@ -3600,7 +3719,7 @@
 			    mpte);
 		m = TAILQ_NEXT(m, listq);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3617,10 +3736,10 @@
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3635,7 +3754,7 @@
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
@@ -3841,9 +3960,9 @@
 		if (!wired != ((*pde & PG_W) == 0)) {
 			if (!are_queues_locked) {
 				are_queues_locked = TRUE;
-				if (!mtx_trylock(&vm_page_queue_mtx)) {
+				if (!rw_try_wlock(&pvh_global_lock)) {
 					PMAP_UNLOCK(pmap);
-					vm_page_lock_queues();
+					rw_wlock(&pvh_global_lock);
 					goto retry;
 				}
 			}
@@ -3867,7 +3986,7 @@
 	pmap_pte_release(pte);
 out:
 	if (are_queues_locked)
-		vm_page_unlock_queues();
+		rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
@@ -3896,7 +4015,7 @@
 	if (!pmap_is_current(src_pmap))
 		return;
 
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
@@ -3986,7 +4105,7 @@
 	}
 out:
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
@@ -4128,7 +4247,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	rv = FALSE;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap) {
 			rv = TRUE;
@@ -4150,7 +4269,7 @@
 				break;
 		}
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4168,13 +4287,13 @@
 	count = 0;
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (count);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	count = pmap_pvh_wired_mappings(&m->md, count);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
 	        count);
 	}
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (count);
 }
 
@@ -4190,7 +4309,7 @@
 	pt_entry_t *pte;
 	pv_entry_t pv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
@@ -4215,11 +4334,11 @@
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4249,13 +4368,13 @@
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
@@ -4347,20 +4466,13 @@
 			}
 		}
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
-			pmap_qremove((vm_offset_t)pc, 1);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
-			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			free_pv_chunk(pc);
 		}
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
@@ -4388,11 +4500,11 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_modified_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4409,7 +4521,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
@@ -4462,11 +4574,11 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	rv = pmap_is_referenced_pvh(&m->md) ||
 	    ((m->flags & PG_FICTITIOUS) == 0 &&
 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rv);
 }
 
@@ -4482,7 +4594,7 @@
 	pmap_t pmap;
 	boolean_t rv;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	rv = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
@@ -4523,7 +4635,7 @@
 	if ((m->oflags & VPO_BUSY) == 0 &&
 	    (m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4564,7 +4676,7 @@
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4593,7 +4705,7 @@
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4652,7 +4764,7 @@
 	}
 out:
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
@@ -4682,7 +4794,7 @@
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4743,7 +4855,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
@@ -4763,7 +4875,7 @@
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_reference: page %p is not managed", m));
-	vm_page_lock_queues();
+	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
@@ -4810,7 +4922,7 @@
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
-	vm_page_unlock_queues();
+	rw_wunlock(&pvh_global_lock);
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/ptrace_machdep.c
--- a/head/sys/i386/i386/ptrace_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/ptrace_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/i386/i386/ptrace_machdep.c 238675 2012-07-21 21:39:02Z kib $");
 
 #include "opt_cpu.h"
 
@@ -54,10 +54,12 @@
 	fpstate = &td->td_pcb->pcb_user_save.sv_xmm;
 	switch (req) {
 	case PT_GETXMMREGS:
+		npxgetregs(td);
 		error = copyout(fpstate, addr, sizeof(*fpstate));
 		break;
 
 	case PT_SETXMMREGS:
+		npxgetregs(td);
 		error = copyin(addr, fpstate, sizeof(*fpstate));
 		fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 		break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/swtch.s
--- a/head/sys/i386/i386/swtch.s	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/swtch.s	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/i386/swtch.s 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #include "opt_npx.h"
@@ -386,6 +386,36 @@
 	pushfl
 	popl	PCB_PSL(%ecx)
 
+	movl	%cr0,%eax
+	movl	%eax,PCB_CR0(%ecx)
+	movl	%cr2,%eax
+	movl	%eax,PCB_CR2(%ecx)
+	movl	%cr4,%eax
+	movl	%eax,PCB_CR4(%ecx)
+
+	movl	%dr0,%eax
+	movl	%eax,PCB_DR0(%ecx)
+	movl	%dr1,%eax
+	movl	%eax,PCB_DR1(%ecx)
+	movl	%dr2,%eax
+	movl	%eax,PCB_DR2(%ecx)
+	movl	%dr3,%eax
+	movl	%eax,PCB_DR3(%ecx)
+	movl	%dr6,%eax
+	movl	%eax,PCB_DR6(%ecx)
+	movl	%dr7,%eax
+	movl	%eax,PCB_DR7(%ecx)
+
+	mov	%ds,PCB_DS(%ecx)
+	mov	%es,PCB_ES(%ecx)
+	mov	%fs,PCB_FS(%ecx)
+	mov	%ss,PCB_SS(%ecx)
+	
+	sgdt	PCB_GDT(%ecx)
+	sidt	PCB_IDT(%ecx)
+	sldt	PCB_LDT(%ecx)
+	str	PCB_TR(%ecx)
+
 #ifdef DEV_NPX
 	/*
 	 * If fpcurthread == NULL, then the npx h/w state is irrelevant and the
@@ -425,5 +455,84 @@
 	popfl
 #endif	/* DEV_NPX */
 
+	movl	$1,%eax
 	ret
 END(savectx)
+
+/*
+ * resumectx(pcb) __fastcall
+ * Resuming processor state from pcb.
+ */
+ENTRY(resumectx)
+	/* Restore GDT. */
+	lgdt	PCB_GDT(%ecx)
+
+	/* Restore segment registers */
+	movzwl	PCB_DS(%ecx),%eax
+	mov	%ax,%ds
+	movzwl	PCB_ES(%ecx),%eax
+	mov	%ax,%es
+	movzwl	PCB_FS(%ecx),%eax
+	mov	%ax,%fs
+	movzwl	PCB_GS(%ecx),%eax
+	movw	%ax,%gs
+	movzwl	PCB_SS(%ecx),%eax
+	mov	%ax,%ss
+
+	/* Restore CR2, CR4, CR3 and CR0 */
+	movl	PCB_CR2(%ecx),%eax
+	movl	%eax,%cr2
+	movl	PCB_CR4(%ecx),%eax
+	movl	%eax,%cr4
+	movl	PCB_CR3(%ecx),%eax
+	movl	%eax,%cr3
+	movl	PCB_CR0(%ecx),%eax
+	movl	%eax,%cr0
+	jmp	1f
+1:
+
+	/* Restore descriptor tables */
+	lidt	PCB_IDT(%ecx)
+	lldt	PCB_LDT(%ecx)
+
+#define SDT_SYS386TSS	9
+#define SDT_SYS386BSY	11
+	/* Clear "task busy" bit and reload TR */
+	movl	PCPU(TSS_GDT),%eax
+	andb	$(~SDT_SYS386BSY | SDT_SYS386TSS),5(%eax)
+	movzwl	PCB_TR(%ecx),%eax
+	ltr	%ax
+#undef SDT_SYS386TSS
+#undef SDT_SYS386BSY
+
+	/* Restore debug registers */
+	movl	PCB_DR0(%ecx),%eax
+	movl	%eax,%dr0
+	movl	PCB_DR1(%ecx),%eax
+	movl	%eax,%dr1
+	movl	PCB_DR2(%ecx),%eax
+	movl	%eax,%dr2
+	movl	PCB_DR3(%ecx),%eax
+	movl	%eax,%dr3
+	movl	PCB_DR6(%ecx),%eax
+	movl	%eax,%dr6
+	movl	PCB_DR7(%ecx),%eax
+	movl	%eax,%dr7
+
+#ifdef DEV_NPX
+	/* XXX FIX ME */
+#endif
+
+	/* Restore other registers */
+	movl	PCB_EDI(%ecx),%edi
+	movl	PCB_ESI(%ecx),%esi
+	movl	PCB_EBP(%ecx),%ebp
+	movl	PCB_ESP(%ecx),%esp
+	movl	PCB_EBX(%ecx),%ebx
+
+	/* reload code selector by turning return into intersegmental return */
+	pushl	PCB_EIP(%ecx)
+	movl	$KCSEL,4(%esp)
+	xorl	%eax,%eax
+	lret
+END(resumectx)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/trap.c
--- a/head/sys/i386/i386/trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/trap.c 233781 2012-04-02 15:07:22Z jhb $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/trap.c 238678 2012-07-21 21:52:48Z kib $");
 
 /*
  * 386 Trap and System call handling
@@ -369,7 +369,7 @@
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 #ifdef DEV_NPX
-			ucode = npxtrap();
+			ucode = npxtrap_x87();
 			if (ucode == -1)
 				goto userout;
 #else
@@ -532,7 +532,13 @@
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
-			ucode = 0; /* XXX */
+#if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
+			ucode = npxtrap_sse();
+			if (ucode == -1)
+				goto userout;
+#else
+			ucode = 0;
+#endif
 			i = SIGFPE;
 			break;
 		}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/i386/vm86.c
--- a/head/sys/i386/i386/vm86.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/i386/vm86.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/i386/vm86.c 234350 2012-04-16 19:31:44Z jkim $");
+__FBSDID("$FreeBSD: head/sys/i386/i386/vm86.c 237924 2012-07-01 12:59:00Z brueffer $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -650,7 +650,6 @@
 			return (1);
 		}
 	return (0);
-	panic("vm86_getptr: address not found");
 }
 	
 int
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/apicvar.h
--- a/head/sys/i386/include/apicvar.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/apicvar.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/apicvar.h 232230 2012-02-27 17:30:21Z jhb $
+ * $FreeBSD: head/sys/i386/include/apicvar.h 235622 2012-05-18 18:55:58Z iwasaki $
  */
 
 #ifndef _MACHINE_APICVAR_H_
@@ -126,7 +126,8 @@
 #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
 
 #define	IPI_STOP	(APIC_IPI_INTS + 7)	/* Stop CPU until restarted. */
-#define	IPI_STOP_HARD	(APIC_IPI_INTS + 8)	/* Stop CPU with a NMI. */
+#define	IPI_SUSPEND	(APIC_IPI_INTS + 8)	/* Suspend CPU until restarted. */
+#define	IPI_STOP_HARD	(APIC_IPI_INTS + 9)	/* Stop CPU with a NMI. */
 
 /*
  * The spurious interrupt can share the priority class with the IPIs since
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/atomic.h
--- a/head/sys/i386/include/atomic.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/atomic.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/atomic.h 220404 2011-04-06 23:59:59Z jkim $
+ * $FreeBSD: head/sys/i386/include/atomic.h 236456 2012-06-02 18:10:16Z kib $
  */
 #ifndef _MACHINE_ATOMIC_H_
 #define	_MACHINE_ATOMIC_H_
@@ -32,9 +32,9 @@
 #error this file needs sys/cdefs.h as a prerequisite
 #endif
 
-#define	mb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
-#define	wmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
-#define	rmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory")
+#define	mb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
+#define	wmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
+#define	rmb()	__asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc")
 
 /*
  * Various simple operations on memory, each of which is atomic in the
@@ -79,8 +79,9 @@
 int	atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src);
 u_int	atomic_fetchadd_int(volatile u_int *p, u_int v);
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)			\
-u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p);	\
+#define	ATOMIC_LOAD(TYPE, LOP)					\
+u_##TYPE	atomic_load_acq_##TYPE(volatile u_##TYPE *p)
+#define	ATOMIC_STORE(TYPE)					\
 void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 
 #else /* !KLD_MODULE && __GNUCLIKE_ASM */
@@ -280,16 +281,29 @@
 	return (v);
 }
 
+/*
+ * We assume that a = b will do atomic loads and stores.  Due to the
+ * IA32 memory model, a simple store guarantees release semantics.
+ *
+ * However, loads may pass stores, so for atomic_load_acq we have to
+ * ensure a Store/Load barrier to do the load in SMP kernels.  We use
+ * "lock cmpxchg" as recommended by the AMD Software Optimization
+ * Guide, and not mfence.  For UP kernels, however, the cache of the
+ * single processor is always consistent, so we only need to take care
+ * of the compiler.
+ */
+#define	ATOMIC_STORE(TYPE)				\
+static __inline void					\
+atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
+{							\
+	__asm __volatile("" : : : "memory");		\
+	*p = v;						\
+}							\
+struct __hack
+
 #if defined(_KERNEL) && !defined(SMP)
 
-/*
- * We assume that a = b will do atomic loads and stores.  However, on a
- * PentiumPro or higher, reads may pass writes, so for that case we have
- * to use a serializing instruction (i.e. with LOCK) to do the load in
- * SMP kernels.  For UP kernels, however, the cache of the single processor
- * is always consistent, so we only need to take care of compiler.
- */
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -299,18 +313,11 @@
 	__asm __volatile("" : : : "memory");		\
 	return (tmp);					\
 }							\
-							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile("" : : : "memory");		\
-	*p = v;						\
-}							\
 struct __hack
 
 #else /* !(_KERNEL && !SMP) */
 
-#define	ATOMIC_STORE_LOAD(TYPE, LOP, SOP)		\
+#define	ATOMIC_LOAD(TYPE, LOP)				\
 static __inline u_##TYPE				\
 atomic_load_acq_##TYPE(volatile u_##TYPE *p)		\
 {							\
@@ -324,19 +331,6 @@
 							\
 	return (res);					\
 }							\
-							\
-/*							\
- * The XCHG instruction asserts LOCK automagically.	\
- */							\
-static __inline void					\
-atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
-{							\
-	__asm __volatile(SOP				\
-	: "=m" (*p),			/* 0 */		\
-	  "+r" (v)			/* 1 */		\
-	: "m" (*p)			/* 2 */		\
-	: "memory");					\
-}							\
 struct __hack
 
 #endif /* _KERNEL && !SMP */
@@ -363,13 +357,19 @@
 ATOMIC_ASM(add,	     long,  "addl %1,%0",  "ir",  v);
 ATOMIC_ASM(subtract, long,  "subl %1,%0",  "ir",  v);
 
-ATOMIC_STORE_LOAD(char,	"cmpxchgb %b0,%1", "xchgb %b1,%0");
-ATOMIC_STORE_LOAD(short,"cmpxchgw %w0,%1", "xchgw %w1,%0");
-ATOMIC_STORE_LOAD(int,	"cmpxchgl %0,%1",  "xchgl %1,%0");
-ATOMIC_STORE_LOAD(long,	"cmpxchgl %0,%1",  "xchgl %1,%0");
+ATOMIC_LOAD(char,  "cmpxchgb %b0,%1");
+ATOMIC_LOAD(short, "cmpxchgw %w0,%1");
+ATOMIC_LOAD(int,   "cmpxchgl %0,%1");
+ATOMIC_LOAD(long,  "cmpxchgl %0,%1");
+
+ATOMIC_STORE(char);
+ATOMIC_STORE(short);
+ATOMIC_STORE(int);
+ATOMIC_STORE(long);
 
 #undef ATOMIC_ASM
-#undef ATOMIC_STORE_LOAD
+#undef ATOMIC_LOAD
+#undef ATOMIC_STORE
 
 #ifndef WANT_FUNCTIONS
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/bootinfo.h
--- a/head/sys/i386/include/bootinfo.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/bootinfo.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/bootinfo.h 235391 2012-05-13 09:25:39Z avg $
  */
 
 #ifndef	_MACHINE_BOOTINFO_H_
@@ -65,13 +65,13 @@
 	u_int32_t	bi_kernend;		/* end of kernel space */
 	u_int32_t	bi_envp;		/* environment */
 	u_int32_t	bi_modulep;		/* preloaded modules */
+	uint32_t	bi_memdesc_version;	/* EFI memory desc version */
+	uint64_t	bi_memdesc_size;	/* sizeof EFI memory desc */
+	uint64_t	bi_memmap;		/* pa of EFI memory map */
+	uint64_t	bi_memmap_size;		/* size of EFI memory map */
 	uint64_t	bi_hcdp;		/* DIG64 HCDP table */
 	uint64_t	bi_fpswa;		/* FPSWA interface */
 	uint64_t	bi_systab;		/* pa of EFI system table */
-	uint64_t	bi_memmap;		/* pa of EFI memory map */
-	uint64_t	bi_memmap_size;		/* size of EFI memory map */
-	uint64_t	bi_memdesc_size;	/* sizeof EFI memory desc */
-	uint32_t	bi_memdesc_version;	/* EFI memory desc version */
 };
 
 #ifdef _KERNEL
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/cpufunc.h
--- a/head/sys/i386/include/cpufunc.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/cpufunc.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/cpufunc.h 223796 2011-07-05 18:42:10Z jkim $
+ * $FreeBSD: head/sys/i386/include/cpufunc.h 238311 2012-07-09 20:55:39Z jhb $
  */
 
 /*
@@ -97,6 +97,13 @@
 }
 
 static __inline void
+clts(void)
+{
+
+	__asm __volatile("clts");
+}
+
+static __inline void
 disable_intr(void)
 {
 #ifdef XEN
@@ -688,6 +695,9 @@
 int	breakpoint(void);
 u_int	bsfl(u_int mask);
 u_int	bsrl(u_int mask);
+void	clflush(u_long addr);
+void	clts(void);
+void	cpuid_count(u_int ax, u_int cx, u_int *p);
 void	disable_intr(void);
 void	do_cpuid(u_int ax, u_int *p);
 void	enable_intr(void);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/elf.h
--- a/head/sys/i386/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -96,6 +96,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/in_cksum.h
--- a/head/sys/i386/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -54,6 +54,7 @@
  * therefore always exactly five 32-bit words.
  */
 #if defined(__GNUCLIKE_ASM) && !defined(__INTEL_COMPILER)
+#if defined(IPVERSION) && (IPVERSION == 4)
 static __inline u_int
 in_cksum_hdr(const struct ip *ip)
 {
@@ -88,6 +89,7 @@
 	__tmpsum = (int)ntohs(ip->ip_sum) + 256;
 	ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16));
 }
+#endif
 
 static __inline u_short
 in_addword(u_short sum, u_short b)
@@ -121,6 +123,7 @@
 }
 
 #else
+#if defined(IPVERSION) && (IPVERSION == 4)
 #define	in_cksum_update(ip) \
 	do { \
 		int __tmpsum; \
@@ -129,10 +132,13 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
 #if !defined(__GNUCLIKE_ASM) || defined(__INTEL_COMPILER)
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short in_addword(u_short sum, u_short b);
 u_short in_pseudo(u_int sum, u_int b, u_int c);
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/intr_machdep.h
--- a/head/sys/i386/include/intr_machdep.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/intr_machdep.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/include/intr_machdep.h 234207 2012-04-13 07:15:40Z avg $
+ * $FreeBSD: head/sys/i386/include/intr_machdep.h 234989 2012-05-03 21:44:01Z attilio $
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
@@ -131,9 +131,7 @@
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
-#ifdef SMP
 void	intr_add_cpu(u_int cpu);
-#endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep);
 #ifdef SMP
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/md_var.h
--- a/head/sys/i386/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/md_var.h 235622 2012-05-18 18:55:58Z iwasaki $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -91,6 +91,7 @@
 void	doreti_popl_fs_fault(void) __asm(__STRING(doreti_popl_fs_fault));
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
+void	initializecpu(void);
 void	enable_sse(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 void	i686_pagezero(void *addr);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/npx.h
--- a/head/sys/i386/include/npx.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/npx.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.h	5.3 (Berkeley) 1/18/91
- * $FreeBSD: head/sys/i386/include/npx.h 233044 2012-03-16 20:24:30Z tijl $
+ * $FreeBSD: head/sys/i386/include/npx.h 238678 2012-07-21 21:52:48Z kib $
  */
 
 /*
@@ -55,7 +55,8 @@
 void	npxinit(void);
 void	npxsave(union savefpu *addr);
 void	npxsetregs(struct thread *td, union savefpu *addr);
-int	npxtrap(void);
+int	npxtrap_x87(void);
+int	npxtrap_sse(void);
 void	npxuserinited(struct thread *);
 struct fpu_kern_ctx *fpu_kern_alloc_ctx(u_int flags);
 void	fpu_kern_free_ctx(struct fpu_kern_ctx *ctx);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/pcb.h
--- a/head/sys/i386/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)pcb.h	5.10 (Berkeley) 5/12/91
- * $FreeBSD$
+ * $FreeBSD: head/sys/i386/include/pcb.h 237027 2012-06-13 21:03:01Z jkim $
  */
 
 #ifndef _I386_PCB_H_
@@ -45,7 +45,10 @@
 #include <machine/npx.h>
 
 struct pcb {
+	int	pcb_cr0;
+	int	pcb_cr2;
 	int	pcb_cr3;
+	int	pcb_cr4;
 	int	pcb_edi;
 	int	pcb_esi;
 	int	pcb_ebp;
@@ -71,20 +74,30 @@
 #define	PCB_KERNNPX	0x40	/* kernel uses npx */
 
 	caddr_t	pcb_onfault;	/* copyin/out fault recovery */
+	int	pcb_ds;
+	int	pcb_es;
+	int	pcb_fs;
 	int	pcb_gs;
+	int	pcb_ss;
 	struct segment_descriptor pcb_fsd;
 	struct segment_descriptor pcb_gsd;
 	struct	pcb_ext	*pcb_ext;	/* optional pcb extension */
 	int	pcb_psl;	/* process status long */
 	u_long	pcb_vm86[2];	/* vm86bios scratch space */
 	union	savefpu *pcb_save;
+
+	struct region_descriptor pcb_gdt;
+	struct region_descriptor pcb_idt;
+	uint16_t	pcb_ldt;
+	uint16_t	pcb_tr;
 };
 
 #ifdef _KERNEL
 struct trapframe;
 
 void	makectx(struct trapframe *, struct pcb *);
-void	savectx(struct pcb *);
+int	savectx(struct pcb *) __returns_twice;
+void	resumectx(struct pcb *) __fastcall;
 #endif
 
 #endif /* _I386_PCB_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/pmap.h
--- a/head/sys/i386/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -38,7 +38,7 @@
  *
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/i386/include/pmap.h 222813 2011-06-07 08:46:13Z attilio $
+ * $FreeBSD: head/sys/i386/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -481,7 +481,7 @@
 	pmap_t			pc_pmap;
 	TAILQ_ENTRY(pv_chunk)	pc_list;
 	uint32_t		pc_map[_NPCM];	/* bitmap; 1 = free */
-	uint32_t		pc_spare[2];
+	TAILQ_ENTRY(pv_chunk)	pc_lru;
 	struct pv_entry		pc_pventry[_NPCPV];
 };
 
@@ -498,6 +498,7 @@
 extern vm_offset_t virtual_end;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/smp.h
--- a/head/sys/i386/include/smp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/smp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -6,7 +6,7 @@
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
- * $FreeBSD: head/sys/i386/include/smp.h 222853 2011-06-08 08:12:15Z avg $
+ * $FreeBSD: head/sys/i386/include/smp.h 236938 2012-06-12 00:14:54Z iwasaki $
  *
  */
 
@@ -53,13 +53,18 @@
 	IDTVEC(invlcache),	/* Write back and invalidate cache */
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
+	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
 	IDTVEC(rendezvous),	/* handle CPU rendezvous */
 	IDTVEC(lazypmap);	/* handle lazy pmap release */
 
 /* functions in mp_machdep.c */
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
+#ifndef XEN
+void	cpususpend_handler(void);
+#endif
 void	init_secondary(void);
+void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 #ifndef XEN
 void 	ipi_bitmap_handler(struct trapframe frame);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/i386/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/i386/include/vdso.h 237433 2012-06-22 07:06:40Z kib $ */
+
+#include <x86/vdso.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/include/vmparam.h
--- a/head/sys/i386/include/vmparam.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/include/vmparam.h	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)vmparam.h	5.9 (Berkeley) 5/12/91
- * $FreeBSD: head/sys/i386/include/vmparam.h 228398 2011-12-10 18:42:00Z alc $
+ * $FreeBSD: head/sys/i386/include/vmparam.h 237435 2012-06-22 07:16:29Z kib $
  */
 
 
@@ -165,7 +165,8 @@
 
 #define VM_MAXUSER_ADDRESS	VADDR(PTDPTDI, 0)
 
-#define USRSTACK		VM_MAXUSER_ADDRESS
+#define	SHAREDPAGE		(VM_MAXUSER_ADDRESS - PAGE_SIZE)
+#define	USRSTACK		SHAREDPAGE
 
 #define VM_MAX_ADDRESS		VADDR(PTDPTDI, PTDPTDI)
 #define VM_MIN_ADDRESS		((vm_offset_t)0)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/isa/npx.c
--- a/head/sys/i386/isa/npx.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/isa/npx.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/isa/npx.c 230426 2012-01-21 17:45:27Z kib $");
+__FBSDID("$FreeBSD: head/sys/i386/isa/npx.c 238678 2012-07-21 21:52:48Z kib $");
 
 #include "opt_cpu.h"
 #include "opt_isa.h"
@@ -99,15 +99,7 @@
 #ifdef CPU_ENABLE_SSE
 #define	fxrstor(addr)		__asm __volatile("fxrstor %0" : : "m" (*(addr)))
 #define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
-#endif
-#ifdef XEN
-#define	start_emulating()	(HYPERVISOR_fpu_taskswitch(1))
-#define	stop_emulating()	(HYPERVISOR_fpu_taskswitch(0))
-#else
-#define	start_emulating()	__asm __volatile( \
-				    "smsw %%ax; orb %0,%%al; lmsw %%ax" \
-				    : : "n" (CR0_TS) : "ax")
-#define	stop_emulating()	__asm __volatile("clts")
+#define	stmxcsr(addr)		__asm __volatile("stmxcsr %0" : : "m" (*(addr)))
 #endif
 #else	/* !(__GNUCLIKE_ASM && !lint) */
 
@@ -122,12 +114,19 @@
 #ifdef CPU_ENABLE_SSE
 void	fxsave(caddr_t addr);
 void	fxrstor(caddr_t addr);
+void	stmxcsr(u_int csr);
 #endif
-void	start_emulating(void);
-void	stop_emulating(void);
 
 #endif	/* __GNUCLIKE_ASM && !lint */
 
+#ifdef XEN
+#define	start_emulating()	(HYPERVISOR_fpu_taskswitch(1))
+#define	stop_emulating()	(HYPERVISOR_fpu_taskswitch(0))
+#else
+#define	start_emulating()	load_cr0(rcr0() | CR0_TS)
+#define	stop_emulating()	clts()
+#endif
+
 #ifdef CPU_ENABLE_SSE
 #define GET_FPU_CW(thread) \
 	(cpu_fxsr ? \
@@ -584,29 +583,30 @@
 };
 
 /*
- * Preserve the FP status word, clear FP exceptions, then generate a SIGFPE.
+ * Read the FP status and control words, then generate si_code value
+ * for SIGFPE.  The error code chosen will be one of the
+ * FPE_... macros.  It will be sent as the second argument to old
+ * BSD-style signal handlers and as "siginfo_t->si_code" (second
+ * argument) to SA_SIGINFO signal handlers.
  *
- * Clearing exceptions is necessary mainly to avoid IRQ13 bugs.  We now
- * depend on longjmp() restoring a usable state.  Restoring the state
- * or examining it might fail if we didn't clear exceptions.
+ * Some time ago, we cleared the x87 exceptions with FNCLEX there.
+ * Clearing exceptions was necessary mainly to avoid IRQ13 bugs.  The
+ * usermode code which understands the FPU hardware enough to enable
+ * the exceptions, can also handle clearing the exception state in the
+ * handler.  The only consequence of not clearing the exception is the
+ * rethrow of the SIGFPE on return from the signal handler and
+ * reexecution of the corresponding instruction.
  *
- * The error code chosen will be one of the FPE_... macros. It will be
- * sent as the second argument to old BSD-style signal handlers and as
- * "siginfo_t->si_code" (second argument) to SA_SIGINFO signal handlers.
- *
- * XXX the FP state is not preserved across signal handlers.  So signal
- * handlers cannot afford to do FP unless they preserve the state or
- * longjmp() out.  Both preserving the state and longjmp()ing may be
- * destroyed by IRQ13 bugs.  Clearing FP exceptions is not an acceptable
- * solution for signals other than SIGFPE.
+ * For XMM traps, the exceptions were never cleared.
  */
 int
-npxtrap()
+npxtrap_x87(void)
 {
 	u_short control, status;
 
 	if (!hw_float) {
-		printf("npxtrap: fpcurthread = %p, curthread = %p, hw_float = %d\n",
+		printf(
+	"npxtrap_x87: fpcurthread = %p, curthread = %p, hw_float = %d\n",
 		       PCPU_GET(fpcurthread), curthread, hw_float);
 		panic("npxtrap from nowhere");
 	}
@@ -624,13 +624,32 @@
 		fnstcw(&control);
 		fnstsw(&status);
 	}
-
-	if (PCPU_GET(fpcurthread) == curthread)
-		fnclex();
 	critical_exit();
 	return (fpetable[status & ((~control & 0x3f) | 0x40)]);
 }
 
+#ifdef CPU_ENABLE_SSE
+int
+npxtrap_sse(void)
+{
+	u_int mxcsr;
+
+	if (!hw_float) {
+		printf(
+	"npxtrap_sse: fpcurthread = %p, curthread = %p, hw_float = %d\n",
+		       PCPU_GET(fpcurthread), curthread, hw_float);
+		panic("npxtrap from nowhere");
+	}
+	critical_enter();
+	if (PCPU_GET(fpcurthread) != curthread)
+		mxcsr = curthread->td_pcb->pcb_save->sv_xmm.sv_env.en_mxcsr;
+	else
+		stmxcsr(&mxcsr);
+	critical_exit();
+	return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]);
+}
+#endif
+
 /*
  * Implement device not available (DNA) exception
  *
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/linux/linux.h
--- a/head/sys/i386/linux/linux.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/linux/linux.h	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/i386/linux/linux.h 230132 2012-01-15 13:23:18Z uqs $
+ * $FreeBSD: head/sys/i386/linux/linux.h 235063 2012-05-05 19:42:38Z netchild $
  */
 
 #ifndef _I386_LINUX_H_
@@ -42,6 +42,7 @@
 #define	ldebug(name)	isclr(linux_debug_map, LINUX_SYS_linux_ ## name)
 #define	ARGS(nm, fmt)	"linux(%ld): "#nm"("fmt")\n", (long)td->td_proc->p_pid
 #define	LMSG(fmt)	"linux(%ld): "fmt"\n", (long)td->td_proc->p_pid
+#define	LINUX_DTRACE	linuxulator
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_LINUX);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/linux/linux_dummy.c
--- a/head/sys/i386/linux/linux_dummy.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/linux/linux_dummy.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,16 +27,25 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/linux/linux_dummy.c 234352 2012-04-16 21:22:02Z jkim $");
+__FBSDID("$FreeBSD: head/sys/i386/linux/linux_dummy.c 235063 2012-05-05 19:42:38Z netchild $");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <i386/linux/linux.h>
 #include <i386/linux/linux_proto.h>
+#include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_util.h>
 
+/* DTrace init */
+LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE);
+
 DUMMY(stime);
 DUMMY(fstat);
 DUMMY(olduname);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/i386/xen/pmap.c
--- a/head/sys/i386/xen/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/i386/xen/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -75,7 +75,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/i386/xen/pmap.c 229007 2011-12-30 18:16:15Z alc $");
+__FBSDID("$FreeBSD: head/sys/i386/xen/pmap.c 236534 2012-06-04 03:51:08Z alc $");
 
 /*
  *	Manages physical address maps.
@@ -179,7 +179,6 @@
 #define PMAP_INLINE
 #endif
 
-#define PV_STATS
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
@@ -230,6 +229,7 @@
 /*
  * Data for the pv entry allocation mechanism
  */
+static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
@@ -277,8 +277,9 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
+static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
@@ -1914,6 +1915,7 @@
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
+CTASSERT(_NPCPV == 336);
 
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
@@ -1927,7 +1929,7 @@
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
-static uint32_t pc_freemask[11] = {
+static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
@@ -1958,74 +1960,140 @@
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
-
-static int pmap_collect_inactive, pmap_collect_active;
-
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
-	"Current number times pmap_collect called on inactive queue");
-SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
-	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
- * another pv entry chunk.  This is normally called to
- * unmap inactive pages, and if necessary, active pages.
+ * another pv entry chunk.
  */
-static void
-pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
+static vm_page_t
+pmap_pv_reclaim(pmap_t locked_pmap)
 {
+	struct pch newtail;
+	struct pv_chunk *pc;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
-	pv_entry_t next_pv, pv;
+	pv_entry_t pv;
 	vm_offset_t va;
-	vm_page_t m, free;
-
+	vm_page_t free, m, m_pc;
+	uint32_t inuse;
+	int bit, field, freed;
+
+	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+	pmap = NULL;
+	free = m_pc = NULL;
+	TAILQ_INIT(&newtail);
 	sched_pin();
-	TAILQ_FOREACH(m, &vpq->pl, pageq) {
-		if ((m->flags & PG_MARKER) != 0 || m->hold_count || m->busy)
-			continue;
-		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
-			va = pv->pv_va;
-			pmap = PV_PMAP(pv);
+	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
+	    free == NULL)) {
+		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+		if (pmap != pc->pc_pmap) {
+			if (pmap != NULL) {
+				pmap_invalidate_all(pmap);
+				if (pmap != locked_pmap)
+					PMAP_UNLOCK(pmap);
+			}
+			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
-			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
+			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
+				pmap = NULL;
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
-			pmap->pm_stats.resident_count--;
-			pte = pmap_pte_quick(pmap, va);
-			tpte = pte_load_clear(pte);
-			KASSERT((tpte & PG_W) == 0,
-			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
-			if (tpte & PG_A)
-				vm_page_aflag_set(m, PGA_REFERENCED);
-			if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
-				vm_page_dirty(m);
-			free = NULL;
-			pmap_unuse_pt(pmap, va, &free);
-			pmap_invalidate_page(pmap, va);
-			pmap_free_zero_pages(free);
-			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-			free_pv_entry(pmap, pv);
-			if (pmap != locked_pmap)
-				PMAP_UNLOCK(pmap);
+			}
 		}
-		if (TAILQ_EMPTY(&m->md.pv_list))
-			vm_page_aflag_clear(m, PGA_WRITEABLE);
+
+		/*
+		 * Destroy every non-wired, 4 KB page mapping in the chunk.
+		 */
+		freed = 0;
+		for (field = 0; field < _NPCM; field++) {
+			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
+			    inuse != 0; inuse &= ~(1UL << bit)) {
+				bit = bsfl(inuse);
+				pv = &pc->pc_pventry[field * 32 + bit];
+				va = pv->pv_va;
+				pte = pmap_pte_quick(pmap, va);
+				if ((*pte & PG_W) != 0)
+					continue;
+				tpte = pte_load_clear(pte);
+				if ((tpte & PG_G) != 0)
+					pmap_invalidate_page(pmap, va);
+				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
+				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
+					vm_page_dirty(m);
+				if ((tpte & PG_A) != 0)
+					vm_page_aflag_set(m, PGA_REFERENCED);
+				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+				if (TAILQ_EMPTY(&m->md.pv_list))
+					vm_page_aflag_clear(m, PGA_WRITEABLE);
+				pc->pc_map[field] |= 1UL << bit;
+				pmap_unuse_pt(pmap, va, &free);
+				freed++;
+			}
+		}
+		if (freed == 0) {
+			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+			continue;
+		}
+		/* Every freed mapping is for a 4 KB page. */
+		pmap->pm_stats.resident_count -= freed;
+		PV_STAT(pv_entry_frees += freed);
+		PV_STAT(pv_entry_spare += freed);
+		pv_entry_count -= freed;
+		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+		for (field = 0; field < _NPCM; field++)
+			if (pc->pc_map[field] != pc_freemask[field]) {
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
+
+				/*
+				 * One freed pv entry in locked_pmap is
+				 * sufficient.
+				 */
+				if (pmap == locked_pmap)
+					goto out;
+				break;
+			}
+		if (field == _NPCM) {
+			PV_STAT(pv_entry_spare -= _NPCPV);
+			PV_STAT(pc_chunk_count--);
+			PV_STAT(pc_chunk_frees++);
+			/* Entire chunk is free; return it. */
+			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
+			pmap_qremove((vm_offset_t)pc, 1);
+			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			break;
+		}
 	}
+out:
 	sched_unpin();
+	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
+	if (pmap != NULL) {
+		pmap_invalidate_all(pmap);
+		if (pmap != locked_pmap)
+			PMAP_UNLOCK(pmap);
+	}
+	if (m_pc == NULL && pv_vafree != 0 && free != NULL) {
+		m_pc = free;
+		free = m_pc->right;
+		/* Recycle a freed page table page. */
+		m_pc->wire_count = 1;
+		atomic_add_int(&cnt.v_wire_count, 1);
+	}
+	pmap_free_zero_pages(free);
+	return (m_pc);
 }
 
-
 /*
  * free the pv_entry back to the free list
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
-	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
@@ -2039,13 +2107,30 @@
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
-	/* move to head of list */
-	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx]) {
-			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
+			/*
+			 * 98% of the time, pc is already at the head of the
+			 * list.  If it isn't already, move it to the head.
+			 */
+			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
+			    pc)) {
+				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
+				    pc_list);
+			}
 			return;
 		}
+	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+	free_pv_chunk(pc);
+}
+
+static void
+free_pv_chunk(struct pv_chunk *pc)
+{
+	vm_page_t m;
+
+ 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
@@ -2062,11 +2147,10 @@
  * when needed.
  */
 static pv_entry_t
-get_pv_entry(pmap_t pmap, int try)
+get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
-	struct vpgqueues *pq;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
@@ -2081,7 +2165,6 @@
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
-	pq = NULL;
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
@@ -2111,29 +2194,16 @@
 	 * queues lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
-	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, (pq ==
-	    &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) |
+	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
-		/*
-		 * Reclaim pv entries: At first, destroy mappings to
-		 * inactive pages.  After that, if a pv chunk entry
-		 * is still needed, destroy mappings to active pages.
-		 */
-		if (pq == NULL) {
-			PV_STAT(pmap_collect_inactive++);
-			pq = &vm_page_queues[PQ_INACTIVE];
-		} else if (pq == &vm_page_queues[PQ_INACTIVE]) {
-			PV_STAT(pmap_collect_active++);
-			pq = &vm_page_queues[PQ_ACTIVE];
-		} else
-			panic("get_pv_entry: increase vm.pmap.shpgperproc");
-		pmap_collect(pmap, pq);
-		goto retry;
+		m = pmap_pv_reclaim(pmap);
+		if (m == NULL)
+			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
@@ -2145,6 +2215,7 @@
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
+	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
@@ -3470,7 +3541,7 @@
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
-			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
+			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
@@ -3531,15 +3602,8 @@
 		}
 		PT_UPDATES_FLUSH();
 		if (allfree) {
-			PV_STAT(pv_entry_spare -= _NPCPV);
-			PV_STAT(pc_chunk_count--);
-			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
-			pmap_qremove((vm_offset_t)pc, 1);
-			vm_page_unwire(m, 0);
-			vm_page_free(m);
-			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
+			free_pv_chunk(pc);
 		}
 	}
 	PT_UPDATES_FLUSH();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/acpica/acpi_wakeup.c
--- a/head/sys/ia64/acpica/acpi_wakeup.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/acpica/acpi_wakeup.c	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/acpica/acpi_wakeup.c 236409 2012-06-01 17:07:52Z jkim $
  */
 
 #include <sys/param.h>
@@ -39,6 +39,13 @@
 	return (0);
 }
 
+int
+acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
+    int intr_enabled)
+{
+	return (0);
+}
+
 void
 acpi_install_wakeup_handler(struct acpi_softc *sc)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/busdma_machdep.c
--- a/head/sys/ia64/ia64/busdma_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/busdma_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/busdma_machdep.c 232356 2012-03-01 19:58:34Z jhb $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/busdma_machdep.c 238184 2012-07-07 00:25:17Z marcel $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -262,7 +262,7 @@
 			atomic_add_int(&parent->ref_count, 1);
 	}
 
-	if (newtag->lowaddr < ptoa(Maxmem) && (flags & BUS_DMA_ALLOCNOW) != 0) {
+	if (newtag->lowaddr < paddr_max && (flags & BUS_DMA_ALLOCNOW) != 0) {
 		/* Must bounce */
 
 		if (ptoa(total_bpages) < maxsize) {
@@ -340,7 +340,7 @@
 	 * exclusion region, a data alignment that is stricter than 1, and/or
 	 * an active address boundary.
 	 */
-	if (dmat->lowaddr < ptoa(Maxmem)) {
+	if (dmat->lowaddr < paddr_max) {
 		/* Must bounce */
 		int maxpages;
 
@@ -356,7 +356,7 @@
 		 * Attempt to add pages to our pool on a per-instance
 		 * basis up to a sane limit.
 		 */
-		maxpages = MIN(MAX_BPAGES, Maxmem - atop(dmat->lowaddr));
+		maxpages = MIN(MAX_BPAGES, atop(paddr_max - dmat->lowaddr));
 		if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0
 		 || (dmat->map_count > 0 && total_bpages < maxpages)) {
 			int pages;
@@ -438,7 +438,7 @@
 	 */
 	if ((dmat->maxsize <= PAGE_SIZE) &&
 	   (dmat->alignment < dmat->maxsize) &&
-	    dmat->lowaddr >= ptoa(Maxmem)) {
+	    dmat->lowaddr >= paddr_max) {
 		*vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
 	} else {
 		/*
@@ -473,7 +473,7 @@
 		panic("bus_dmamem_free: Invalid map freed\n");
 	if ((dmat->maxsize <= PAGE_SIZE) &&
 	   (dmat->alignment < dmat->maxsize) &&
-	    dmat->lowaddr >= ptoa(Maxmem))
+	    dmat->lowaddr >= paddr_max)
 		free(vaddr, M_DEVBUF);
 	else {
 		contigfree(vaddr, dmat->maxsize, M_DEVBUF);
@@ -506,7 +506,7 @@
 	else
 		pmap = NULL;
 
-	if ((dmat->lowaddr < ptoa(Maxmem) || dmat->boundary > 0 ||
+	if ((dmat->lowaddr < paddr_max || dmat->boundary > 0 ||
 	    dmat->alignment > 1) && map != &nobounce_dmamap &&
 	    map->pagesneeded == 0) {
 		vm_offset_t vendaddr;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/machdep.c
--- a/head/sys/ia64/ia64/machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/machdep.c 232250 2012-02-28 13:19:34Z gavin $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/machdep.c 238257 2012-07-08 18:00:22Z marcel $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -152,22 +152,11 @@
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
-
 struct msgbuf *msgbufp = NULL;
 
 /* Other subsystems (e.g., ACPI) can hook this later. */
 void (*cpu_idle_hook)(void) = NULL;
 
-long Maxmem = 0;
-long realmem = 0;
-
-#define	PHYSMAP_SIZE	(2 * VM_PHYSSEG_MAX)
-
-vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
-
-/* must be 2 less so 0 0 can signal end of chunks */
-#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
-
 struct kva_md_info kmi;
 
 #define	Mhz	1000000L
@@ -270,25 +259,8 @@
 #ifdef PERFMON
 	perfmon_init();
 #endif
-	printf("real memory  = %ld (%ld MB)\n", ia64_ptob(Maxmem),
-	    ia64_ptob(Maxmem) / 1048576);
-	realmem = Maxmem;
-
-	/*
-	 * Display any holes after the first chunk of extended memory.
-	 */
-	if (bootverbose) {
-		int indx;
-
-		printf("Physical memory chunk(s):\n");
-		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
-			long size1 = phys_avail[indx + 1] - phys_avail[indx];
-
-			printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n",
-			    phys_avail[indx], phys_avail[indx + 1] - 1, size1,
-			    size1 >> PAGE_SHIFT);
-		}
-	}
+	printf("real memory  = %ld (%ld MB)\n", ptoa(realmem),
+	    ptoa(realmem) / 1048576);
 
 	vm_ksubmap_init(&kmi);
 
@@ -534,6 +506,14 @@
 }
 
 void
+cpu_pcpu_setup(struct pcpu *pc, u_int acpi_id, u_int sapic_id)
+{
+
+	pc->pc_acpi_id = acpi_id;
+	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
+}
+ 
+void
 spinlock_enter(void)
 {
 	struct thread *td;
@@ -700,43 +680,86 @@
 ia64_init(void)
 {
 	struct ia64_init_return ret;
-	int phys_avail_cnt;
-	vm_offset_t kernstart, kernend;
-	vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1;
+	struct efi_md *md;
+	pt_entry_t *pbvm_pgtbl_ent, *pbvm_pgtbl_lim;
 	char *p;
-	struct efi_md *md;
+	vm_size_t mdlen;
 	int metadata_missing;
 
-	/* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */
+	/*
+	 * NO OUTPUT ALLOWED UNTIL FURTHER NOTICE.
+	 */
 
-	/*
-	 * TODO: Disable interrupts, floating point etc.
-	 * Maybe flush cache and tlb
-	 */
 	ia64_set_fpsr(IA64_FPSR_DEFAULT);
 
 	/*
-	 * TODO: Get critical system information (if possible, from the
-	 * information provided by the boot program).
+	 * Region 6 is direct mapped UC and region 7 is direct mapped
+	 * WC. The details of this is controlled by the Alt {I,D}TLB
+	 * handlers. Here we just make sure that they have the largest
+	 * possible page size to minimise TLB usage.
 	 */
+	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
+	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
+	ia64_srlz_d();
+
+	/* Initialize/setup physical memory datastructures */
+	ia64_physmem_init();
 
 	/*
-	 * Look for the I/O ports first - we need them for console
-	 * probing.
+	 * Process the memory map. This gives us the PAL locations,
+	 * the I/O port base address, the available memory regions
+	 * for initializing the physical memory map.
 	 */
 	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
+		mdlen = md->md_pages * EFI_PAGE_SIZE;
 		switch (md->md_type) {
 		case EFI_MD_TYPE_IOPORT:
 			ia64_port_base = (uintptr_t)pmap_mapdev(md->md_phys,
-			    md->md_pages * EFI_PAGE_SIZE);
+			    mdlen);
 			break;
 		case EFI_MD_TYPE_PALCODE:
-			ia64_pal_size = md->md_pages * EFI_PAGE_SIZE;
 			ia64_pal_base = md->md_phys;
+			ia64_pal_size = mdlen;
+			/*FALLTHROUGH*/
+		case EFI_MD_TYPE_BAD:
+		case EFI_MD_TYPE_FIRMWARE:
+		case EFI_MD_TYPE_RECLAIM:
+		case EFI_MD_TYPE_RT_CODE:
+		case EFI_MD_TYPE_RT_DATA:
+			/* Don't use these memory regions. */
+			ia64_physmem_track(md->md_phys, mdlen);
+			break;
+		case EFI_MD_TYPE_BS_CODE:
+		case EFI_MD_TYPE_BS_DATA:
+		case EFI_MD_TYPE_CODE:
+		case EFI_MD_TYPE_DATA:
+		case EFI_MD_TYPE_FREE:
+			/* These are ok to use. */
+			ia64_physmem_add(md->md_phys, mdlen);
 			break;
 		}
 	}
 
+	/*
+	 * Remove the PBVM and its page table from phys_avail. The loader
+	 * passes the physical address of the page table to us. The virtual
+	 * address of the page table is fixed.
+	 * Track and the PBVM limit for later use.
+	 */
+	ia64_physmem_delete(bootinfo->bi_pbvm_pgtbl, bootinfo->bi_pbvm_pgtblsz);
+	pbvm_pgtbl_ent = (void *)IA64_PBVM_PGTBL;
+	pbvm_pgtbl_lim = (void *)(IA64_PBVM_PGTBL + bootinfo->bi_pbvm_pgtblsz);
+	while (pbvm_pgtbl_ent < pbvm_pgtbl_lim) {
+		if ((*pbvm_pgtbl_ent & PTE_PRESENT) == 0)
+			break;
+		ia64_physmem_delete(*pbvm_pgtbl_ent & PTE_PPN_MASK,
+		    IA64_PBVM_PAGE_SIZE);
+		pbvm_pgtbl_ent++;
+	}
+
+	/* Finalize physical memory datastructures */
+	ia64_physmem_fini();
+
 	metadata_missing = 0;
 	if (bootinfo->bi_modulep)
 		preload_metadata = (caddr_t)bootinfo->bi_modulep;
@@ -757,31 +780,6 @@
 		bootverbose = 1;
 
 	/*
-	 * Find the beginning and end of the kernel.
-	 */
-	kernstart = trunc_page(kernel_text);
-#ifdef DDB
-	ksym_start = bootinfo->bi_symtab;
-	ksym_end = bootinfo->bi_esymtab;
-	kernend = (vm_offset_t)round_page(ksym_end);
-#else
-	kernend = (vm_offset_t)round_page(_end);
-#endif
-	/* But if the bootstrap tells us otherwise, believe it! */
-	if (bootinfo->bi_kernend)
-		kernend = round_page(bootinfo->bi_kernend);
-
-	/*
-	 * Region 6 is direct mapped UC and region 7 is direct mapped
-	 * WC. The details of this is controlled by the Alt {I,D}TLB
-	 * handlers. Here we just make sure that they have the largest
-	 * possible page size to minimise TLB usage.
-	 */
-	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
-	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
-	ia64_srlz_d();
-
-	/*
 	 * Wire things up so we can call the firmware.
 	 */
 	map_pal_code();
@@ -800,9 +798,8 @@
 	pcpup = &pcpu0;
 	ia64_set_k4((u_int64_t)pcpup);
 	pcpu_init(pcpup, 0, sizeof(pcpu0));
-	dpcpu_init((void *)kernend, 0);
-	PCPU_SET(md.lid, ia64_get_lid());
-	kernend += DPCPU_SIZE;
+	dpcpu_init(ia64_physmem_alloc(DPCPU_SIZE, PAGE_SIZE), 0);
+	cpu_pcpu_setup(pcpup, ~0U, ia64_get_lid());
 	PCPU_SET(curthread, &thread0);
 
 	/*
@@ -828,105 +825,20 @@
 		freeenv(p);
 	}
 
-	kernstartpfn = atop(IA64_RR_MASK(kernstart));
-	kernendpfn = atop(IA64_RR_MASK(kernend));
-
-	/*
-	 * Size the memory regions and load phys_avail[] with the results.
-	 */
-
-	/*
-	 * Find out how much memory is available, by looking at
-	 * the memory descriptors.
-	 */
-
-#ifdef DEBUG_MD
-	printf("Memory descriptor count: %d\n", mdcount);
-#endif
-
-	phys_avail_cnt = 0;
-	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
-#ifdef DEBUG_MD
-		printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md,
-		    md->md_type, md->md_phys, md->md_pages);
-#endif
-
-		pfn0 = ia64_btop(round_page(md->md_phys));
-		pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096));
-		if (pfn1 <= pfn0)
-			continue;
-
-		if (md->md_type != EFI_MD_TYPE_FREE)
-			continue;
-
-		/*
-		 * We have a memory descriptor that describes conventional
-		 * memory that is for general use. We must determine if the
-		 * loader has put the kernel in this region.
-		 */
-		physmem += (pfn1 - pfn0);
-		if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) {
-			/*
-			 * Must compute the location of the kernel
-			 * within the segment.
-			 */
-#ifdef DEBUG_MD
-			printf("Descriptor %p contains kernel\n", mp);
-#endif
-			if (pfn0 < kernstartpfn) {
-				/*
-				 * There is a chunk before the kernel.
-				 */
-#ifdef DEBUG_MD
-				printf("Loading chunk before kernel: "
-				       "0x%lx / 0x%lx\n", pfn0, kernstartpfn);
-#endif
-				phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
-				phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn);
-				phys_avail_cnt += 2;
-			}
-			if (kernendpfn < pfn1) {
-				/*
-				 * There is a chunk after the kernel.
-				 */
-#ifdef DEBUG_MD
-				printf("Loading chunk after kernel: "
-				       "0x%lx / 0x%lx\n", kernendpfn, pfn1);
-#endif
-				phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn);
-				phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
-				phys_avail_cnt += 2;
-			}
-		} else {
-			/*
-			 * Just load this cluster as one chunk.
-			 */
-#ifdef DEBUG_MD
-			printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i,
-			       pfn0, pfn1);
-#endif
-			phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
-			phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
-			phys_avail_cnt += 2;
-			
-		}
-	}
-	phys_avail[phys_avail_cnt] = 0;
-
-	Maxmem = physmem;
 	init_param2(physmem);
 
 	/*
 	 * Initialize error message buffer (at end of core).
 	 */
-	msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize);
+	msgbufp = ia64_physmem_alloc(msgbufsize, PAGE_SIZE);
 	msgbufinit(msgbufp, msgbufsize);
 
 	proc_linkup0(&proc0, &thread0);
 	/*
 	 * Init mapping for kernel stack for proc 0
 	 */
-	thread0.td_kstack = pmap_steal_memory(KSTACK_PAGES * PAGE_SIZE);
+	p = ia64_physmem_alloc(KSTACK_PAGES * PAGE_SIZE, PAGE_SIZE);
+	thread0.td_kstack = (uintptr_t)p;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 
 	mutex_init();
@@ -952,6 +864,11 @@
 	/*
 	 * Initialize debuggers, and break into them if appropriate.
 	 */
+#ifdef DDB
+	ksym_start = bootinfo->bi_symtab;
+	ksym_end = bootinfo->bi_esymtab;
+#endif
+
 	kdb_init();
 
 #ifdef KDB
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/mp_machdep.c
--- a/head/sys/ia64/ia64/mp_machdep.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/mp_machdep.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/mp_machdep.c 223758 2011-07-04 12:04:52Z attilio $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/mp_machdep.c 238257 2012-07-08 18:00:22Z marcel $");
 
 #include "opt_kstack_pages.h"
 
@@ -309,9 +309,8 @@
 	} else
 		pc = pcpup;
 
-	pc->pc_acpi_id = acpi_id;
-	pc->pc_md.lid = IA64_LID_SET_SAPIC_ID(sapic_id);
-
+	cpu_pcpu_setup(pc, acpi_id, sapic_id);
+ 
 	CPU_SET(pc->pc_cpuid, &all_cpus);
 }
 
@@ -466,6 +465,7 @@
 	 */
 	ia64_bind_intr();
 }
+SYSINIT(start_aps, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, cpu_mp_unleash, NULL);
 
 /*
  * send an IPI to a set of cpus.
@@ -522,5 +522,3 @@
 	ia64_mf_a();
 	CTR3(KTR_SMP, "ipi_send(%p, %d): cpuid=%d", cpu, xiv, PCPU_GET(cpuid));
 }
-
-SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, cpu_mp_unleash, NULL);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/nexus.c
--- a/head/sys/ia64/ia64/nexus.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/nexus.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/ia64/ia64/nexus.c 224184 2011-07-18 14:04:37Z jhb $
+ * $FreeBSD: head/sys/ia64/ia64/nexus.c 235041 2012-05-04 23:16:29Z marcel $
  */
 
 /*
@@ -65,9 +65,6 @@
 
 #include <dev/acpica/acpivar.h>
 
-#include <isa/isareg.h>
-#include <sys/rtprio.h>
-
 #include "clock_if.h"
 
 static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device");
@@ -191,12 +188,6 @@
 nexus_attach(device_t dev)
 {
 
-	/*
-	 * Mask the legacy PICs - we will use the I/O SAPIC for interrupt.
-	 */
-	outb(IO_ICU1+1, 0xff);
-	outb(IO_ICU2+1, 0xff);
-
 	if (acpi_identify() == 0)
 		BUS_ADD_CHILD(dev, 10, "acpi", 0);
 	clock_register(dev, 1000);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/physmem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/ia64/ia64/physmem.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,258 @@
+/*-
+ * Copyright (c) 2012 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/physmem.c 238190 2012-07-07 05:17:43Z marcel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <machine/md_var.h>
+#include <machine/vmparam.h>
+
+static u_int phys_avail_segs;
+
+vm_paddr_t phys_avail[2 * VM_PHYSSEG_MAX + 2];
+
+vm_paddr_t paddr_max;
+
+long realmem;
+
+static u_int
+ia64_physmem_find(vm_paddr_t base, vm_paddr_t lim)
+{
+	u_int idx;
+
+	for (idx = 0; phys_avail[idx + 1] != 0; idx += 2) {
+		if (phys_avail[idx] >= lim ||
+		    phys_avail[idx + 1] > base)
+			break;
+	}
+	return (idx);
+}
+
+static int
+ia64_physmem_insert(u_int idx, vm_paddr_t base, vm_paddr_t lim)
+{
+	u_int ridx;
+
+	if (phys_avail_segs == VM_PHYSSEG_MAX)
+		return (ENOMEM);
+
+	ridx = phys_avail_segs * 2;
+	while (idx < ridx) {
+		phys_avail[ridx + 1] = phys_avail[ridx - 1];
+		phys_avail[ridx] = phys_avail[ridx - 2];
+		ridx -= 2;
+	}
+	phys_avail[idx] = base;
+	phys_avail[idx + 1] = lim;
+	phys_avail_segs++;
+	return (0);
+}
+
+static int
+ia64_physmem_remove(u_int idx)
+{
+
+	if (phys_avail_segs == 0)
+		return (ENOENT);
+	do {
+		phys_avail[idx] = phys_avail[idx + 2];
+		phys_avail[idx + 1] = phys_avail[idx + 3];
+		idx += 2;
+	} while (phys_avail[idx + 1] != 0);
+	phys_avail_segs--;
+	return (0);
+}
+
+int
+ia64_physmem_add(vm_paddr_t base, vm_size_t len)
+{
+	vm_paddr_t lim;
+	u_int idx;
+
+	realmem += len;
+
+	lim = base + len;
+	idx = ia64_physmem_find(base, lim);
+	if (phys_avail[idx] == lim) {
+		phys_avail[idx] = base;
+		return (0);
+	}
+	if (idx > 0 && phys_avail[idx - 1] == base) {
+		phys_avail[idx - 1] = lim;
+		return (0);
+	}
+	return (ia64_physmem_insert(idx, base, lim));
+}
+
+int
+ia64_physmem_delete(vm_paddr_t base, vm_size_t len)
+{
+	vm_paddr_t lim;
+	u_int idx;
+
+	lim = base + len;
+	idx = ia64_physmem_find(base, lim);
+	if (phys_avail[idx] >= lim || phys_avail[idx + 1] == 0)
+		return (ENOENT);
+	if (phys_avail[idx] < base && phys_avail[idx + 1] > lim) {
+		len = phys_avail[idx + 1] - lim;
+		phys_avail[idx + 1] = base;
+		base = lim;
+		lim = base + len;
+		return (ia64_physmem_insert(idx + 2, base, lim));
+	} else {
+		if (phys_avail[idx] == base)
+			phys_avail[idx] = lim;
+		if (phys_avail[idx + 1] == lim)
+			phys_avail[idx + 1] = base;
+		if (phys_avail[idx] >= phys_avail[idx + 1])
+			return (ia64_physmem_remove(idx));
+	}
+	return (0);
+}
+
+int
+ia64_physmem_fini(void)
+{
+	vm_paddr_t base, lim, size;
+	u_int idx;
+
+	idx = 0;
+	while (phys_avail[idx + 1] != 0) {
+		base = round_page(phys_avail[idx]);
+		lim = trunc_page(phys_avail[idx + 1]);
+		if (base < lim) {
+			phys_avail[idx] = base;
+			phys_avail[idx + 1] = lim;
+			size = lim - base;
+			physmem += atop(size);
+			paddr_max = lim;
+			idx += 2;
+		} else
+			ia64_physmem_remove(idx);
+	}
+
+	/*
+	 * Round realmem to a multple of 128MB. Hopefully that compensates
+	 * for any loss of DRAM that isn't accounted for in the memory map.
+	 * I'm thinking legacy BIOS or VGA here. In any case, it's ok if
+	 * we got it wrong, because we don't actually use realmem. It's
+	 * just for show...
+	 */
+	size = 1U << 27;
+	realmem = (realmem + size - 1) & ~(size - 1);
+	realmem = atop(realmem);
+	return (0);
+}
+
+int
+ia64_physmem_init(void)
+{
+
+	/* Nothing to do just yet. */
+	return (0);
+}
+
+int
+ia64_physmem_track(vm_paddr_t base, vm_size_t len)
+{
+
+	realmem += len;
+	return (0);
+}
+
+void *
+ia64_physmem_alloc(vm_size_t len, vm_size_t align)
+{
+	vm_paddr_t base, lim, pa;
+	void *ptr;
+	u_int idx;
+
+	if (phys_avail_segs == 0)
+		return (NULL);
+
+	len = round_page(len);
+
+	/*
+	 * Try and allocate with least effort.
+	 */
+	idx = phys_avail_segs * 2;
+	while (idx > 0) {
+		idx -= 2;
+		base = phys_avail[idx];
+		lim = phys_avail[idx + 1];
+
+		if (lim - base < len)
+			continue;
+
+		/* First try from the end. */
+		pa = lim - len;
+		if ((pa & (align - 1)) == 0) {
+			if (pa == base)
+				ia64_physmem_remove(idx);
+			else
+				phys_avail[idx + 1] = pa;
+			goto gotit;
+		}
+
+		/* Try from the start next. */
+		pa = base;
+		if ((pa & (align - 1)) == 0) {
+			if (pa + len == lim)
+				ia64_physmem_remove(idx);
+			else
+				phys_avail[idx] += len;
+			goto gotit;
+		}
+	}
+
+	/*
+	 * Find a good segment and split it up.
+	 */
+	idx = phys_avail_segs * 2;
+	while (idx > 0) {
+		idx -= 2;
+		base = phys_avail[idx];
+		lim = phys_avail[idx + 1];
+
+		pa = (base + align - 1) & ~(align - 1);
+		if (pa + len <= lim) {
+			ia64_physmem_delete(pa, len);
+			goto gotit;
+		}
+	}
+
+	/* Out of luck. */
+	return (NULL);
+
+ gotit:
+	ptr = (void *)IA64_PHYS_TO_RR7(pa);
+	bzero(ptr, len);
+	return (ptr);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/ia64/pmap.c
--- a/head/sys/ia64/ia64/pmap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/ia64/pmap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -46,7 +46,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/ia64/ia64/pmap.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/ia64/ia64/pmap.c 238190 2012-07-07 05:17:43Z marcel $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -243,36 +243,6 @@
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 		    vm_page_t m);
 
-vm_offset_t
-pmap_steal_memory(vm_size_t size)
-{
-	vm_size_t bank_size;
-	vm_offset_t pa, va;
-
-	size = round_page(size);
-
-	bank_size = phys_avail[1] - phys_avail[0];
-	while (size > bank_size) {
-		int i;
-		for (i = 0; phys_avail[i+2]; i+= 2) {
-			phys_avail[i] = phys_avail[i+2];
-			phys_avail[i+1] = phys_avail[i+3];
-		}
-		phys_avail[i] = 0;
-		phys_avail[i+1] = 0;
-		if (!phys_avail[0])
-			panic("pmap_steal_memory: out of memory");
-		bank_size = phys_avail[1] - phys_avail[0];
-	}
-
-	pa = phys_avail[0];
-	phys_avail[0] += size;
-
-	va = IA64_PHYS_TO_RR7(pa);
-	bzero((caddr_t) va, size);
-	return va;
-}
-
 static void
 pmap_initialize_vhpt(vm_offset_t vhpt)
 {
@@ -289,21 +259,23 @@
 }
 
 #ifdef SMP
-MALLOC_DECLARE(M_SMP);
-
 vm_offset_t
 pmap_alloc_vhpt(void)
 {
 	vm_offset_t vhpt;
+	vm_page_t m;
 	vm_size_t size;
 
 	size = 1UL << pmap_vhpt_log2size;
-	vhpt = (uintptr_t)contigmalloc(size, M_SMP, 0, 0UL, ~0UL, size, 0UL);
-	if (vhpt != 0) {
-		vhpt = IA64_PHYS_TO_RR7(ia64_tpa(vhpt));
+	m = vm_page_alloc_contig(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+	    VM_ALLOC_WIRED, atop(size), 0UL, ~0UL, size, 0UL,
+	    VM_MEMATTR_DEFAULT);
+	if (m != NULL) {
+		vhpt = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
 		pmap_initialize_vhpt(vhpt);
+		return (vhpt);
 	}
-	return (vhpt);
+	return (0);
 }
 #endif
 
@@ -316,7 +288,7 @@
 	struct ia64_pal_result res;
 	vm_offset_t base;
 	size_t size;
-	int i, j, count, ridbits;
+	int i, ridbits;
 
 	/*
 	 * Query the PAL Code to find the loop parameters for the
@@ -378,7 +350,7 @@
 
 	pmap_ridmax = (1 << ridbits);
 	pmap_ridmapsz = pmap_ridmax / 64;
-	pmap_ridmap = (uint64_t *)pmap_steal_memory(pmap_ridmax / 8);
+	pmap_ridmap = ia64_physmem_alloc(pmap_ridmax / 8, PAGE_SIZE);
 	pmap_ridmap[0] |= 0xff;
 	pmap_rididx = 0;
 	pmap_ridcount = 8;
@@ -387,14 +359,10 @@
 	/*
 	 * Allocate some memory for initial kernel 'page tables'.
 	 */
-	ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE);
+	ia64_kptdir = ia64_physmem_alloc(PAGE_SIZE, PAGE_SIZE);
 	nkpt = 0;
 	kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 
-	for (i = 0; phys_avail[i+2]; i+= 2)
-		;
-	count = i+2;
-
 	/*
 	 * Determine a valid (mappable) VHPT size.
 	 */
@@ -408,35 +376,18 @@
 	if (pmap_vhpt_log2size & 1)
 		pmap_vhpt_log2size--;
 
-	base = 0;
 	size = 1UL << pmap_vhpt_log2size;
-	for (i = 0; i < count; i += 2) {
-		base = (phys_avail[i] + size - 1) & ~(size - 1);
-		if (base + size <= phys_avail[i+1])
-			break;
-	}
-	if (!phys_avail[i])
+	base = (uintptr_t)ia64_physmem_alloc(size, size);
+	if (base == 0)
 		panic("Unable to allocate VHPT");
 
-	if (base != phys_avail[i]) {
-		/* Split this region. */
-		for (j = count; j > i; j -= 2) {
-			phys_avail[j] = phys_avail[j-2];
-			phys_avail[j+1] = phys_avail[j-2+1];
-		}
-		phys_avail[i+1] = base;
-		phys_avail[i+2] = base + size;
-	} else
-		phys_avail[i] = base + size;
-
-	base = IA64_PHYS_TO_RR7(base);
 	PCPU_SET(md.vhpt, base);
 	if (bootverbose)
 		printf("VHPT: address=%#lx, size=%#lx\n", base, size);
 
 	pmap_vhpt_nbuckets = size / sizeof(struct ia64_lpte);
-	pmap_vhpt_bucket = (void *)pmap_steal_memory(pmap_vhpt_nbuckets *
-	    sizeof(struct ia64_bucket));
+	pmap_vhpt_bucket = ia64_physmem_alloc(pmap_vhpt_nbuckets *
+	    sizeof(struct ia64_bucket), PAGE_SIZE);
 	for (i = 0; i < pmap_vhpt_nbuckets; i++) {
 		/* Stolen memory is zeroed. */
 		mtx_init(&pmap_vhpt_bucket[i].mutex, "VHPT bucket lock", NULL,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/_stdint.h
--- a/head/sys/ia64/include/_stdint.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/_stdint.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/_stdint.h 237517 2012-06-24 04:15:58Z andrew $
  */
 
 #ifndef	_MACHINE__STDINT_H_
@@ -149,12 +149,6 @@
 /* Limit of size_t. */
 #define	SIZE_MAX	UINT64_MAX
 
-#ifndef WCHAR_MIN /* Also possibly defined in <wchar.h> */
-/* Limits of wchar_t. */
-#define	WCHAR_MIN	INT32_MIN
-#define	WCHAR_MAX	INT32_MAX
-#endif
-
 /* Limits of wint_t. */
 #define	WINT_MIN	INT32_MIN
 #define	WINT_MAX	INT32_MAX
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/_types.h
--- a/head/sys/ia64/include/_types.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/_types.h	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  *
  *	From: @(#)ansi.h	8.2 (Berkeley) 1/4/94
  *	From: @(#)types.h	8.3 (Berkeley) 1/5/94
- * $FreeBSD: head/sys/ia64/include/_types.h 228469 2011-12-13 13:38:03Z ed $
+ * $FreeBSD: head/sys/ia64/include/_types.h 237517 2012-06-24 04:15:58Z andrew $
  */
 
 #ifndef _MACHINE__TYPES_H_
@@ -96,6 +96,10 @@
 typedef	__uint64_t	__vm_paddr_t;
 typedef	__uint64_t	__vm_pindex_t;
 typedef	__uint64_t	__vm_size_t;
+typedef	int		__wchar_t;
+
+#define	__WCHAR_MIN	__INT_MIN	/* min value for a wchar_t */
+#define	__WCHAR_MAX	__INT_MAX	/* max value for a wchar_t */
 
 /*
  * Unusual type definitions.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/elf.h
--- a/head/sys/ia64/include/elf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/elf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/elf.h 237430 2012-06-22 06:38:31Z kib $
  */
 
 #ifndef _MACHINE_ELF_H_
@@ -95,6 +95,7 @@
 #define	AT_NCPUS	19	/* Number of CPUs. */
 #define	AT_PAGESIZES	20	/* Pagesizes. */
 #define	AT_PAGESIZESLEN	21	/* Number of pagesizes. */
+#define	AT_TIMEKEEP	22	/* Pointer to timehands. */
 #define	AT_STACKPROT	23	/* Initial stack protection. */
 
 #define	AT_COUNT	24	/* Count of defined aux entry types. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/in_cksum.h
--- a/head/sys/ia64/include/in_cksum.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/in_cksum.h	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  *	from tahoe:	in_cksum.c	1.2	86/01/05
  *	from:		@(#)in_cksum.c	1.3 (Berkeley) 1/19/91
  *	from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/in_cksum.h 235941 2012-05-24 22:00:48Z bz $
  */
 
 #ifndef _MACHINE_IN_CKSUM_H_
@@ -39,6 +39,7 @@
 
 #define in_cksum(m, len)	in_cksum_skip(m, len, 0)
 
+#if defined(IPVERSION) && (IPVERSION == 4)
 /*
  * It it useful to have an Internet checksum routine which is inlineable
  * and optimized specifically for the task of computing IP header checksums
@@ -65,9 +66,12 @@
 	} while(0)
 
 #endif
+#endif
 
 #ifdef _KERNEL
+#if defined(IPVERSION) && (IPVERSION == 4)
 u_int in_cksum_hdr(const struct ip *ip);
+#endif
 u_short	in_addword(u_short sum, u_short b);
 u_short	in_pseudo(u_int sum, u_int b, u_int c);
 u_short	in_cksum_skip(struct mbuf *m, int len, int skip);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/md_var.h
--- a/head/sys/ia64/include/md_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/md_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/ia64/include/md_var.h 238257 2012-07-08 18:00:22Z marcel $
  */
 
 #ifndef _MACHINE_MD_VAR_H_
@@ -61,6 +61,7 @@
 #ifdef _KERNEL
 
 struct _special;
+struct pcpu;
 struct thread;
 struct trapframe;
 
@@ -73,14 +74,14 @@
 };
 
 extern uint64_t ia64_lapic_addr;
-
-extern long Maxmem;
+extern vm_paddr_t paddr_max;
 extern u_int busdma_swi_pending;
 
 void	*acpi_find_table(const char *sig);
 void	busdma_swi(void);
 int	copyout_regstack(struct thread *, uint64_t *, uint64_t *);
 void	cpu_mp_add(u_int, u_int, u_int);
+void	cpu_pcpu_setup(struct pcpu *, u_int, u_int);
 int	do_ast(struct trapframe *);
 void	ia32_trap(int, struct trapframe *);
 int	ia64_count_cpus(void);
@@ -93,6 +94,12 @@
 int	ia64_highfp_save_ipi(void);
 struct ia64_init_return ia64_init(void);
 u_int	ia64_itc_freq(void);
+int	ia64_physmem_add(vm_paddr_t, vm_size_t);
+void	*ia64_physmem_alloc(vm_size_t, vm_size_t);
+int	ia64_physmem_delete(vm_paddr_t, vm_size_t);
+int	ia64_physmem_fini(void);
+int	ia64_physmem_init(void);
+int	ia64_physmem_track(vm_paddr_t, vm_size_t);
 void	ia64_probe_sapics(void);
 void	ia64_sync_icache(vm_offset_t, vm_size_t);
 void	interrupt(struct trapframe *);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/param.h
--- a/head/sys/ia64/include/param.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/param.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
-/* $FreeBSD: head/sys/ia64/include/param.h 224217 2011-07-19 13:00:30Z attilio $ */
+/* $FreeBSD: head/sys/ia64/include/param.h 238184 2012-07-07 00:25:17Z marcel $ */
 /* From: NetBSD: param.h,v 1.20 1997/09/19 13:52:53 leo Exp */
 
 /*-
@@ -110,9 +110,6 @@
 #define atop(x)			((unsigned long)(x) >> PAGE_SHIFT)
 #define ptoa(x)			((unsigned long)(x) << PAGE_SHIFT)
 
-#define	ia64_btop(x)		((unsigned long)(x) >> PAGE_SHIFT)
-#define	ia64_ptob(x)		((unsigned long)(x) << PAGE_SHIFT)
-
 #define pgtok(x)                ((x) * (PAGE_SIZE / 1024)) 
 
 #endif	/* !_IA64_INCLUDE_PARAM_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/pcb.h
--- a/head/sys/ia64/include/pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *	$FreeBSD$
+ *	$FreeBSD: head/sys/ia64/include/pcb.h 234785 2012-04-29 11:04:31Z dim $
  */
 
 #ifndef _MACHINE_PCB_H_
@@ -65,10 +65,10 @@
 
 void makectx(struct trapframe *, struct pcb *);
 void restorectx(struct pcb *) __dead2;
-int swapctx(struct pcb *old, struct pcb *new);
+int swapctx(struct pcb *old, struct pcb *new) __returns_twice;
 
 void ia32_restorectx(struct pcb *);
-void ia32_savectx(struct pcb *);
+void ia32_savectx(struct pcb *) __returns_twice;
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/pmap.h
--- a/head/sys/ia64/include/pmap.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/ia64/include/pmap.h	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  *	from: hp300: @(#)pmap.h	7.2 (Berkeley) 12/16/90
  *	from: @(#)pmap.h	7.4 (Berkeley) 5/12/91
  *	from: i386 pmap.h,v 1.54 1997/11/20 19:30:35 bde Exp
- * $FreeBSD: head/sys/ia64/include/pmap.h 223873 2011-07-08 16:30:54Z marcel $
+ * $FreeBSD: head/sys/ia64/include/pmap.h 237168 2012-06-16 18:56:19Z alc $
  */
 
 #ifndef _MACHINE_PMAP_H_
@@ -118,6 +118,7 @@
 
 #define	pmap_page_get_memattr(m)	((m)->md.memattr)
 #define	pmap_page_is_mapped(m)	(!TAILQ_EMPTY(&(m)->md.pv_list))
+#define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_mapbios(pa, sz)	pmap_mapdev(pa, sz)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev(va, sz)
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/ia64/include/vdso.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/ia64/include/vdso.h	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,41 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/ia64/include/vdso.h 237433 2012-06-22 07:06:40Z kib $
+ */
+
+#ifndef _IA64_VDSO_H
+#define	_IA64_VDSO_H
+
+#define	VDSO_TIMEHANDS_MD			\
+	uint32_t	th_res[8];
+
+#ifdef _KERNEL
+#ifdef COMPAT_FREEBSD32
+
+#define	VDSO_TIMEHANDS_MD32	VDSO_TIMEHANDS_MD
+
+#endif
+#endif
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/capabilities.conf
--- a/head/sys/kern/capabilities.conf	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/capabilities.conf	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
 ## - sys_exit(2), abort2(2) and close(2) are very important.
 ## - Sorted alphabetically, please keep it that way.
 ##
-## $FreeBSD: head/sys/kern/capabilities.conf 224987 2011-08-18 22:51:30Z jonathan $
+## $FreeBSD: head/sys/kern/capabilities.conf 236361 2012-05-31 19:32:37Z pjd $
 ##
 
 ##
@@ -445,13 +445,17 @@
 faccessat
 fstatat
 fchmodat
+fchownat
 futimesat
+linkat
 mkdirat
-rmdirat
 mkfifoat
 mknodat
 openat
+readlinkat
 renameat
+symlinkat
+unlinkat
 
 ##
 ## Allow entry into open(2). This system call will fail, since access to the
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/dtio_kdtrace.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/dtio_kdtrace.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2012 Advanced Computing Technologies LLC
+ * Written by George Neville-Neil gnn at freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/dtio_kdtrace.c 238366 2012-07-11 16:27:02Z gnn $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+#include "../sys/dtrace_bsd.h"
+
+
+static int	dtio_unload(void);
+static void	dtio_getargdesc(void *, dtrace_id_t, void *,
+		    dtrace_argdesc_t *);
+static void	dtio_provide(void *, dtrace_probedesc_t *);
+static void	dtio_destroy(void *, dtrace_id_t, void *);
+static void	dtio_enable(void *, dtrace_id_t, void *);
+static void	dtio_disable(void *, dtrace_id_t, void *);
+static void	dtio_load(void *);
+
+static dtrace_pattr_t dtio_attr = {
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+};
+
+static char    *genunix = "genunix";
+
+/*
+ * Name strings.
+ */
+static char	*dtio_start_str = "start";
+static char	*dtio_done_str = "done";
+static char	*dtio_wait_start_str = "wait-start";
+static char	*dtio_wait_done_str = "wait-done";
+
+static dtrace_pops_t dtio_pops = {
+	dtio_provide,
+	NULL,
+	dtio_enable,
+	dtio_disable,
+	NULL,
+	NULL,
+	dtio_getargdesc,
+	NULL,
+	NULL,
+	dtio_destroy
+};
+
+static dtrace_provider_id_t	dtio_id;
+
+extern uint32_t	dtio_start_id;
+extern uint32_t	dtio_done_id;
+extern uint32_t	dtio_wait_start_id;
+extern uint32_t	dtio_wait_done_id;
+
+static void
+dtio_getargdesc(void *arg, dtrace_id_t id, void *parg,
+    dtrace_argdesc_t *desc)
+{
+	const char *p = NULL;
+
+	switch (desc->dtargd_ndx) {
+	case 0:
+		p = "struct bio *";
+		break;
+	case 1:
+		p = "struct devstat *";
+		break;
+	default:
+		desc->dtargd_ndx = DTRACE_ARGNONE;
+	}
+
+	if (p != NULL)
+		strlcpy(desc->dtargd_native, p, sizeof(desc->dtargd_native));
+}
+
+static void
+dtio_provide(void *arg, dtrace_probedesc_t *desc)
+{
+	if (desc != NULL)
+		return;
+
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_start_str) == 0) {
+		dtio_start_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_start_str, 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, dtio_done_str) == 0) {
+		dtio_done_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_done_str, 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_wait_start_str) == 0) {
+		dtio_wait_start_id = dtrace_probe_create(dtio_id, genunix, 
+							 NULL, 
+							 dtio_wait_start_str, 
+							 0, NULL);
+	}
+	if (dtrace_probe_lookup(dtio_id, genunix, NULL, 
+				dtio_wait_done_str) == 0) {
+		dtio_wait_done_id = dtrace_probe_create(dtio_id, genunix, NULL, 
+						   dtio_wait_done_str, 0, NULL);
+	}
+
+}
+
+static void
+dtio_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+static void
+dtio_enable(void *arg, dtrace_id_t id, void *parg)
+{
+	if (id == dtio_start_id)
+		dtrace_io_start_probe =
+			(dtrace_io_start_probe_func_t)dtrace_probe;
+	else if (id == dtio_done_id)
+		dtrace_io_done_probe =
+			(dtrace_io_done_probe_func_t)dtrace_probe;
+	else if (id == dtio_wait_start_id)
+		dtrace_io_wait_start_probe =
+			(dtrace_io_wait_start_probe_func_t)dtrace_probe;
+	else if (id == dtio_wait_done_id)
+		dtrace_io_wait_done_probe =
+			(dtrace_io_wait_done_probe_func_t)dtrace_probe;
+	else
+		printf("dtrace io provider: unknown ID\n");
+
+}
+
+static void
+dtio_disable(void *arg, dtrace_id_t id, void *parg)
+{
+	if (id == dtio_start_id)
+		dtrace_io_start_probe = NULL;
+	else if (id == dtio_done_id)
+		dtrace_io_done_probe = NULL;
+	else if (id == dtio_wait_start_id)
+		dtrace_io_wait_start_probe = NULL;
+	else if (id == dtio_wait_done_id)
+		dtrace_io_wait_done_probe = NULL;
+	else 
+		printf("dtrace io provider: unknown ID\n");
+	
+}
+
+static void
+dtio_load(void *dummy)
+{
+	if (dtrace_register("io", &dtio_attr, DTRACE_PRIV_USER, NULL, 
+			    &dtio_pops, NULL, &dtio_id) != 0)
+		return;
+}
+
+
+static int
+dtio_unload()
+{
+	dtrace_io_start_probe = NULL;
+	dtrace_io_done_probe = NULL;
+	dtrace_io_wait_start_probe = NULL;
+	dtrace_io_wait_done_probe = NULL;
+
+	return (dtrace_unregister(dtio_id));
+}
+
+static int
+dtio_modevent(module_t mod __unused, int type, void *data __unused)
+{
+	int error = 0;
+
+	switch (type) {
+	case MOD_LOAD:
+		break;
+
+	case MOD_UNLOAD:
+		break;
+
+	case MOD_SHUTDOWN:
+		break;
+
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
+
+SYSINIT(dtio_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    dtio_load, NULL);
+SYSUNINIT(dtio_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    dtio_unload, NULL);
+
+DEV_MODULE(dtio, dtio_modevent, NULL);
+MODULE_VERSION(dtio, 1);
+MODULE_DEPEND(dtio, dtrace, 1, 1, 1);
+MODULE_DEPEND(dtio, opensolaris, 1, 1, 1);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_aout.c
--- a/head/sys/kern/imgact_aout.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_aout.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_aout.c 223165 2011-06-16 22:00:59Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_aout.c 238687 2012-07-22 13:41:45Z kib $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -106,6 +106,7 @@
 #define	AOUT32_USRSTACK	0xbfc00000
 #define	AOUT32_PS_STRINGS \
     (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
+#define	AOUT32_MINUSER	FREEBSD32_MINUSER
 
 extern const char *freebsd32_syscallnames[];
 extern u_long ia32_maxssiz;
@@ -129,7 +130,7 @@
 	.sv_imgact_try	= NULL,
 	.sv_minsigstksz	= MINSIGSTKSZ,
 	.sv_pagesize	= IA32_PAGE_SIZE,
-	.sv_minuser	= 0,
+	.sv_minuser	= AOUT32_MINUSER,
 	.sv_maxuser	= AOUT32_USRSTACK,
 	.sv_usrstack	= AOUT32_USRSTACK,
 	.sv_psstrings	= AOUT32_PS_STRINGS,
@@ -174,9 +175,9 @@
 	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
 	 * NetBSD is in network byte order.. ugh.
 	 */
-	if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
-	    ((a_out->a_magic >> 16) & 0xff) != 0 &&
-	    ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
+	if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
+	    ((a_out->a_midmag >> 16) & 0xff) != 0 &&
+	    ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
                 return -1;
 
 	/*
@@ -184,7 +185,7 @@
 	 *	We do two cases: host byte order and network byte order
 	 *	(for NetBSD compatibility)
 	 */
-	switch ((int)(a_out->a_magic & 0xffff)) {
+	switch ((int)(a_out->a_midmag & 0xffff)) {
 	case ZMAGIC:
 		virtual_offset = 0;
 		if (a_out->a_text) {
@@ -203,7 +204,7 @@
 		break;
 	default:
 		/* NetBSD compatibility */
-		switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
+		switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			virtual_offset = PAGE_SIZE;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_elf.c
--- a/head/sys/kern/imgact_elf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_elf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_elf.c 232828 2012-03-11 19:38:49Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_elf.c 238617 2012-07-19 11:15:53Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -83,7 +83,7 @@
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
-    const char *interp, int32_t *osrel);
+    const char *interp, int interp_name_len, int32_t *osrel);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry, size_t pagesize);
 static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
@@ -254,7 +254,7 @@
 
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
-    int32_t *osrel)
+    int interp_name_len, int32_t *osrel)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	Elf_Brandinfo *bi;
@@ -300,7 +300,10 @@
 			if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
 				continue;
 			if (hdr->e_machine == bi->machine &&
-			    strcmp(interp, bi->interp_path) == 0)
+			    /* ELF image p_filesz includes terminating zero */
+			    strlen(bi->interp_path) + 1 == interp_name_len &&
+			    strncmp(interp, bi->interp_path, interp_name_len)
+			    == 0)
 				return (bi);
 		}
 	}
@@ -722,7 +725,7 @@
 	u_long seg_size, seg_addr;
 	u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0;
 	int32_t osrel = 0;
-	int error = 0, i, n;
+	int error = 0, i, n, interp_name_len = 0;
 	const char *interp = NULL, *newinterp = NULL;
 	Elf_Brandinfo *brand_info;
 	char *path;
@@ -763,9 +766,11 @@
 		case PT_INTERP:
 			/* Path to interpreter */
 			if (phdr[i].p_filesz > MAXPATHLEN ||
-			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE)
+			    phdr[i].p_offset >= PAGE_SIZE ||
+			    phdr[i].p_offset + phdr[i].p_filesz >= PAGE_SIZE)
 				return (ENOEXEC);
 			interp = imgp->image_header + phdr[i].p_offset;
+			interp_name_len = phdr[i].p_filesz;
 			break;
 		case PT_GNU_STACK:
 			if (__elfN(nxstack))
@@ -775,7 +780,8 @@
 		}
 	}
 
-	brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel);
+	brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
+	    &osrel);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
@@ -1011,6 +1017,10 @@
 		AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
 		AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
 	}
+	if (imgp->sysent->sv_timekeep_base != 0) {
+		AUXARGS_ENTRY(pos, AT_TIMEKEEP,
+		    imgp->sysent->sv_timekeep_base);
+	}
 	AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
 	    != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 	    imgp->sysent->sv_stackprot);
@@ -1558,6 +1568,7 @@
 	int i;
 
 	if (pnote == NULL || pnote->p_offset >= PAGE_SIZE ||
+	    pnote->p_filesz > PAGE_SIZE ||
 	    pnote->p_offset + pnote->p_filesz >= PAGE_SIZE)
 		return (FALSE);
 
@@ -1565,15 +1576,17 @@
 	note_end = (const Elf_Note *)(imgp->image_header +
 	    pnote->p_offset + pnote->p_filesz);
 	for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
-		if (!aligned(note, Elf32_Addr))
+		if (!aligned(note, Elf32_Addr) || (const char *)note_end -
+		    (const char *)note < sizeof(Elf_Note))
 			return (FALSE);
 		if (note->n_namesz != checknote->hdr.n_namesz ||
 		    note->n_descsz != checknote->hdr.n_descsz ||
 		    note->n_type != checknote->hdr.n_type)
 			goto nextnote;
 		note_name = (const char *)(note + 1);
-		if (strncmp(checknote->vendor, note_name,
-		    checknote->hdr.n_namesz) != 0)
+		if (note_name + checknote->hdr.n_namesz >=
+		    (const char *)note_end || strncmp(checknote->vendor,
+		    note_name, checknote->hdr.n_namesz) != 0)
 			goto nextnote;
 
 		/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/imgact_gzip.c
--- a/head/sys/kern/imgact_gzip.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/imgact_gzip.c	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/imgact_gzip.c 231885 2012-02-17 23:47:16Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/imgact_gzip.c 237694 2012-06-28 07:33:43Z imp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -161,7 +161,7 @@
 	 * Set file/virtual offset based on a.out variant. We do two cases:
 	 * host byte order and network byte order (for NetBSD compatibility)
 	 */
-	switch ((int) (gz->a_out.a_magic & 0xffff)) {
+	switch ((int) (gz->a_out.a_midmag & 0xffff)) {
 	case ZMAGIC:
 		gz->virtual_offset = 0;
 		if (gz->a_out.a_text) {
@@ -177,7 +177,7 @@
 		break;
 	default:
 		/* NetBSD compatibility */
-		switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
+		switch ((int) (ntohl(gz->a_out.a_midmag) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			gz->virtual_offset = PAGE_SIZE;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/init_main.c
--- a/head/sys/kern/init_main.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/init_main.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,7 +42,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/init_main.c 230455 2012-01-22 11:01:36Z pjd $");
+__FBSDID("$FreeBSD: head/sys/kern/init_main.c 236404 2012-06-01 15:42:37Z jhb $");
 
 #include "opt_ddb.h"
 #include "opt_init_path.h"
@@ -158,6 +158,24 @@
 	newsysinit_end = newset + count;
 }
 
+#if defined (DDB) && defined(VERBOSE_SYSINIT)
+static const char *
+symbol_name(vm_offset_t va, db_strategy_t strategy)
+{
+	const char *name;
+	c_db_sym_t sym;
+	db_expr_t  offset;
+
+	if (va == 0)
+		return (NULL);
+	sym = db_search_symbol(va, strategy, &offset);
+	if (offset != 0)
+		return (NULL);
+	db_symbol_values(sym, &name, NULL);
+	return (name);
+}
+#endif
+
 /*
  * System startup; initialize the world, create process 0, mount root
  * filesystem, and fork to create init and pagedaemon.  Most of the
@@ -238,15 +256,16 @@
 		}
 		if (verbose) {
 #if defined(DDB)
-			const char *name;
-			c_db_sym_t sym;
-			db_expr_t  offset;
+			const char *func, *data;
 
-			sym = db_search_symbol((vm_offset_t)(*sipp)->func,
-			    DB_STGY_PROC, &offset);
-			db_symbol_values(sym, &name, NULL);
-			if (name != NULL)
-				printf("   %s(%p)... ", name, (*sipp)->udata);
+			func = symbol_name((vm_offset_t)(*sipp)->func,
+			    DB_STGY_PROC);
+			data = symbol_name((vm_offset_t)(*sipp)->udata,
+			    DB_STGY_ANY);
+			if (func != NULL && data != NULL)
+				printf("   %s(&%s)... ", func, data);
+			else if (func != NULL)
+				printf("   %s(%p)... ", func, (*sipp)->udata);
 			else
 #endif
 				printf("   %p(%p)... ", (*sipp)->func,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/init_sysent.c
--- a/head/sys/kern/init_sysent.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/init_sysent.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/init_sysent.c 227776 2011-11-21 01:26:10Z lstewart $
- * created from FreeBSD: head/sys/kern/syscalls.master 227691 2011-11-19 06:35:15Z ed 
+ * $FreeBSD: head/sys/kern/init_sysent.c 236363 2012-05-31 19:34:53Z pjd $
+ * created from FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 #include "opt_compat.h"
@@ -525,19 +525,19 @@
 	{ AS(cpuset_setaffinity_args), (sy_call_t *)sys_cpuset_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 488 = cpuset_setaffinity */
 	{ AS(faccessat_args), (sy_call_t *)sys_faccessat, AUE_FACCESSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 489 = faccessat */
 	{ AS(fchmodat_args), (sy_call_t *)sys_fchmodat, AUE_FCHMODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 490 = fchmodat */
-	{ AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 491 = fchownat */
+	{ AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 491 = fchownat */
 	{ AS(fexecve_args), (sy_call_t *)sys_fexecve, AUE_FEXECVE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 492 = fexecve */
 	{ AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 493 = fstatat */
 	{ AS(futimesat_args), (sy_call_t *)sys_futimesat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 494 = futimesat */
-	{ AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 495 = linkat */
+	{ AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 495 = linkat */
 	{ AS(mkdirat_args), (sy_call_t *)sys_mkdirat, AUE_MKDIRAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 496 = mkdirat */
 	{ AS(mkfifoat_args), (sy_call_t *)sys_mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 497 = mkfifoat */
 	{ AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 498 = mknodat */
 	{ AS(openat_args), (sy_call_t *)sys_openat, AUE_OPENAT_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 499 = openat */
-	{ AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 500 = readlinkat */
+	{ AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 500 = readlinkat */
 	{ AS(renameat_args), (sy_call_t *)sys_renameat, AUE_RENAMEAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 501 = renameat */
-	{ AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 502 = symlinkat */
-	{ AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 503 = unlinkat */
+	{ AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 502 = symlinkat */
+	{ AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 503 = unlinkat */
 	{ AS(posix_openpt_args), (sy_call_t *)sys_posix_openpt, AUE_POSIX_OPENPT, NULL, 0, 0, 0, SY_THR_STATIC },	/* 504 = posix_openpt */
 	{ AS(gssd_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },	/* 505 = gssd_syscall */
 	{ AS(jail_get_args), (sy_call_t *)sys_jail_get, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 506 = jail_get */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_acct.c
--- a/head/sys/kern/kern_acct.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_acct.c	Wed Jul 25 16:40:53 2012 +0300
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_acct.c 225617 2011-09-16 13:58:51Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_acct.c 234927 2012-05-02 14:25:39Z jhb $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -122,7 +122,7 @@
 static uint32_t	encode_long(long);
 static void	acctwatch(void);
 static void	acct_thread(void *);
-static int	acct_disable(struct thread *);
+static int	acct_disable(struct thread *, int);
 
 /*
  * Accounting vnode pointer, saved vnode pointer, and flags for each.
@@ -196,7 +196,7 @@
 sys_acct(struct thread *td, struct acct_args *uap)
 {
 	struct nameidata nd;
-	int error, flags, vfslocked;
+	int error, flags, vfslocked, replacing;
 
 	error = priv_check(td, PRIV_ACCT);
 	if (error)
@@ -246,6 +246,13 @@
 	sx_xlock(&acct_sx);
 
 	/*
+	 * Don't log spurious disable/enable messages if we are
+	 * switching from one accounting file to another due to log
+	 * rotation.
+	 */
+	replacing = (acct_vp != NULL && uap->path != NULL);
+
+	/*
 	 * If accounting was previously enabled, kill the old space-watcher,
 	 * close the file, and (if no new file was specified, leave).  Reset
 	 * the suspended state regardless of whether accounting remains
@@ -254,7 +261,7 @@
 	acct_suspended = 0;
 	if (acct_vp != NULL) {
 		vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
-		error = acct_disable(td);
+		error = acct_disable(td, !replacing);
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	if (uap->path == NULL) {
@@ -299,7 +306,8 @@
 	}
 	acct_configured = 1;
 	sx_xunlock(&acct_sx);
-	log(LOG_NOTICE, "Accounting enabled\n");
+	if (!replacing)
+		log(LOG_NOTICE, "Accounting enabled\n");
 	return (error);
 }
 
@@ -308,7 +316,7 @@
  * our reference to the credential, and clearing the vnode's flags.
  */
 static int
-acct_disable(struct thread *td)
+acct_disable(struct thread *td, int logging)
 {
 	int error;
 
@@ -319,7 +327,8 @@
 	acct_vp = NULL;
 	acct_cred = NULL;
 	acct_flags = 0;
-	log(LOG_NOTICE, "Accounting disabled\n");
+	if (logging)
+		log(LOG_NOTICE, "Accounting disabled\n");
 	return (error);
 }
 
@@ -574,7 +583,7 @@
 	 */
 	vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
 	if (acct_vp->v_type == VBAD) {
-		(void) acct_disable(NULL);
+		(void) acct_disable(NULL, 1);
 		VFS_UNLOCK_GIANT(vfslocked);
 		acct_state |= ACCT_EXITREQ;
 		return;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_clock.c
--- a/head/sys/kern/kern_clock.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_clock.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,11 +35,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 233628 2012-03-28 20:58:30Z fabient $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_clock.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
 #include "opt_ntp.h"
 #include "opt_watchdog.h"
 
@@ -56,6 +57,7 @@
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
@@ -88,6 +90,9 @@
 /* Spin-lock protecting profiling statistics. */
 static struct mtx time_lock;
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *");
+
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
@@ -760,6 +765,7 @@
 		ru->ru_maxrss = rss;
 	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
 	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
+	SDT_PROBE2(sched, , , tick, td, td->td_proc);
 	thread_lock_flags(td, MTX_QUIET);
 	for ( ; cnt > 0; cnt--)
 		sched_clock(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_conf.c
--- a/head/sys/kern/kern_conf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_conf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_conf.c 231386 2012-02-10 14:55:47Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_conf.c 235899 2012-05-24 11:24:44Z mav $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -993,9 +993,10 @@
 	max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1;
 	parentpath_len = strlen(pdev->si_name);
 	if (max_parentpath_len < parentpath_len) {
-		printf("make_dev_physpath_alias: WARNING - Unable to alias %s "
-		    "to %s/%s - path too long\n",
-		    pdev->si_name, physpath, pdev->si_name);
+		if (bootverbose)
+			printf("WARNING: Unable to alias %s "
+			    "to %s/%s - path too long\n",
+			    pdev->si_name, physpath, pdev->si_name);
 		ret = ENAMETOOLONG;
 		goto out;
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_descrip.c
--- a/head/sys/kern/kern_descrip.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_descrip.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 234131 2012-04-11 14:08:09Z eadler $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_descrip.c 238667 2012-07-21 13:02:11Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -102,7 +102,7 @@
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
-		     "file desc to leader structures");
+    "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
 MALLOC_DECLARE(M_FADVISE);
@@ -113,21 +113,24 @@
 /* Flags for do_dup() */
 #define DUP_FIXED	0x1	/* Force fixed allocation */
 #define DUP_FCNTL	0x2	/* fcntl()-style errors */
-
-static int do_dup(struct thread *td, int flags, int old, int new,
-    register_t *retval);
-static int	fd_first_free(struct filedesc *, int, int);
-static int	fd_last_used(struct filedesc *, int, int);
-static void	fdgrowtable(struct filedesc *, int);
+#define	DUP_CLOEXEC	0x4	/* Atomically set FD_CLOEXEC. */
+
+static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
+		    struct thread *td, int holdleaders);
+static int	do_dup(struct thread *td, int flags, int old, int new,
+		    register_t *retval);
+static int	fd_first_free(struct filedesc *fdp, int low, int size);
+static int	fd_last_used(struct filedesc *fdp, int size);
+static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
-static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
-static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
-static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
 static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
 static int	fill_procdesc_info(struct procdesc *pdp,
-    struct kinfo_file *kif);
+		    struct kinfo_file *kif);
+static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
 static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
 
 /*
  * A process is initially started out with NDFILE descriptors stored within
@@ -181,14 +184,15 @@
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
-void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
-static struct mtx	fdesc_mtx;
+static struct mtx fdesc_mtx;
 
 /*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
@@ -214,19 +218,16 @@
 }
 
 /*
- * Find the highest non-zero bit in the given bitmap, starting at low and
- * not exceeding size - 1.
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
  */
 static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
-	if (low >= size)
-		return (-1);
-
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
@@ -234,17 +235,21 @@
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
-	for (minoff = NDSLOT(low); off >= minoff; --off)
+	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
-	return (low - 1);
+	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
-        KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
-            ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
@@ -256,8 +261,8 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(!fdisused(fdp, fd),
-	    ("fd already used"));
+
+	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
@@ -274,16 +279,15 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(fdisused(fdp, fd),
-	    ("fd is already unused"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("fd is still in use"));
+
+	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+	KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
-		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
+		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
@@ -363,7 +367,7 @@
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	struct flock fl;
-	struct oflock ofl;
+	struct __oflock ofl;
 	intptr_t arg;
 	int error;
 	int cmd;
@@ -427,23 +431,13 @@
 	return (error);
 }
 
-static inline struct file *
-fdtofp(int fd, struct filedesc *fdp)
-{
-	struct file *fp;
-
-	FILEDESC_LOCK_ASSERT(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
-		return (NULL);
-	return (fp);
-}
-
 static inline int
 fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
 {
 
-	*fpp = fdtofp(fd, fdp);
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	*fpp = fget_locked(fdp, fd);
 	if (*fpp == NULL)
 		return (EBADF);
 
@@ -472,6 +466,7 @@
 	int vfslocked;
 	u_int old, new;
 	uint64_t bsize;
+	off_t foffset;
 
 	vfslocked = 0;
 	error = 0;
@@ -485,6 +480,12 @@
 		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
 		break;
 
+	case F_DUPFD_CLOEXEC:
+		tmp = arg;
+		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
+		    td->td_retval);
+		break;
+
 	case F_DUP2FD:
 		tmp = arg;
 		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
@@ -492,7 +493,7 @@
 
 	case F_GETFD:
 		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -504,7 +505,7 @@
 
 	case F_SETFD:
 		FILEDESC_XLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -613,14 +614,15 @@
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
-			if (fp->f_offset < 0 ||
+			foffset = foffset_get(fp);
+			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
-			     fp->f_offset > OFF_MAX - flp->l_start)) {
+			     foffset > OFF_MAX - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
-			flp->l_start += fp->f_offset;
+			flp->l_start += foffset;
 		}
 
 		/*
@@ -675,10 +677,30 @@
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
-		/* Check for race with close */
+		if (error != 0 || flp->l_type == F_UNLCK ||
+		    flp->l_type == F_UNLCKSYS) {
+			fdrop(fp, td);
+			break;
+		}
+
+		/*
+		 * Check for a race with close.
+		 *
+		 * The vnode is now advisory locked (or unlocked, but this case
+		 * is not really important) as the caller requested.
+		 * We had to drop the filedesc lock, so we need to recheck if
+		 * the descriptor is still valid, because if it was closed
+		 * in the meantime we need to remove advisory lock from the
+		 * vnode - close on any descriptor leading to an advisory
+		 * locked vnode, removes that lock.
+		 * We will return 0 on purpose in that case, as the result of
+		 * successful advisory lock might have been externally visible
+		 * already. This is fine - effectively we pretend to the caller
+		 * that the closing thread was a bit slower and that the
+		 * advisory lock succeeded before the close.
+		 */
 		FILEDESC_SLOCK(fdp);
-		if ((unsigned) fd >= fdp->fd_nfiles ||
-		    fp != fdp->fd_ofiles[fd]) {
+		if (fget_locked(fdp, fd) != fp) {
 			FILEDESC_SUNLOCK(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
@@ -686,7 +708,7 @@
 			flp->l_type = F_UNLCK;
 			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
-					   F_UNLCK, flp, F_POSIX);
+			    F_UNLCK, flp, F_POSIX);
 			VFS_UNLOCK_GIANT(vfslocked);
 			vfslocked = 0;
 		} else
@@ -714,15 +736,16 @@
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
+			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
-			    fp->f_offset > OFF_MAX - flp->l_start) ||
+			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
-			     fp->f_offset < OFF_MIN - flp->l_start)) {
+			     foffset < OFF_MIN - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
-			flp->l_start += fp->f_offset;
+			flp->l_start += foffset;
 		}
 		/*
 		 * VOP_ADVLOCK() may block.
@@ -743,7 +766,7 @@
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
@@ -799,7 +822,7 @@
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
-	int error, holdleaders, maxfd;
+	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
@@ -820,7 +843,7 @@
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
 
 	FILEDESC_XLOCK(fdp);
-	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+	if (fget_locked(fdp, old) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
@@ -871,77 +894,29 @@
 		}
 	}
 
+	KASSERT(fp == fdp->fd_ofiles[old], ("old fd has been modified"));
+	KASSERT(old != new, ("new fd is same as old"));
+
+	delfp = fdp->fd_ofiles[new];
 	/*
-	 * If the old file changed out from under us then treat it as a
-	 * bad file descriptor.  Userland should do its own locking to
-	 * avoid this case.
-	 */
-	if (fdp->fd_ofiles[old] != fp) {
-		/* we've allocated a descriptor which we won't use */
-		if (fdp->fd_ofiles[new] == NULL)
-			fdunused(fdp, new);
-		FILEDESC_XUNLOCK(fdp);
-		fdrop(fp, td);
-		return (EBADF);
-	}
-	KASSERT(old != new,
-	    ("new fd is same as old"));
-
-	/*
-	 * Save info on the descriptor being overwritten.  We cannot close
-	 * it without introducing an ownership race for the slot, since we
-	 * need to drop the filedesc lock to call closef().
-	 *
-	 * XXX this duplicates parts of close().
-	 */
-	delfp = fdp->fd_ofiles[new];
-	holdleaders = 0;
-	if (delfp != NULL) {
-		if (td->td_proc->p_fdtol != NULL) {
-			/*
-			 * Ask fdfree() to sleep to ensure that all relevant
-			 * process leaders can be traversed in closef().
-			 */
-			fdp->fd_holdleaderscount++;
-			holdleaders = 1;
-		}
-	}
-
-	/*
-	 * Duplicate the source descriptor
+	 * Duplicate the source descriptor.
 	 */
 	fdp->fd_ofiles[new] = fp;
-	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
+	if ((flags & DUP_CLOEXEC) != 0)
+		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] | UF_EXCLOSE;
+	else
+		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
 
-	/*
-	 * If we dup'd over a valid file, we now own the reference to it
-	 * and must dispose of it using closef() semantics (as if a
-	 * close() were performed on it).
-	 *
-	 * XXX this duplicates parts of close().
-	 */
 	if (delfp != NULL) {
-		knote_fdclose(td, new);
-		if (delfp->f_type == DTYPE_MQUEUE)
-			mq_fdclose(td, new, delfp);
-		FILEDESC_XUNLOCK(fdp);
-		(void) closef(delfp, td);
-		if (holdleaders) {
-			FILEDESC_XLOCK(fdp);
-			fdp->fd_holdleaderscount--;
-			if (fdp->fd_holdleaderscount == 0 &&
-			    fdp->fd_holdleaderswakeup != 0) {
-				fdp->fd_holdleaderswakeup = 0;
-				wakeup(&fdp->fd_holdleaderscount);
-			}
-			FILEDESC_XUNLOCK(fdp);
-		}
+		(void) closefp(fdp, new, delfp, td, 1);
+		/* closefp() drops the FILEDESC lock for us. */
 	} else {
 		FILEDESC_XUNLOCK(fdp);
 	}
+
 	return (0);
 }
 
@@ -1165,6 +1140,61 @@
 }
 
 /*
+ * Function drops the filedesc lock on return.
+ */
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+    int holdleaders)
+{
+	struct file *fp_object;
+	int error;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	if (holdleaders) {
+		if (td->td_proc->p_fdtol != NULL) {
+			/*
+			 * Ask fdfree() to sleep to ensure that all relevant
+			 * process leaders can be traversed in closef().
+			 */
+			fdp->fd_holdleaderscount++;
+		} else {
+			holdleaders = 0;
+		}
+	}
+
+	/*
+	 * We now hold the fp reference that used to be owned by the
+	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
+	 * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleteing a knote for the new fd.
+	 */
+	knote_fdclose(td, fd);
+
+	/*
+	 * When we're closing an fd with a capability, we need to notify
+	 * mqueue if the underlying object is of type mqueue.
+	 */
+	(void)cap_funwrap(fp, 0, &fp_object);
+	if (fp_object->f_type == DTYPE_MQUEUE)
+		mq_fdclose(td, fd, fp_object);
+	FILEDESC_XUNLOCK(fdp);
+
+	error = closef(fp, td);
+	if (holdleaders) {
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_holdleaderscount--;
+		if (fdp->fd_holdleaderscount == 0 &&
+		    fdp->fd_holdleaderswakeup != 0) {
+			fdp->fd_holdleaderswakeup = 0;
+			wakeup(&fdp->fd_holdleaderscount);
+		}
+		FILEDESC_XUNLOCK(fdp);
+	}
+	return (error);
+}
+
+/*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -1188,63 +1218,23 @@
 	int fd;
 {
 	struct filedesc *fdp;
-	struct file *fp, *fp_object;
-	int error;
-	int holdleaders;
-
-	error = 0;
-	holdleaders = 0;
+	struct file *fp;
+
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL) {
+	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdp->fd_ofiles[fd] = NULL;
 	fdp->fd_ofileflags[fd] = 0;
 	fdunused(fdp, fd);
-	if (td->td_proc->p_fdtol != NULL) {
-		/*
-		 * Ask fdfree() to sleep to ensure that all relevant
-		 * process leaders can be traversed in closef().
-		 */
-		fdp->fd_holdleaderscount++;
-		holdleaders = 1;
-	}
-
-	/*
-	 * We now hold the fp reference that used to be owned by the
-	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
-	 * knote_fdclose to prevent a race of the fd getting opened, a knote
-	 * added, and deleteing a knote for the new fd.
-	 */
-	knote_fdclose(td, fd);
-
-	/*
-	 * When we're closing an fd with a capability, we need to notify
-	 * mqueue if the underlying object is of type mqueue.
-	 */
-	(void)cap_funwrap(fp, 0, &fp_object);
-	if (fp_object->f_type == DTYPE_MQUEUE)
-		mq_fdclose(td, fd, fp_object);
-	FILEDESC_XUNLOCK(fdp);
-
-	error = closef(fp, td);
-	if (holdleaders) {
-		FILEDESC_XLOCK(fdp);
-		fdp->fd_holdleaderscount--;
-		if (fdp->fd_holdleaderscount == 0 &&
-		    fdp->fd_holdleaderswakeup != 0) {
-			fdp->fd_holdleaderswakeup = 0;
-			wakeup(&fdp->fd_holdleaderscount);
-		}
-		FILEDESC_XUNLOCK(fdp);
-	}
-	return (error);
+
+	/* closefp() drops the FILEDESC lock for us. */
+	return (closefp(fdp, fd, fp, td, 1));
 }
 
 /*
@@ -1407,6 +1397,7 @@
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		int vfslocked;
+
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
@@ -1417,7 +1408,7 @@
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
-		error = 0;
+			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
@@ -1428,9 +1419,7 @@
 }
 
 /*
- * Grow the file table to accomodate (at least) nfd descriptors.  This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
+ * Grow the file table to accomodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
@@ -1456,7 +1445,6 @@
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
-	FILEDESC_XUNLOCK(fdp);
 	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
@@ -1465,20 +1453,7 @@
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
-	FILEDESC_XLOCK(fdp);
-
-	/*
-	 * We now have new tables ready to go.  Since we dropped the
-	 * filedesc lock to call malloc(), watch out for a race.
-	 */
-	onfiles = fdp->fd_nfiles;
-	if (onfiles >= nnfiles) {
-		/* we lost the race, but that's OK */
-		free(ntable, M_FILEDESC);
-		if (nmap != NULL)
-			free(nmap, M_FILEDESC);
-		return;
-	}
+
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
 	otable = fdp->fd_ofiles;
@@ -1512,7 +1487,7 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
-	int fd = -1, maxfd;
+	int fd = -1, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
@@ -1527,36 +1502,38 @@
 	PROC_UNLOCK(p);
 
 	/*
-	 * Search the bitmap for a free descriptor.  If none is found, try
-	 * to grow the file table.  Keep at it until we either get a file
-	 * descriptor or run into process or system limits; fdgrowtable()
-	 * may drop the filedesc lock, so we're in a race.
+	 * Search the bitmap for a free descriptor starting at minfd.
+	 * If none is found, grow the file table.
 	 */
-	for (;;) {
-		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
-		if (fd >= maxfd)
-			return (EMFILE);
-		if (fd < fdp->fd_nfiles)
-			break;
+	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+	if (fd >= maxfd)
+		return (EMFILE);
+	if (fd >= fdp->fd_nfiles) {
+		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		PROC_LOCK(p);
-		error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
+		error = racct_set(p, RACCT_NOFILE, allocfd);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EMFILE);
 #endif
-		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
+		/*
+		 * fd is already equal to first free descriptor >= minfd, so
+		 * we only need to grow the table and we are done.
+		 */
+		fdgrowtable(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
+	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("free descriptor isn't"));
-	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+	KASSERT(fdp->fd_ofiles[fd] == NULL, ("file descriptor isn't free"));
+	KASSERT(fdp->fd_ofileflags[fd] == 0, ("file flags are set"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
@@ -1571,7 +1548,6 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
-	struct file **fpp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp);
@@ -1587,9 +1563,8 @@
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	last = min(fdp->fd_nfiles, lim);
-	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
-	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
-		if (*fpp == NULL && --n <= 0)
+	for (i = fdp->fd_freefile; i < last; i++) {
+		if (fdp->fd_ofiles[i] == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
@@ -1848,7 +1823,6 @@
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
-	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
@@ -1875,13 +1849,10 @@
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
-			for (i = 0, fpp = fdp->fd_ofiles;
-			     i <= fdp->fd_lastfile;
-			     i++, fpp++) {
-				if (*fpp == NULL ||
-				    (*fpp)->f_type != DTYPE_VNODE)
+			for (i = 0; i <= fdp->fd_lastfile; i++) {
+				fp = fdp->fd_ofiles[i];
+				if (fp == NULL || fp->f_type != DTYPE_VNODE)
 					continue;
-				fp = *fpp;
 				fhold(fp);
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
@@ -1891,15 +1862,11 @@
 				vp = fp->f_vnode;
 				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)td->td_proc->
-						   p_leader,
-						   F_UNLCK,
-						   &lf,
-						   F_POSIX);
+				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
+				    &lf, F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
-				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
@@ -1944,12 +1911,11 @@
 	if (i > 0)
 		return;
 
-	fpp = fdp->fd_ofiles;
-	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
-		if (*fpp) {
+	for (i = 0; i <= fdp->fd_lastfile; i++) {
+		fp = fdp->fd_ofiles[i];
+		if (fp != NULL) {
 			FILEDESC_XLOCK(fdp);
-			fp = *fpp;
-			*fpp = NULL;
+			fdp->fd_ofiles[i] = NULL;
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
@@ -2086,6 +2052,7 @@
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
+	struct file *fp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
@@ -2093,31 +2060,20 @@
 	if (fdp == NULL)
 		return;
 
-	FILEDESC_XLOCK(fdp);
-
 	/*
 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
 	 * may block and rip them out from under us.
 	 */
+	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
-		if (fdp->fd_ofiles[i] != NULL &&
-		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
+		fp = fdp->fd_ofiles[i];
+		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
-			struct file *fp;
-
-			knote_fdclose(td, i);
-			/*
-			 * NULL-out descriptor prior to close to avoid
-			 * a race while close blocks.
-			 */
-			fp = fdp->fd_ofiles[i];
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
-			if (fp->f_type == DTYPE_MQUEUE)
-				mq_fdclose(td, i, fp);
-			FILEDESC_XUNLOCK(fdp);
-			(void) closef(fp, td);
+			(void) closefp(fdp, i, fp, td, 0);
+			/* closefp() drops the FILEDESC lock. */
 			FILEDESC_XLOCK(fdp);
 		}
 	}
@@ -2198,7 +2154,7 @@
 	 * node, not the capability itself.
 	 */
 	(void)cap_funwrap(fp, 0, &fp_object);
-	if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
+	if (fp_object->f_type == DTYPE_VNODE && td != NULL) {
 		int vfslocked;
 
 		vp = fp_object->f_vnode;
@@ -2209,7 +2165,7 @@
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
-					   F_UNLCK, &lf, F_POSIX);
+			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
@@ -2233,8 +2189,8 @@
 				lf.l_type = F_UNLCK;
 				vp = fp_object->f_vnode;
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)fdtol->fdl_leader,
-						   F_UNLCK, &lf, F_POSIX);
+				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
@@ -2329,8 +2285,8 @@
 	struct file *fp;
 #ifdef CAPABILITIES
 	struct file *fp_fromcap;
+#endif
 	int error;
-#endif
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
@@ -2369,7 +2325,7 @@
 		else
 			error = cap_funwrap_mmap(fp, needrights, maxprotp,
 			    &fp_fromcap);
-		if (error) {
+		if (error != 0) {
 			fdrop(fp, td);
 			return (error);
 		}
@@ -2394,14 +2350,30 @@
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
-	 *
-	 * Only one flag, or 0, may be specified.
 	 */
-	if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
-	    (flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
+	error = 0;
+	switch (flags) {
+	case FREAD:
+	case FWRITE:
+		if ((fp->f_flag & flags) == 0)
+			error = EBADF;
+		break;
+	case FEXEC:
+	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
+		    ((fp->f_flag & FWRITE) != 0))
+			error = EBADF;
+		break;
+	case 0:
+		break;
+	default:
+		KASSERT(0, ("wrong flags"));
+	}
+
+	if (error != 0) {
 		fdrop(fp, td);
-		return (EBADF);
+		return (error);
 	}
+
 	*fpp = fp;
 	return (0);
 }
@@ -2498,6 +2470,13 @@
 	return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
 }
 
+int
+fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+{
+
+	return (_fgetvp(td, fd, FEXEC, rights, NULL, vpp));
+}
+
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
@@ -2647,10 +2626,13 @@
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
-dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp)
 {
-	struct file *wfp;
 	struct file *fp;
+	int error, indx;
+
+	KASSERT(openerror == ENODEV || openerror == ENXIO,
+	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
@@ -2658,12 +2640,17 @@
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
-	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
-	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
+	error = fdalloc(td, 0, &indx);
+	if (error != 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return (error);
+	}
+
 	/*
 	 * There are two cases of interest here.
 	 *
@@ -2671,61 +2658,36 @@
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
-	 *
-	 * Any other error code is just returned.
 	 */
-	switch (error) {
+	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
-		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = wfp;
+		fdp->fd_ofiles[indx] = fp;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
-		if (fp == NULL)
-			fdused(fdp, indx);
-		fhold(wfp);
-		FILEDESC_XUNLOCK(fdp);
-		if (fp != NULL)
-			/*
-			 * We now own the reference to fp that the ofiles[]
-			 * array used to own.  Release it.
-			 */
-			fdrop(fp, td);
-		return (0);
-
+		fhold(fp);
+		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+		fdp->fd_ofiles[indx] = fp;
 		fdp->fd_ofiles[dfd] = NULL;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		fdp->fd_ofileflags[dfd] = 0;
 		fdunused(fdp, dfd);
-		if (fp == NULL)
-			fdused(fdp, indx);
-		FILEDESC_XUNLOCK(fdp);
-
-		/*
-		 * We now own the reference to fp that the ofiles[] array
-		 * used to own.  Release it.
-		 */
-		if (fp != NULL)
-			fdrop(fp, td);
-		return (0);
-
-	default:
-		FILEDESC_XUNLOCK(fdp);
-		return (error);
+		break;
 	}
-	/* NOTREACHED */
+	FILEDESC_XUNLOCK(fdp);
+	*indxp = indx;
+	return (0);
 }
 
 /*
@@ -2884,7 +2846,7 @@
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
-			xf.xf_offset = fp->f_offset;
+			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
@@ -3089,7 +3051,7 @@
 			kif->kf_flags |= KF_FLAG_DIRECT;
 		if (fp->f_flag & FHASLOCK)
 			kif->kf_flags |= KF_FLAG_HASLOCK;
-		kif->kf_offset = fp->f_offset;
+		kif->kf_offset = foffset_get(fp);
 		if (vp != NULL) {
 			vref(vp);
 			switch (vp->v_type) {
@@ -3433,7 +3395,7 @@
 		}
 		refcnt = fp->f_count;
 		fflags = fp->f_flag;
-		offset = fp->f_offset;
+		offset = foffset_get(fp);
 
 		/*
 		 * Create sysctl entry.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_event.c
--- a/head/sys/kern/kern_event.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_event.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 233505 2012-03-26 09:34:17Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_event.c 238424 2012-07-13 13:24:33Z jhb $");
 
 #include "opt_ktrace.h"
 
@@ -513,6 +513,10 @@
 	list->kl_unlock(list->kl_lockarg);
 }
 
+/*
+ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
+ * interval timer support code.
+ */
 static int
 timertoticks(intptr_t data)
 {
@@ -526,7 +530,6 @@
 	return tticks;
 }
 
-/* XXX - move to kern_timeout.c? */
 static void
 filt_timerexpire(void *knx)
 {
@@ -536,9 +539,16 @@
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 
+	/*
+	 * timertoticks() uses tvtohz() which always adds 1 to allow
+	 * for the time until the next clock interrupt being strictly
+	 * less than 1 clock tick.  We don't want that here since we
+	 * want to appear to be in sync with the clock interrupt even
+	 * when we're delayed.
+	 */
 	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
 		calloutp = (struct callout *)kn->kn_hook;
-		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
+		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
 		    filt_timerexpire, kn);
 	}
 }
@@ -546,7 +556,6 @@
 /*
  * data contains amount of time to sleep, in milliseconds
  */
-/* XXX - move to kern_timeout.c? */
 static int
 filt_timerattach(struct knote *kn)
 {
@@ -570,7 +579,6 @@
 	return (0);
 }
 
-/* XXX - move to kern_timeout.c? */
 static void
 filt_timerdetach(struct knote *kn)
 {
@@ -583,7 +591,6 @@
 	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
 }
 
-/* XXX - move to kern_timeout.c? */
 static int
 filt_timer(struct knote *kn, long hint)
 {
@@ -692,7 +699,7 @@
 	if (error)
 		goto done2;
 
-	/* An extra reference on `nfp' has been held for us by falloc(). */
+	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_exec.c
--- a/head/sys/kern/kern_exec.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_exec.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_exec.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_exec.c 238220 2012-07-08 00:51:38Z mjg $");
 
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
@@ -443,8 +443,10 @@
 		/*
 		 * Some might argue that CAP_READ and/or CAP_MMAP should also
 		 * be required here; such arguments will be entertained.
+		 *
+		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
-		error = fgetvp_read(td, args->fd, CAP_FEXECVE, &binvp);
+		error = fgetvp_exec(td, args->fd, CAP_FEXECVE, &binvp);
 		if (error)
 			goto exec_fail;
 		vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
@@ -1511,64 +1513,3 @@
 	execsw = newexecsw;
 	return (0);
 }
-
-static vm_object_t shared_page_obj;
-static int shared_page_free;
-
-int
-shared_page_fill(int size, int align, const char *data)
-{
-	vm_page_t m;
-	struct sf_buf *s;
-	vm_offset_t sk;
-	int res;
-
-	VM_OBJECT_LOCK(shared_page_obj);
-	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY);
-	res = roundup(shared_page_free, align);
-	if (res + size >= IDX_TO_OFF(shared_page_obj->size))
-		res = -1;
-	else {
-		VM_OBJECT_UNLOCK(shared_page_obj);
-		s = sf_buf_alloc(m, SFB_DEFAULT);
-		sk = sf_buf_kva(s);
-		bcopy(data, (void *)(sk + res), size);
-		shared_page_free = res + size;
-		sf_buf_free(s);
-		VM_OBJECT_LOCK(shared_page_obj);
-	}
-	vm_page_wakeup(m);
-	VM_OBJECT_UNLOCK(shared_page_obj);
-	return (res);
-}
-
-static void
-shared_page_init(void *dummy __unused)
-{
-	vm_page_t m;
-
-	shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
-	    VM_PROT_DEFAULT, 0, NULL);
-	VM_OBJECT_LOCK(shared_page_obj);
-	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
-	    VM_ALLOC_ZERO);
-	m->valid = VM_PAGE_BITS_ALL;
-	VM_OBJECT_UNLOCK(shared_page_obj);
-}
-
-SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
-    NULL);
-
-void
-exec_sysvec_init(void *param)
-{
-	struct sysentvec *sv;
-
-	sv = (struct sysentvec *)param;
-
-	if ((sv->sv_flags & SV_SHP) == 0)
-		return;
-	sv->sv_shared_page_obj = shared_page_obj;
-	sv->sv_sigcode_base = sv->sv_shared_page_base +
-	    shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
-}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_fork.c
--- a/head/sys/kern/kern_fork.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_fork.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_fork.c 232240 2012-02-27 21:10:10Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_fork.c 237276 2012-06-19 22:21:59Z pjd $");
 
 #include "opt_kdtrace.h"
 #include "opt_ktrace.h"
@@ -475,7 +475,6 @@
 
 	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
 	td2->td_sigstk = td->td_sigstk;
-	td2->td_sigmask = td->td_sigmask;
 	td2->td_flags = TDF_INMEM;
 	td2->td_lend_user_pri = PRI_MAX;
 
@@ -922,8 +921,10 @@
 		 */
 		*procp = newproc;
 #ifdef PROCDESC
-		if (flags & RFPROCDESC)
+		if (flags & RFPROCDESC) {
 			procdesc_finit(newproc->p_procdesc, fp_procdesc);
+			fdrop(fp_procdesc, td);
+		}
 #endif
 		racct_proc_fork_done(newproc);
 		return (0);
@@ -939,14 +940,16 @@
 #ifdef MAC
 	mac_proc_destroy(newproc);
 #endif
+	racct_proc_exit(newproc);
 fail1:
-	racct_proc_exit(newproc);
 	if (vm2 != NULL)
 		vmspace_free(vm2);
 	uma_zfree(proc_zone, newproc);
 #ifdef PROCDESC
-	if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
+	if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL)) {
+		fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
 		fdrop(fp_procdesc, td);
+	}
 #endif
 	pause("fork", hz / 2);
 	return (error);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_jail.c
--- a/head/sys/kern/kern_jail.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_jail.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 232598 2012-03-06 11:05:50Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_jail.c 235803 2012-05-22 19:43:20Z trasz $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -1811,6 +1811,16 @@
 		}
 	}
 
+#ifdef RACCT
+	if (!created) {
+		sx_sunlock(&allprison_lock);
+		prison_racct_modify(pr);
+		sx_slock(&allprison_lock);
+	}
+#endif
+
+	td->td_retval[0] = pr->pr_id;
+
 	/*
 	 * Now that it is all there, drop the temporary reference from existing
 	 * prisons.  Or add a reference to newly created persistent prisons
@@ -1832,12 +1842,6 @@
 			sx_sunlock(&allprison_lock);
 	}
 
-#ifdef RACCT
-	if (!created)
-		prison_racct_modify(pr);
-#endif
-
-	td->td_retval[0] = pr->pr_id;
 	goto done_errmsg;
 
  done_deref_locked:
@@ -4491,8 +4495,11 @@
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
-	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0)
+	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
+		sx_xunlock(&allprison_lock);
+		sx_sunlock(&allproc_lock);
 		return;
+	}
 
 	oldprr = pr->pr_prison_racct;
 	pr->pr_prison_racct = NULL;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_kthread.c
--- a/head/sys/kern/kern_kthread.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_kthread.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_kthread.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_kthread.c 236117 2012-05-26 20:03:47Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -271,7 +271,6 @@
 
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
-/* XXX check if we should zero. */
 	bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
@@ -295,7 +294,6 @@
 	/* this code almost the same as create_thread() in kern_thr.c */
 	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
-	newtd->td_sigmask = oldtd->td_sigmask; /* XXX dubious */
 	thread_link(newtd, p);
 	thread_lock(oldtd);
 	/* let the scheduler know about these things. */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_malloc.c
--- a/head/sys/kern/kern_malloc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_malloc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_malloc.c 232356 2012-03-01 19:58:34Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_malloc.c 238502 2012-07-15 20:29:48Z mdf $");
 
 #include "opt_ddb.h"
 #include "opt_kdtrace.h"
@@ -744,7 +744,7 @@
 		vm_kmem_size = 2 * mem_size * PAGE_SIZE;
 
 #ifdef DEBUG_MEMGUARD
-	tmp = memguard_fudge(vm_kmem_size, vm_kmem_size_max);
+	tmp = memguard_fudge(vm_kmem_size, kernel_map);
 #else
 	tmp = vm_kmem_size;
 #endif
@@ -1000,6 +1000,8 @@
 		db_printf("%18s %12ju %12juK %12ju\n",
 		    mtp->ks_shortdesc, allocs - frees,
 		    (alloced - freed + 1023) / 1024, allocs);
+		if (db_pager_quit)
+			break;
 	}
 }
 
@@ -1029,6 +1031,8 @@
 		if (mtip->mti_zone != subzone)
 			continue;
 		db_printf("%s\n", mtp->ks_shortdesc);
+		if (db_pager_quit)
+			break;
 	}
 }
 #endif /* MALLOC_DEBUG_MAXZONES > 1 */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_proc.c
--- a/head/sys/kern/kern_proc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_proc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_proc.c 233389 2012-03-23 20:05:41Z trociny $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_proc.c 238527 2012-07-16 09:38:19Z pgj $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -309,6 +309,30 @@
 	return (p);
 }
 
+static struct proc *
+pfind_tid(pid_t tid)
+{
+	struct proc *p;
+	struct thread *td;
+
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_NEW) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		FOREACH_THREAD_IN_PROC(p, td) {
+			if (td->td_tid == tid)
+				goto found;
+		}
+		PROC_UNLOCK(p);
+	}
+found:
+	sx_sunlock(&allproc_lock);
+	return (p);
+}
+
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
@@ -339,7 +363,12 @@
 	struct proc *p;
 	int error;
 
-	p = pfind(pid);
+	if (pid <= PID_MAX)
+		p = pfind(pid);
+	else if ((flags & PGET_NOTID) == 0)
+		p = pfind_tid(pid);
+	else
+		p = NULL;
 	if (p == NULL)
 		return (ESRCH);
 	if ((flags & PGET_CANSEE) != 0) {
@@ -849,6 +878,9 @@
 	kp->ki_childtime = kp->ki_childstime;
 	timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 
+	FOREACH_THREAD_IN_PROC(p, td0)
+		kp->ki_cow += td0->td_cow;
+
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
@@ -961,6 +993,7 @@
 		kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
 		kp->ki_pctcpu = sched_pctcpu(td);
 		kp->ki_estcpu = td->td_estcpu;
+		kp->ki_cow = td->td_cow;
 	}
 
 	/* We can't get this anymore but ps etc never used it anyway. */
@@ -1103,6 +1136,7 @@
 	CP(*ki, *ki32, ki_estcpu);
 	CP(*ki, *ki32, ki_slptime);
 	CP(*ki, *ki32, ki_swtime);
+	CP(*ki, *ki32, ki_cow);
 	CP(*ki, *ki32, ki_runtime);
 	TV_CP(*ki, *ki32, ki_start);
 	TV_CP(*ki, *ki32, ki_childtime);
@@ -2155,6 +2189,10 @@
 			kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
 		if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
 			kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+		if (entry->eflags & MAP_ENTRY_GROWS_UP)
+			kve->kve_flags |= KVME_FLAG_GROWS_UP;
+		if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
+			kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
 
 		last_timestamp = map->timestamp;
 		vm_map_unlock_read(map);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_racct.c
--- a/head/sys/kern/kern_racct.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_racct.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,11 +26,11 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/kern/kern_racct.c 234383 2012-04-17 14:31:02Z trasz $
+ * $FreeBSD: head/sys/kern/kern_racct.c 235787 2012-05-22 15:58:27Z trasz $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 234383 2012-04-17 14:31:02Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_racct.c 235787 2012-05-22 15:58:27Z trasz $");
 
 #include "opt_kdtrace.h"
 
@@ -573,6 +573,9 @@
 	PROC_UNLOCK(child);
 	PROC_UNLOCK(parent);
 
+	if (error != 0)
+		racct_proc_exit(child);
+
 	return (error);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_rangelock.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/kern_rangelock.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/kern_rangelock.c 236317 2012-05-30 16:06:38Z kib $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+struct rl_q_entry {
+	TAILQ_ENTRY(rl_q_entry) rl_q_link;
+	off_t		rl_q_start, rl_q_end;
+	int		rl_q_flags;
+};
+
+static uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+static struct rl_q_entry *
+rlqentry_alloc(void)
+{
+
+	return (uma_zalloc(rl_entry_zone, M_WAITOK));
+}
+
+void
+rlqentry_free(struct rl_q_entry *rleq)
+{
+
+	uma_zfree(rl_entry_zone, rleq);
+}
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+	TAILQ_INIT(&lock->rl_waiters);
+	lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+/*
+ * Verifies the supplied rl_q_entries for compatibility.  Returns true
+ * if the rangelock queue entries are not compatible, false if they are.
+ *
+ * Two entries are compatible if their ranges do not overlap, or both
+ * entries are for read.
+ */
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+	if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+	    (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+		return (0);
+	if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+		return (1);
+	return (0);
+}
+
+/*
+ * Recalculate the lock->rl_currdep after an unlock.
+ */
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+	struct rl_q_entry *entry, *entry1, *whead;
+
+	if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+	    lock->rl_currdep != NULL)
+		lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+	for (entry = lock->rl_currdep; entry != NULL;
+	     entry = TAILQ_NEXT(entry, rl_q_link)) {
+		TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+			if (rangelock_incompatible(entry, entry1))
+				goto out;
+			if (entry1 == entry)
+				break;
+		}
+	}
+out:
+	lock->rl_currdep = entry;
+	TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+		if (whead == lock->rl_currdep)
+			break;
+		if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+			whead->rl_q_flags |= RL_LOCK_GRANTED;
+			wakeup(whead);
+		}
+	}
+}
+
+static void
+rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
+    struct mtx *ilk)
+{
+
+	MPASS(lock != NULL && entry != NULL && ilk != NULL);
+	mtx_assert(ilk, MA_OWNED);
+	KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
+
+	TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+	rangelock_calc_block(lock);
+	mtx_unlock(ilk);
+	if (curthread->td_rlqe == NULL)
+		curthread->td_rlqe = entry;
+	else
+		rlqentry_free(entry);
+}
+
+void
+rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+{
+
+	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+
+	mtx_lock(ilk);
+	rangelock_unlock_locked(lock, cookie, ilk);
+}
+
+/*
+ * Unlock the sub-range of granted lock.
+ */
+void *
+rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
+    off_t end, struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+
+	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+	entry = cookie;
+	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
+	    ("Unlocking non-granted lock"));
+	KASSERT(entry->rl_q_start == start, ("wrong start"));
+	KASSERT(entry->rl_q_end >= end, ("wrong end"));
+
+	mtx_lock(ilk);
+	if (entry->rl_q_end == end) {
+		rangelock_unlock_locked(lock, cookie, ilk);
+		return (NULL);
+	}
+	entry->rl_q_end = end;
+	rangelock_calc_block(lock);
+	mtx_unlock(ilk);
+	return (cookie);
+}
+
+/*
+ * Add the lock request to the queue of the pending requests for
+ * rangelock.  Sleep until the request can be granted.
+ */
+static void *
+rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
+    struct mtx *ilk)
+{
+	struct rl_q_entry *entry;
+	struct thread *td;
+
+	MPASS(lock != NULL && ilk != NULL);
+
+	td = curthread;
+	if (td->td_rlqe != NULL) {
+		entry = td->td_rlqe;
+		td->td_rlqe = NULL;
+	} else
+		entry = rlqentry_alloc();
+	MPASS(entry != NULL);
+	entry->rl_q_flags = mode;
+	entry->rl_q_start = start;
+	entry->rl_q_end = end;
+
+	mtx_lock(ilk);
+	/*
+	 * XXXKIB TODO. Check that a thread does not try to enqueue a
+	 * lock that is incompatible with another request from the same
+	 * thread.
+	 */
+
+	TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
+	if (lock->rl_currdep == NULL)
+		lock->rl_currdep = entry;
+	rangelock_calc_block(lock);
+	while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+		msleep(entry, ilk, 0, "range", 0);
+	mtx_unlock(ilk);
+	return (entry);
+}
+
+void *
+rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk));
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_sharedpage.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/kern_sharedpage.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2010, 2012 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/kern_sharedpage.c 237477 2012-06-23 10:15:23Z kib $");
+
+#include "opt_compat.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static struct sx shared_page_alloc_sx;
+static vm_object_t shared_page_obj;
+static int shared_page_free;
+char *shared_page_mapping;
+
+void
+shared_page_write(int base, int size, const void *data)
+{
+
+	bcopy(data, shared_page_mapping + base, size);
+}
+
+static int
+shared_page_alloc_locked(int size, int align)
+{
+	int res;
+
+	res = roundup(shared_page_free, align);
+	if (res + size >= IDX_TO_OFF(shared_page_obj->size))
+		res = -1;
+	else
+		shared_page_free = res + size;
+	return (res);
+}
+
+int
+shared_page_alloc(int size, int align)
+{
+	int res;
+
+	sx_xlock(&shared_page_alloc_sx);
+	res = shared_page_alloc_locked(size, align);
+	sx_xunlock(&shared_page_alloc_sx);
+	return (res);
+}
+
+int
+shared_page_fill(int size, int align, const void *data)
+{
+	int res;
+
+	sx_xlock(&shared_page_alloc_sx);
+	res = shared_page_alloc_locked(size, align);
+	if (res != -1)
+		shared_page_write(res, size, data);
+	sx_xunlock(&shared_page_alloc_sx);
+	return (res);
+}
+
+static void
+shared_page_init(void *dummy __unused)
+{
+	vm_page_t m;
+	vm_offset_t addr;
+
+	sx_init(&shared_page_alloc_sx, "shpsx");
+	shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
+	    VM_PROT_DEFAULT, 0, NULL);
+	VM_OBJECT_LOCK(shared_page_obj);
+	m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
+	    VM_ALLOC_ZERO);
+	m->valid = VM_PAGE_BITS_ALL;
+	VM_OBJECT_UNLOCK(shared_page_obj);
+	addr = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
+	pmap_qenter(addr, &m, 1);
+	shared_page_mapping = (char *)addr;
+}
+
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
+    NULL);
+
+static void
+timehands_update(struct sysentvec *sv)
+{
+	struct vdso_timehands th;
+	struct vdso_timekeep *tk;
+	uint32_t enabled, idx;
+
+	enabled = tc_fill_vdso_timehands(&th);
+	tk = (struct vdso_timekeep *)(shared_page_mapping +
+	    sv->sv_timekeep_off);
+	idx = sv->sv_timekeep_curr;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+	if (++idx >= VDSO_TH_NUM)
+		idx = 0;
+	sv->sv_timekeep_curr = idx;
+	if (++sv->sv_timekeep_gen == 0)
+		sv->sv_timekeep_gen = 1;
+	th.th_gen = 0;
+	if (enabled)
+		tk->tk_th[idx] = th;
+	tk->tk_enabled = enabled;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+	tk->tk_current = idx;
+}
+
+#ifdef COMPAT_FREEBSD32
+static void
+timehands_update32(struct sysentvec *sv)
+{
+	struct vdso_timekeep32 *tk;
+	struct vdso_timehands32 th;
+	uint32_t enabled, idx;
+
+	enabled = tc_fill_vdso_timehands32(&th);
+	tk = (struct vdso_timekeep32 *)(shared_page_mapping +
+	    sv->sv_timekeep_off);
+	idx = sv->sv_timekeep_curr;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+	if (++idx >= VDSO_TH_NUM)
+		idx = 0;
+	sv->sv_timekeep_curr = idx;
+	if (++sv->sv_timekeep_gen == 0)
+		sv->sv_timekeep_gen = 1;
+	th.th_gen = 0;
+	if (enabled)
+		tk->tk_th[idx] = th;
+	tk->tk_enabled = enabled;
+	atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+	tk->tk_current = idx;
+}
+#endif
+
+/*
+ * This is hackish, but easiest way to avoid creating list structures
+ * that needs to be iterated over from the hardclock interrupt
+ * context.
+ */
+static struct sysentvec *host_sysentvec;
+#ifdef COMPAT_FREEBSD32
+static struct sysentvec *compat32_sysentvec;
+#endif
+
+void
+timekeep_push_vdso(void)
+{
+
+	if (host_sysentvec != NULL && host_sysentvec->sv_timekeep_base != 0)
+		timehands_update(host_sysentvec);
+#ifdef COMPAT_FREEBSD32
+	if (compat32_sysentvec != NULL &&
+	    compat32_sysentvec->sv_timekeep_base != 0)
+		timehands_update32(compat32_sysentvec);
+#endif
+}
+
+void
+exec_sysvec_init(void *param)
+{
+	struct sysentvec *sv;
+	int tk_base;
+	uint32_t tk_ver;
+
+	sv = (struct sysentvec *)param;
+
+	if ((sv->sv_flags & SV_SHP) == 0)
+		return;
+	sv->sv_shared_page_obj = shared_page_obj;
+	sv->sv_sigcode_base = sv->sv_shared_page_base +
+	    shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
+	if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+		return;
+	tk_ver = VDSO_TK_VER_CURR;
+#ifdef COMPAT_FREEBSD32
+	if ((sv->sv_flags & SV_ILP32) != 0) {
+		tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) +
+		    sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16);
+		KASSERT(tk_base != -1, ("tk_base -1 for 32bit"));
+		shared_page_write(tk_base + offsetof(struct vdso_timekeep32,
+		    tk_ver), sizeof(uint32_t), &tk_ver);
+		KASSERT(compat32_sysentvec == 0,
+		    ("Native compat32 already registered"));
+		compat32_sysentvec = sv;
+	} else {
+#endif
+		tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) +
+		    sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16);
+		KASSERT(tk_base != -1, ("tk_base -1 for native"));
+		shared_page_write(tk_base + offsetof(struct vdso_timekeep,
+		    tk_ver), sizeof(uint32_t), &tk_ver);
+		KASSERT(host_sysentvec == 0, ("Native already registered"));
+		host_sysentvec = sv;
+#ifdef COMPAT_FREEBSD32
+	}
+#endif
+	sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base;
+	sv->sv_timekeep_off = tk_base;
+	timekeep_push_vdso();
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_shutdown.c
--- a/head/sys/kern/kern_shutdown.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_shutdown.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_shutdown.c 230643 2012-01-28 14:00:21Z attilio $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_shutdown.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_ddb.h"
 #include "opt_kdb.h"
@@ -66,9 +66,7 @@
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 
 #include <ddb/ddb.h>
 
@@ -151,7 +149,7 @@
 
 /* Context information for dump-debuggers. */
 static struct pcb dumppcb;		/* Registers. */
-static lwpid_t dumptid;			/* Thread ID. */
+lwpid_t dumptid;			/* Thread ID. */
 
 static void poweroff_wait(void *, int);
 static void shutdown_halt(void *junk, int howto);
@@ -334,9 +332,7 @@
 
 		waittime = 0;
 
-#ifdef SW_WATCHDOG
 		wdog_kern_pat(WD_LASTVAL);
-#endif
 		sys_sync(curthread, NULL);
 
 		/*
@@ -362,9 +358,8 @@
 			if (nbusy < pbusy)
 				iter = 0;
 			pbusy = nbusy;
-#ifdef SW_WATCHDOG
+
 			wdog_kern_pat(WD_LASTVAL);
-#endif
 			sys_sync(curthread, NULL);
 
 #ifdef PREEMPTION
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_sig.c
--- a/head/sys/kern/kern_sig.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_sig.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_sig.c 234172 2012-04-12 10:48:43Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_sig.c 238336 2012-07-10 05:45:13Z davidxu $");
 
 #include "opt_compat.h"
 #include "opt_kdtrace.h"
@@ -2436,9 +2436,10 @@
 		}
 stopme:
 		thread_suspend_switch(td);
-		if (!(p->p_flag & P_TRACED)) {
+		if (p->p_xthread == td)
+			p->p_xthread = NULL;
+		if (!(p->p_flag & P_TRACED))
 			break;
-		}
 		if (td->td_dbgflags & TDB_SUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_synch.c
--- a/head/sys/kern/kern_synch.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_synch.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,8 +35,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_synch.c 234494 2012-04-20 15:32:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_synch.c 235459 2012-05-15 01:30:25Z rstone $");
 
+#include "opt_kdtrace.h"
 #include "opt_ktrace.h"
 #include "opt_sched.h"
 
@@ -51,6 +52,7 @@
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
@@ -105,6 +107,20 @@
 
 static void	loadav(void *arg);
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , preempt, preempt);
+
+/*
+ * These probes reference Solaris features that are not implemented in FreeBSD.
+ * Create the probes anyway for compatibility with existing D scripts; they'll
+ * just never fire.
+ */
+SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep);
+SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup);
+SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield);
+
 void
 sleepinit(void)
 {
@@ -462,6 +478,7 @@
 		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
 		    "lockname:\"%s\"", td->td_lockname);
 #endif
+	SDT_PROBE0(sched, , , preempt);
 #ifdef XEN
 	PT_UPDATES_FLUSH();
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_tc.c
--- a/head/sys/kern/kern_tc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_tc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -14,8 +14,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 232449 2012-03-03 08:19:18Z jmallett $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_tc.c 238537 2012-07-16 20:17:19Z gnn $");
 
+#include "opt_compat.h"
 #include "opt_ntp.h"
 #include "opt_ffclock.h"
 
@@ -32,6 +33,7 @@
 #include <sys/timepps.h>
 #include <sys/timetc.h>
 #include <sys/timex.h>
+#include <sys/vdso.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
@@ -120,6 +122,8 @@
 static void tc_windup(void);
 static void cpu_tick_calibrate(int);
 
+void dtrace_getnanotime(struct timespec *tsp);
+
 static int
 sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
 {
@@ -958,6 +962,24 @@
 #endif /* FFCLOCK */
 
 /*
+ * This is a clone of getnanotime and used for walltimestamps.
+ * The dtrace_ prefix prevents fbt from creating probes for
+ * it so walltimestamp can be safely used in all fbt probes.
+ */
+void
+dtrace_getnanotime(struct timespec *tsp)
+{
+	struct timehands *th;
+	u_int gen;
+
+	do {
+		th = timehands;
+		gen = th->th_generation;
+		*tsp = th->th_nanotime;
+	} while (gen == 0 || gen != th->th_generation);
+}
+
+/*
  * System clock currently providing time to the system. Modifiable via sysctl
  * when the FFCLOCK option is defined.
  */
@@ -1360,6 +1382,7 @@
 #endif
 
 	timehands = th;
+	timekeep_push_vdso();
 }
 
 /* Report or change the active timecounter hardware. */
@@ -1386,6 +1409,7 @@
 		(void)newtc->tc_get_timecount(newtc);
 
 		timecounter = newtc;
+		timekeep_push_vdso();
 		return (0);
 	}
 	return (EINVAL);
@@ -1844,3 +1868,63 @@
 }
 
 cpu_tick_f	*cpu_ticks = tc_cpu_ticks;
+
+static int vdso_th_enable = 1;
+static int
+sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
+{
+	int old_vdso_th_enable, error;
+
+	old_vdso_th_enable = vdso_th_enable;
+	error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
+	if (error != 0)
+		return (error);
+	vdso_th_enable = old_vdso_th_enable;
+	timekeep_push_vdso();
+	return (0);
+}
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
+
+uint32_t
+tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+	struct timehands *th;
+	uint32_t enabled;
+
+	th = timehands;
+	vdso_th->th_algo = VDSO_TH_ALGO_1;
+	vdso_th->th_scale = th->th_scale;
+	vdso_th->th_offset_count = th->th_offset_count;
+	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
+	vdso_th->th_offset = th->th_offset;
+	vdso_th->th_boottime = boottimebin;
+	enabled = cpu_fill_vdso_timehands(vdso_th);
+	if (!vdso_th_enable)
+		enabled = 0;
+	return (enabled);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+	struct timehands *th;
+	uint32_t enabled;
+
+	th = timehands;
+	vdso_th32->th_algo = VDSO_TH_ALGO_1;
+	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
+	vdso_th32->th_offset_count = th->th_offset_count;
+	vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
+	vdso_th32->th_offset.sec = th->th_offset.sec;
+	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
+	vdso_th32->th_boottime.sec = boottimebin.sec;
+	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac;
+	enabled = cpu_fill_vdso_timehands32(vdso_th32);
+	if (!vdso_th_enable)
+		enabled = 0;
+	return (enabled);
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_thr.c
--- a/head/sys/kern/kern_thr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_thr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_thr.c 234381 2012-04-17 13:44:40Z trasz $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_thr.c 236117 2012-05-26 20:03:47Z kib $");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
@@ -252,7 +252,6 @@
 
 	PROC_LOCK(td->td_proc);
 	td->td_proc->p_flag |= P_HADTHREADS;
-	newtd->td_sigmask = td->td_sigmask;
 	thread_link(newtd, p); 
 	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
 	thread_lock(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_thread.c
--- a/head/sys/kern/kern_thread.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_thread.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,10 +27,11 @@
  */
 
 #include "opt_witness.h"
+#include "opt_kdtrace.h"
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_thread.c 229429 2012-01-03 21:03:28Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_thread.c 236317 2012-05-30 16:06:38Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -38,7 +39,9 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rangelock.h>
 #include <sys/resourcevar.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
@@ -59,6 +62,10 @@
 #include <vm/uma.h>
 #include <sys/eventhandler.h>
 
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE(proc, , , lwp_exit, lwp-exit);
+
+
 /*
  * thread related storage.
  */
@@ -199,6 +206,7 @@
 
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
+	td->td_rlqe = NULL;
 	EVENTHANDLER_INVOKE(thread_init, td);
 	td->td_sched = (struct td_sched *)&td[1];
 	umtx_thread_init(td);
@@ -216,6 +224,7 @@
 
 	td = (struct thread *)mem;
 	EVENTHANDLER_INVOKE(thread_fini, td);
+	rlqentry_free(td->td_rlqe);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/kern_timeout.c
--- a/head/sys/kern/kern_timeout.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/kern_timeout.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/kern_timeout.c 227293 2011-11-07 06:44:47Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/kern_timeout.c 234981 2012-05-03 20:00:30Z kib $");
 
 #include "opt_kdtrace.h"
 
@@ -437,6 +437,181 @@
 	}
 }
 
+static void
+callout_cc_del(struct callout *c, struct callout_cpu *cc)
+{
+
+	if (cc->cc_next == c)
+		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
+		c->c_func = NULL;
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+	}
+}
+
+static struct callout *
+softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
+    int *lockcalls, int *gcalls)
+{
+	void (*c_func)(void *);
+	void *c_arg;
+	struct lock_class *class;
+	struct lock_object *c_lock;
+	int c_flags, sharedlock;
+#ifdef SMP
+	struct callout_cpu *new_cc;
+	void (*new_func)(void *);
+	void *new_arg;
+	int new_cpu, new_ticks;
+#endif
+#ifdef DIAGNOSTIC
+	struct bintime bt1, bt2;
+	struct timespec ts2;
+	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
+	static timeout_t *lastfunc;
+#endif
+
+	cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
+	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+	c_lock = c->c_lock;
+	c_func = c->c_func;
+	c_arg = c->c_arg;
+	c_flags = c->c_flags;
+	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+		c->c_flags = CALLOUT_LOCAL_ALLOC;
+	else
+		c->c_flags &= ~CALLOUT_PENDING;
+	cc->cc_curr = c;
+	cc->cc_cancel = 0;
+	CC_UNLOCK(cc);
+	if (c_lock != NULL) {
+		class->lc_lock(c_lock, sharedlock);
+		/*
+		 * The callout may have been cancelled
+		 * while we switched locks.
+		 */
+		if (cc->cc_cancel) {
+			class->lc_unlock(c_lock);
+			goto skip;
+		}
+		/* The callout cannot be stopped now. */
+		cc->cc_cancel = 1;
+
+		if (c_lock == &Giant.lock_object) {
+			(*gcalls)++;
+			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
+			    c, c_func, c_arg);
+		} else {
+			(*lockcalls)++;
+			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
+			    c, c_func, c_arg);
+		}
+	} else {
+		(*mpcalls)++;
+		CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p",
+		    c, c_func, c_arg);
+	}
+#ifdef DIAGNOSTIC
+	binuptime(&bt1);
+#endif
+	THREAD_NO_SLEEPING();
+	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
+	c_func(c_arg);
+	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
+	THREAD_SLEEPING_OK();
+#ifdef DIAGNOSTIC
+	binuptime(&bt2);
+	bintime_sub(&bt2, &bt1);
+	if (bt2.frac > maxdt) {
+		if (lastfunc != c_func || bt2.frac > maxdt * 2) {
+			bintime2timespec(&bt2, &ts2);
+			printf(
+		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
+			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
+		}
+		maxdt = bt2.frac;
+		lastfunc = c_func;
+	}
+#endif
+	CTR1(KTR_CALLOUT, "callout %p finished", c);
+	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+		class->lc_unlock(c_lock);
+skip:
+	CC_LOCK(cc);
+	/*
+	 * If the current callout is locally allocated (from
+	 * timeout(9)) then put it on the freelist.
+	 *
+	 * Note: we need to check the cached copy of c_flags because
+	 * if it was not local, then it's not safe to deref the
+	 * callout pointer.
+	 */
+	if (c_flags & CALLOUT_LOCAL_ALLOC) {
+		KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC,
+		    ("corrupted callout"));
+		c->c_func = NULL;
+		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+	}
+	cc->cc_curr = NULL;
+	if (cc->cc_waiting) {
+		/*
+		 * There is someone waiting for the
+		 * callout to complete.
+		 * If the callout was scheduled for
+		 * migration just cancel it.
+		 */
+		if (cc_cme_migrating(cc))
+			cc_cme_cleanup(cc);
+		cc->cc_waiting = 0;
+		CC_UNLOCK(cc);
+		wakeup(&cc->cc_waiting);
+		CC_LOCK(cc);
+	} else if (cc_cme_migrating(cc)) {
+#ifdef SMP
+		/*
+		 * If the callout was scheduled for
+		 * migration just perform it now.
+		 */
+		new_cpu = cc->cc_migration_cpu;
+		new_ticks = cc->cc_migration_ticks;
+		new_func = cc->cc_migration_func;
+		new_arg = cc->cc_migration_arg;
+		cc_cme_cleanup(cc);
+
+		/*
+		 * Handle deferred callout stops
+		 */
+		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+			CTR3(KTR_CALLOUT,
+			     "deferred cancelled %p func %p arg %p",
+			     c, new_func, new_arg);
+			callout_cc_del(c, cc);
+			goto nextc;
+		}
+
+		c->c_flags &= ~CALLOUT_DFRMIGRATION;
+
+		/*
+		 * It should be assert here that the
+		 * callout is not destroyed but that
+		 * is not easy.
+		 */
+		new_cc = callout_cpu_switch(c, cc, new_cpu);
+		callout_cc_add(c, new_cc, new_ticks, new_func, new_arg,
+		    new_cpu);
+		CC_UNLOCK(new_cc);
+		CC_LOCK(cc);
+#else
+		panic("migration should not happen");
+#endif
+	}
+#ifdef SMP
+nextc:
+#endif
+	return (cc->cc_next);
+}
+
 /*
  * The callout mechanism is based on the work of Adam M. Costello and 
  * George Varghese, published in a technical report entitled "Redesigning
@@ -465,12 +640,6 @@
 	int mpcalls;
 	int lockcalls;
 	int gcalls;
-#ifdef DIAGNOSTIC
-	struct bintime bt1, bt2;
-	struct timespec ts2;
-	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
-	static timeout_t *lastfunc;
-#endif
 
 #ifndef MAX_SOFTCLOCK_STEPS
 #define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */
@@ -492,7 +661,7 @@
 		cc->cc_softticks++;
 		bucket = &cc->cc_callwheel[curticks & callwheelmask];
 		c = TAILQ_FIRST(bucket);
-		while (c) {
+		while (c != NULL) {
 			depth++;
 			if (c->c_time != curticks) {
 				c = TAILQ_NEXT(c, c_links.tqe);
@@ -507,160 +676,10 @@
 					steps = 0;
 				}
 			} else {
-				void (*c_func)(void *);
-				void *c_arg;
-				struct lock_class *class;
-				struct lock_object *c_lock;
-				int c_flags, sharedlock;
-
-				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
 				TAILQ_REMOVE(bucket, c, c_links.tqe);
-				class = (c->c_lock != NULL) ?
-				    LOCK_CLASS(c->c_lock) : NULL;
-				sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ?
-				    0 : 1;
-				c_lock = c->c_lock;
-				c_func = c->c_func;
-				c_arg = c->c_arg;
-				c_flags = c->c_flags;
-				if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
-					c->c_flags = CALLOUT_LOCAL_ALLOC;
-				} else {
-					c->c_flags =
-					    (c->c_flags & ~CALLOUT_PENDING);
-				}
-				cc->cc_curr = c;
-				cc->cc_cancel = 0;
-				CC_UNLOCK(cc);
-				if (c_lock != NULL) {
-					class->lc_lock(c_lock, sharedlock);
-					/*
-					 * The callout may have been cancelled
-					 * while we switched locks.
-					 */
-					if (cc->cc_cancel) {
-						class->lc_unlock(c_lock);
-						goto skip;
-					}
-					/* The callout cannot be stopped now. */
-					cc->cc_cancel = 1;
-
-					if (c_lock == &Giant.lock_object) {
-						gcalls++;
-						CTR3(KTR_CALLOUT,
-						    "callout %p func %p arg %p",
-						    c, c_func, c_arg);
-					} else {
-						lockcalls++;
-						CTR3(KTR_CALLOUT, "callout lock"
-						    " %p func %p arg %p",
-						    c, c_func, c_arg);
-					}
-				} else {
-					mpcalls++;
-					CTR3(KTR_CALLOUT,
-					    "callout mpsafe %p func %p arg %p",
-					    c, c_func, c_arg);
-				}
-#ifdef DIAGNOSTIC
-				binuptime(&bt1);
-#endif
-				THREAD_NO_SLEEPING();
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_start, c, 0, 0, 0, 0);
-				c_func(c_arg);
-				SDT_PROBE(callout_execute, kernel, ,
-				    callout_end, c, 0, 0, 0, 0);
-				THREAD_SLEEPING_OK();
-#ifdef DIAGNOSTIC
-				binuptime(&bt2);
-				bintime_sub(&bt2, &bt1);
-				if (bt2.frac > maxdt) {
-					if (lastfunc != c_func ||
-					    bt2.frac > maxdt * 2) {
-						bintime2timespec(&bt2, &ts2);
-						printf(
-			"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
-						    c_func, c_arg,
-						    (intmax_t)ts2.tv_sec,
-						    ts2.tv_nsec);
-					}
-					maxdt = bt2.frac;
-					lastfunc = c_func;
-				}
-#endif
-				CTR1(KTR_CALLOUT, "callout %p finished", c);
-				if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
-					class->lc_unlock(c_lock);
-			skip:
-				CC_LOCK(cc);
-				/*
-				 * If the current callout is locally
-				 * allocated (from timeout(9))
-				 * then put it on the freelist.
-				 *
-				 * Note: we need to check the cached
-				 * copy of c_flags because if it was not
-				 * local, then it's not safe to deref the
-				 * callout pointer.
-				 */
-				if (c_flags & CALLOUT_LOCAL_ALLOC) {
-					KASSERT(c->c_flags ==
-					    CALLOUT_LOCAL_ALLOC,
-					    ("corrupted callout"));
-					c->c_func = NULL;
-					SLIST_INSERT_HEAD(&cc->cc_callfree, c,
-					    c_links.sle);
-				}
-				cc->cc_curr = NULL;
-				if (cc->cc_waiting) {
-
-					/*
-					 * There is someone waiting for the
-					 * callout to complete.
-					 * If the callout was scheduled for
-					 * migration just cancel it.
-					 */
-					if (cc_cme_migrating(cc))
-						cc_cme_cleanup(cc);
-					cc->cc_waiting = 0;
-					CC_UNLOCK(cc);
-					wakeup(&cc->cc_waiting);
-					CC_LOCK(cc);
-				} else if (cc_cme_migrating(cc)) {
-#ifdef SMP
-					struct callout_cpu *new_cc;
-					void (*new_func)(void *);
-					void *new_arg;
-					int new_cpu, new_ticks;
-
-					/*
-					 * If the callout was scheduled for
-					 * migration just perform it now.
-					 */
-					new_cpu = cc->cc_migration_cpu;
-					new_ticks = cc->cc_migration_ticks;
-					new_func = cc->cc_migration_func;
-					new_arg = cc->cc_migration_arg;
-					cc_cme_cleanup(cc);
-
-					/*
-					 * It should be assert here that the
-					 * callout is not destroyed but that
-					 * is not easy.
-					 */
-					new_cc = callout_cpu_switch(c, cc,
-					    new_cpu);
-					callout_cc_add(c, new_cc, new_ticks,
-					    new_func, new_arg, new_cpu);
-					CC_UNLOCK(new_cc);
-					CC_LOCK(cc);
-#else
-					panic("migration should not happen");
-#endif
-				}
+				c = softclock_call_cc(c, cc, &mpcalls,
+				    &lockcalls, &gcalls);
 				steps = 0;
-				c = cc->cc_next;
 			}
 		}
 	}
@@ -814,6 +833,7 @@
 			cc->cc_migration_ticks = to_ticks;
 			cc->cc_migration_func = ftn;
 			cc->cc_migration_arg = arg;
+			c->c_flags |= CALLOUT_DFRMIGRATION;
 			CTR5(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d to %u deferred",
 			    c, c->c_func, c->c_arg, to_ticks, cpu);
@@ -984,6 +1004,12 @@
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
+		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
+			c->c_flags &= ~CALLOUT_DFRMIGRATION;
+			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
+			    c, c->c_func, c->c_arg);
+			CC_UNLOCK(cc);
+			return (1);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
@@ -996,19 +1022,12 @@
 
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
-	if (cc->cc_next == c) {
-		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-	}
+	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+	    c, c->c_func, c->c_arg);
 	TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
 	    c_links.tqe);
+	callout_cc_del(c, cc);
 
-	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
-	    c, c->c_func, c->c_arg);
-
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
-		c->c_func = NULL;
-		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
-	}
 	CC_UNLOCK(cc);
 	return (1);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sched_4bsd.c
--- a/head/sys/kern/sched_4bsd.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sched_4bsd.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 232700 2012-03-08 19:41:05Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/sched_4bsd.c 235471 2012-05-15 10:58:17Z pluknet $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
@@ -50,6 +50,7 @@
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
@@ -244,12 +245,31 @@
 	   "allow threads to share a quantum");
 #endif
 
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+    "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+    "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+    "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+    "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+    "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+    "struct proc *");
+
 static __inline void
 sched_load_add(void)
 {
 
 	sched_tdcnt++;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 
 static __inline void
@@ -258,6 +278,7 @@
 
 	sched_tdcnt--;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
 }
 /*
  * Arrange to reschedule if necessary, taking the priorities and
@@ -795,10 +816,13 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
+	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
+		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
@@ -987,6 +1011,9 @@
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
+
+		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+
                 /* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
 #ifdef KDTRACE_HOOKS
@@ -1018,11 +1045,14 @@
 		 * needed to, or the thread_wait() or wait() will
 		 * need to reap it.
 		 */
+
+		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
-	}
+	} else
+		SDT_PROBE0(sched, , , remain_cpu);
 
 #ifdef SMP
 	if (td->td_flags & TDF_IDLETD)
@@ -1223,6 +1253,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 
 
 	/*
@@ -1315,6 +1347,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 
 	/*
 	 * Now that the thread is moving to the run-queue, set the lock
@@ -1362,6 +1396,7 @@
 	KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
+	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_rem();
@@ -1425,6 +1460,8 @@
 void
 sched_preempt(struct thread *td)
 {
+
+	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
 	thread_lock(td);
 	if (td->td_critnest > 1)
 		td->td_owepreempt = 1;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sched_ule.c
--- a/head/sys/kern/sched_ule.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sched_ule.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 234066 2012-04-09 18:24:58Z mav $");
+__FBSDID("$FreeBSD: head/sys/kern/sched_ule.c 236141 2012-05-27 10:25:20Z raj $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
@@ -53,6 +53,7 @@
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
@@ -76,7 +77,7 @@
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
-#if defined(__powerpc__) && defined(E500)
+#if defined(__powerpc__) && defined(BOOKE_E500)
 #error "This architecture is not currently compatible with ULE"
 #endif
 
@@ -327,6 +328,24 @@
 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
     NULL);
 
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+    "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+    "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+    "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+    "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *", 
+    "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *", 
+    "struct proc *");
+
 /*
  * Print the threads waiting on a run-queue.
  */
@@ -509,6 +528,7 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload++;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
@@ -528,6 +548,7 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload--;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
@@ -1625,10 +1646,13 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
 	    "prio:%d", td->td_priority, "new prio:%d", prio,
 	    KTR_ATTR_LINKED, sched_tdname(curthread));
+	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
+		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		    curthread);
 	} 
 	ts = td->td_sched;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
@@ -1879,6 +1903,7 @@
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
+		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 		sched_pctcpu_update(newtd->td_sched, 0);
@@ -1903,12 +1928,16 @@
 		tdq = TDQ_CPU(cpuid);
 		lock_profile_obtain_lock_success(
 		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+
+		SDT_PROBE0(sched, , , on_cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
-	} else
+	} else {
 		thread_unblock_switch(td, mtx);
+		SDT_PROBE0(sched, , , remain_cpu);
+	}
 	/*
 	 * Assert that all went well and return.
 	 */
@@ -2102,6 +2131,8 @@
 {
 	struct tdq *tdq;
 
+	SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+
 	thread_lock(td);
 	tdq = TDQ_SELF();
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
@@ -2330,6 +2361,8 @@
 	    sched_tdname(curthread));
 	KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
 	    KTR_ATTR_LINKED, sched_tdname(td));
+	SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 
+	    flags & SRQ_PREEMPTED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Recalculate the priority before we select the target cpu or
@@ -2375,6 +2408,7 @@
 
 	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
 	    "prio:%d", td->td_priority);
+	SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
 	tdq = TDQ_CPU(td->td_sched->ts_cpu);
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_bus.c
--- a/head/sys/kern/subr_bus.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_bus.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_bus.c 234152 2012-04-11 20:57:41Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_bus.c 235978 2012-05-25 07:32:26Z avg $");
 
 #include "opt_bus.h"
 
@@ -1909,6 +1909,8 @@
 
 	PDEBUG(("%s at %s with order %u as unit %d",
 	    name, DEVICENAME(dev), order, unit));
+	KASSERT(name != NULL || unit == -1,
+	    ("child device with wildcard name and specific unit number"));
 
 	child = make_device(dev, name, unit);
 	if (child == NULL)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_devstat.c
--- a/head/sys/kern/subr_devstat.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_devstat.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_devstat.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_devstat.c 238372 2012-07-11 18:50:50Z kib $");
+
+#include "opt_kdtrace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -44,6 +46,58 @@
 
 #include <machine/atomic.h>
 
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_io_start_probe_func_t dtrace_io_start_probe;
+dtrace_io_done_probe_func_t dtrace_io_done_probe;
+dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
+dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;
+
+uint32_t	dtio_start_id;
+uint32_t	dtio_done_id;
+uint32_t	dtio_wait_start_id;
+uint32_t	dtio_wait_done_id;
+
+#define DTRACE_DEVSTAT_START() \
+	if (dtrace_io_start_probe != NULL) \
+		(*dtrace_io_start_probe)(dtio_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_START() \
+	if (dtrace_io_start_probe != NULL) \
+		(*dtrace_io_start_probe)(dtio_start_id, bp, ds);
+
+#define DTRACE_DEVSTAT_DONE() \
+	if (dtrace_io_done_probe != NULL) \
+		(*dtrace_io_done_probe)(dtio_done_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_DONE() \
+	if (dtrace_io_done_probe != NULL) \
+		(*dtrace_io_done_probe)(dtio_done_id, bp, ds);
+
+#define DTRACE_DEVSTAT_WAIT_START() \
+	if (dtrace_io_wait_start_probe != NULL) \
+		(*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_WAIT_DONE() \
+	if (dtrace_io_wait_done_probe != NULL) \
+		(*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);
+
+#else /* ! KDTRACE_HOOKS */
+
+#define DTRACE_DEVSTAT_START()
+
+#define DTRACE_DEVSTAT_BIO_START()
+
+#define DTRACE_DEVSTAT_DONE()
+
+#define DTRACE_DEVSTAT_BIO_DONE()
+
+#define DTRACE_DEVSTAT_WAIT_START()
+
+#define DTRACE_DEVSTAT_WAIT_DONE()
+#endif /* KDTRACE_HOOKS */
+
 static int devstat_num_devs;
 static long devstat_generation = 1;
 static int devstat_version = DEVSTAT_VERSION;
@@ -227,6 +281,7 @@
 	}
 	ds->start_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
+	DTRACE_DEVSTAT_START();
 }
 
 void
@@ -241,6 +296,7 @@
 
 	binuptime(&bp->bio_t0);
 	devstat_start_transaction(ds, &bp->bio_t0);
+	DTRACE_DEVSTAT_BIO_START();
 }
 
 /*
@@ -312,6 +368,7 @@
 
 	ds->end_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
+	DTRACE_DEVSTAT_DONE();
 }
 
 void
@@ -334,6 +391,7 @@
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
 				DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
+	DTRACE_DEVSTAT_BIO_DONE();
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_dummy_vdso_tc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/head/sys/kern/subr_dummy_vdso_tc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -0,0 +1,49 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/subr_dummy_vdso_tc.c 237433 2012-06-22 07:06:40Z kib $");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/vdso.h>
+
+uint32_t
+cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+
+	return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+
+	return (0);
+}
+#endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_firmware.c
--- a/head/sys/kern/subr_firmware.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_firmware.c	Wed Jul 25 16:40:53 2012 +0300
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_firmware.c 234201 2012-04-13 04:22:42Z adrian $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_firmware.c 237546 2012-06-25 05:41:16Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -198,7 +198,7 @@
 		free(str, M_TEMP);
 		return NULL;
 	}
-	bzero(frp, sizeof(frp));	/* start from a clean record */
+	bzero(frp, sizeof(*frp));	/* start from a clean record */
 	frp->fw.name = str;
 	frp->fw.data = data;
 	frp->fw.datasize = datasize;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_rman.c
--- a/head/sys/kern/subr_rman.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_rman.c	Wed Jul 25 16:40:53 2012 +0300
@@ -58,7 +58,7 @@
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_rman.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_rman.c 236359 2012-05-31 17:27:05Z imp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -161,6 +161,7 @@
 rman_manage_region(struct rman *rm, u_long start, u_long end)
 {
 	struct resource_i *r, *s, *t;
+	int rv = 0;
 
 	DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
 	    rm->rm_descr, start, end));
@@ -188,13 +189,17 @@
 		TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
 	} else {
 		/* Check for any overlap with the current region. */
-		if (r->r_start <= s->r_end && r->r_end >= s->r_start)
-			return EBUSY;
+		if (r->r_start <= s->r_end && r->r_end >= s->r_start) {
+			rv = EBUSY;
+			goto out;
+		}
 
 		/* Check for any overlap with the next region. */
 		t = TAILQ_NEXT(s, r_link);
-		if (t && r->r_start <= t->r_end && r->r_end >= t->r_start)
-			return EBUSY;
+		if (t && r->r_start <= t->r_end && r->r_end >= t->r_start) {
+			rv = EBUSY;
+			goto out;
+		}
 
 		/*
 		 * See if this region can be merged with the next region.  If
@@ -225,9 +230,9 @@
 			TAILQ_INSERT_BEFORE(s, r, r_link);
 		}
 	}
-
+out:
 	mtx_unlock(rm->rm_mtx);
-	return 0;
+	return rv;
 }
 
 int
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_sleepqueue.c
--- a/head/sys/kern/subr_sleepqueue.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_sleepqueue.c	Wed Jul 25 16:40:53 2012 +0300
@@ -60,10 +60,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_sleepqueue.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_sleepqueue.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_sleepqueue_profiling.h"
 #include "opt_ddb.h"
+#include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
@@ -75,6 +76,7 @@
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
@@ -166,6 +168,9 @@
 static void	sleepq_switch(void *wchan, int pri);
 static void	sleepq_timeout(void *arg);
 
+SDT_PROBE_DECLARE(sched, , , sleep);
+SDT_PROBE_DECLARE(sched, , , wakeup);
+
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
  * SYSINIT.
@@ -534,6 +539,7 @@
 	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td, pri);
 	thread_lock_set(td, &sc->sc_lock);
+	SDT_PROBE0(sched, , , sleep);
 	TD_SET_SLEEPING(td);
 	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
@@ -715,6 +721,8 @@
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 
+	SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+
 	/* Remove the thread from the queue. */
 	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_smp.c
--- a/head/sys/kern/subr_smp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_smp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_smp.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_smp.c 236906 2012-06-11 18:47:26Z iwasaki $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -55,6 +55,7 @@
 #ifdef SMP
 volatile cpuset_t stopped_cpus;
 volatile cpuset_t started_cpus;
+volatile cpuset_t suspended_cpus;
 cpuset_t hlt_cpus_mask;
 cpuset_t logical_cpus_mask;
 
@@ -207,9 +208,10 @@
 #endif
 	static volatile u_int stopping_cpu = NOCPU;
 	int i;
+	volatile cpuset_t *cpus;
 
 	KASSERT(
-#if defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
 	    type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
 #else
 	    type == IPI_STOP || type == IPI_STOP_HARD,
@@ -231,8 +233,15 @@
 	/* send the stop IPI to all CPUs in map */
 	ipi_selected(map, type);
 
+#if defined(__amd64__) || defined(__i386__)
+	if (type == IPI_SUSPEND)
+		cpus = &suspended_cpus;
+	else
+#endif
+		cpus = &stopped_cpus;
+
 	i = 0;
-	while (!CPU_SUBSET(&stopped_cpus, &map)) {
+	while (!CPU_SUBSET(cpus, &map)) {
 		/* spin */
 		cpu_spinwait();
 		i++;
@@ -260,7 +269,7 @@
 	return (generic_stop_cpus(map, IPI_STOP_HARD));
 }
 
-#if defined(__amd64__)
+#if defined(__amd64__) || defined(__i386__)
 int
 suspend_cpus(cpuset_t map)
 {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_syscall.c
--- a/head/sys/kern/subr_syscall.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_syscall.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,7 +42,7 @@
 #include "opt_ktrace.h"
 #include "opt_kdtrace.h"
 
-__FBSDID("$FreeBSD: head/sys/kern/subr_syscall.c 234172 2012-04-12 10:48:43Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_syscall.c 236309 2012-05-30 13:44:42Z kib $");
 
 #include <sys/capability.h>
 #include <sys/ktr.h>
@@ -182,6 +182,12 @@
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	     syscallname(p, sa->code), td->td_locks));
+	KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
+	    ("System call %s returning with pagefaults disabled",
+	     syscallname(p, sa->code)));
+	KASSERT((td->td_pflags & TDP_NOSLEEPING) == 0,
+	    ("System call %s returning with sleep disabled",
+	     syscallname(p, sa->code)));
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_trap.c
--- a/head/sys/kern/subr_trap.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_trap.c	Wed Jul 25 16:40:53 2012 +0300
@@ -42,9 +42,8 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_trap.c 234494 2012-04-20 15:32:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_trap.c 236859 2012-06-10 20:24:01Z pjd $");
 
-#include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_kdtrace.h"
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_turnstile.c
--- a/head/sys/kern/subr_turnstile.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_turnstile.c	Wed Jul 25 16:40:53 2012 +0300
@@ -57,9 +57,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_turnstile.c 234303 2012-04-14 23:59:58Z davide $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_turnstile.c 235459 2012-05-15 01:30:25Z rstone $");
 
 #include "opt_ddb.h"
+#include "opt_kdtrace.h"
 #include "opt_turnstile_profiling.h"
 #include "opt_sched.h"
 
@@ -73,6 +74,7 @@
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 
@@ -167,6 +169,11 @@
 static int	turnstile_init(void *mem, int size, int flags);
 static void	turnstile_fini(void *mem, int size);
 
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , sleep, sleep);
+SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "struct thread *", 
+    "struct proc *");
+
 /*
  * Walks the chain of turnstiles and their owners to propagate the priority
  * of the thread being blocked to all the threads holding locks that have to
@@ -740,6 +747,8 @@
 		CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
 		    td->td_tid, lock, lock->lo_name);
 
+	SDT_PROBE0(sched, , , sleep);
+
 	THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 	mi_switch(SW_VOL | SWT_TURNSTILE, NULL);
 
@@ -916,6 +925,7 @@
 	while (!TAILQ_EMPTY(&pending_threads)) {
 		td = TAILQ_FIRST(&pending_threads);
 		TAILQ_REMOVE(&pending_threads, td, td_lockq);
+		SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
 		thread_lock(td);
 		THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/subr_witness.c
--- a/head/sys/kern/subr_witness.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/subr_witness.c	Wed Jul 25 16:40:53 2012 +0300
@@ -85,7 +85,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/subr_witness.c 233937 2012-04-06 06:53:58Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/kern/subr_witness.c 237623 2012-06-27 03:45:25Z alc $");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
@@ -564,7 +564,7 @@
 	 */
 	{ "bpf global lock", &lock_class_mtx_sleep },
 	{ "bpf interface lock", &lock_class_rw },
-	{ "bpf cdev lock", &lock_class_rw },
+	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
@@ -593,19 +593,22 @@
 	/*
 	 * CDEV
 	 */
-	{ "system map", &lock_class_mtx_sleep },
-	{ "vm page queue mutex", &lock_class_mtx_sleep },
+	{ "vm map (system)", &lock_class_mtx_sleep },
+	{ "vm page queue", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
-	 * 
 	 */
+	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_mtx_sleep },
-	{ "page lock", &lock_class_mtx_sleep },
-	{ "vm page queue mutex", &lock_class_mtx_sleep },
+	{ "vm page", &lock_class_mtx_sleep },
+	{ "vm page queue", &lock_class_mtx_sleep },
+	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
+	{ "pmap pv list", &lock_class_rw },
+	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_capability.c
--- a/head/sys/kern/sys_capability.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_capability.c	Wed Jul 25 16:40:53 2012 +0300
@@ -51,12 +51,12 @@
  * anonymous, rather than named, POSIX shared memory objects.
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/kern/sys_capability.c 236858 2012-06-10 20:22:10Z pjd $");
+
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_capability.c 232860 2012-03-12 11:56:57Z pho $");
-
 #include <sys/param.h>
 #include <sys/capability.h>
 #include <sys/file.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_generic.c
--- a/head/sys/kern/sys_generic.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_generic.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 232494 2012-03-04 14:55:37Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_generic.c 237195 2012-06-17 13:03:50Z davide $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -1255,7 +1255,7 @@
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
 	struct timeval atv, rtv, ttv;
-	int error = 0, timo;
+	int error, timo;
 	u_int nfds;
 	size_t ni;
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_procdesc.c
--- a/head/sys/kern/sys_procdesc.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_procdesc.c	Wed Jul 25 16:40:53 2012 +0300
@@ -59,7 +59,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_procdesc.c 225617 2011-09-16 13:58:51Z kmacy $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_procdesc.c 237277 2012-06-19 22:23:59Z pjd $");
 
 #include "opt_procdesc.h"
 
@@ -338,7 +338,7 @@
 
 /*
  * procdesc_close() - last close on a process descriptor.  If the process is
- * still running, terminate with SIGKILL (unless PD_DAEMON is set) and let
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
  * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
  */
 static int
@@ -386,7 +386,7 @@
 		 */
 		p->p_sigparent = SIGCHLD;
 		proc_reparent(p, initproc);
-		if ((pd->pd_flags & PD_DAEMON) == 0)
+		if ((pd->pd_flags & PDF_DAEMON) == 0)
 			kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/sys_process.c
--- a/head/sys/kern/sys_process.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/sys_process.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/sys_process.c 232048 2012-02-23 11:50:23Z kib $");
+__FBSDID("$FreeBSD: head/sys/kern/sys_process.c 238287 2012-07-09 09:24:46Z davidxu $");
 
 #include "opt_compat.h"
 
@@ -635,7 +635,7 @@
 	struct iovec iov;
 	struct uio uio;
 	struct proc *curp, *p, *pp;
-	struct thread *td2 = NULL;
+	struct thread *td2 = NULL, *td3;
 	struct ptrace_io_desc *piod = NULL;
 	struct ptrace_lwpinfo *pl;
 	int error, write, tmp, num;
@@ -953,10 +953,8 @@
 			td2->td_xsig = data;
 
 			if (req == PT_DETACH) {
-				struct thread *td3;
-				FOREACH_THREAD_IN_PROC(p, td3) {
+				FOREACH_THREAD_IN_PROC(p, td3)
 					td3->td_dbgflags &= ~TDB_SUSPEND; 
-				}
 			}
 			/*
 			 * unsuspend all threads, to not let a thread run,
@@ -967,6 +965,8 @@
 			p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
+			if (req == PT_ATTACH)
+				kern_psignal(p, data);
 		} else {
 			if (data)
 				kern_psignal(p, data);
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/syscalls.c
--- a/head/sys/kern/syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/syscalls.c 227776 2011-11-21 01:26:10Z lstewart $
- * created from FreeBSD: head/sys/kern/syscalls.master 227691 2011-11-19 06:35:15Z ed 
+ * $FreeBSD: head/sys/kern/syscalls.c 236027 2012-05-25 21:52:57Z ed $
+ * created from FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed 
  */
 
 const char *syscallnames[] = {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/syscalls.master
--- a/head/sys/kern/syscalls.master	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/syscalls.master	Wed Jul 25 16:40:53 2012 +0300
@@ -1,4 +1,4 @@
- $FreeBSD: head/sys/kern/syscalls.master 227776 2011-11-21 01:26:10Z lstewart $
+ $FreeBSD: head/sys/kern/syscalls.master 236026 2012-05-25 21:50:48Z ed $
 ;	from: @(#)syscalls.master	8.2 (Berkeley) 1/13/94
 ;
 ; System call name/number master file.
@@ -916,9 +916,9 @@
 512	AUE_SHMCTL	NOSTD	{ int shmctl(int shmid, int cmd, \
 				    struct shmid_ds *buf); }
 513	AUE_LPATHCONF	STD	{ int lpathconf(char *path, int name); }
-514	AUE_CAP_NEW	STD	{ int cap_new(int fd, u_int64_t rights); }
+514	AUE_CAP_NEW	STD	{ int cap_new(int fd, uint64_t rights); }
 515	AUE_CAP_GETRIGHTS	STD	{ int cap_getrights(int fd, \
-				    u_int64_t *rightsp); }
+				    uint64_t *rightsp); }
 516	AUE_CAP_ENTER	STD	{ int cap_enter(void); }
 517	AUE_CAP_GETMODE	STD	{ int cap_getmode(u_int *modep); }
 518	AUE_PDFORK	STD	{ int pdfork(int *fdp, int flags); }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/systrace_args.c
--- a/head/sys/kern/systrace_args.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/systrace_args.c	Wed Jul 25 16:40:53 2012 +0300
@@ -2,7 +2,7 @@
  * System call argument to DTrace register array converstion.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: head/sys/kern/systrace_args.c 227776 2011-11-21 01:26:10Z lstewart $
+ * $FreeBSD: head/sys/kern/systrace_args.c 236027 2012-05-25 21:52:57Z ed $
  * This file is part of the DTrace syscall provider.
  */
 
@@ -3121,7 +3121,7 @@
 	case 514: {
 		struct cap_new_args *p = params;
 		iarg[0] = p->fd; /* int */
-		uarg[1] = p->rights; /* u_int64_t */
+		uarg[1] = p->rights; /* uint64_t */
 		*n_args = 2;
 		break;
 	}
@@ -3129,7 +3129,7 @@
 	case 515: {
 		struct cap_getrights_args *p = params;
 		iarg[0] = p->fd; /* int */
-		uarg[1] = (intptr_t) p->rightsp; /* u_int64_t * */
+		uarg[1] = (intptr_t) p->rightsp; /* uint64_t * */
 		*n_args = 2;
 		break;
 	}
@@ -8434,7 +8434,7 @@
 			p = "int";
 			break;
 		case 1:
-			p = "u_int64_t";
+			p = "uint64_t";
 			break;
 		default:
 			break;
@@ -8447,7 +8447,7 @@
 			p = "int";
 			break;
 		case 1:
-			p = "u_int64_t *";
+			p = "uint64_t *";
 			break;
 		default:
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/tty.c
--- a/head/sys/kern/tty.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/tty.c	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/tty.c 232197 2012-02-26 20:56:49Z phk $");
+__FBSDID("$FreeBSD: head/sys/kern/tty.c 237219 2012-06-18 07:34:38Z pho $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -219,9 +219,15 @@
 static int
 ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	struct tty *tp = dev->si_drv1;
+	struct tty *tp;
 	int error = 0;
 
+	while ((tp = dev->si_drv1) == NULL) {
+		error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+		if (error != EWOULDBLOCK)
+			return (error);
+	}
+
 	tty_lock(tp);
 	if (tty_gone(tp)) {
 		/* Device is already gone. */
@@ -738,9 +744,14 @@
 static int
 ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	struct tty *tp = dev->si_drv1;
+	struct tty *tp;
 	int error = 0;
 
+	while ((tp = dev->si_drv1) == NULL) {
+		error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+		if (error != EWOULDBLOCK)
+			return (error);
+	}
 	tty_lock(tp);
 	if (tty_gone(tp))
 		error = ENODEV;
@@ -1203,6 +1214,7 @@
 	dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
 	    uid, gid, mode, "%s%s", prefix, name);
 	dev->si_drv1 = tp;
+	wakeup(&dev->si_drv1);
 	tp->t_dev = dev;
 
 	/* Slave call-in devices. */
@@ -1211,12 +1223,14 @@
 		    uid, gid, mode, "%s%s.init", prefix, name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 		dev->si_drv2 = &tp->t_termios_init_in;
 
 		dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
 		    uid, gid, mode, "%s%s.lock", prefix, name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 		dev->si_drv2 = &tp->t_termios_lock_in;
 	}
 
@@ -1226,6 +1240,7 @@
 		    UID_UUCP, GID_DIALER, 0660, "cua%s", name);
 		dev_depends(tp->t_dev, dev);
 		dev->si_drv1 = tp;
+		wakeup(&dev->si_drv1);
 
 		/* Slave call-out devices. */
 		if (tp->t_flags & TF_INITLOCK) {
@@ -1234,6 +1249,7 @@
 			    UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
 			dev_depends(tp->t_dev, dev);
 			dev->si_drv1 = tp;
+			wakeup(&dev->si_drv1);
 			dev->si_drv2 = &tp->t_termios_init_out;
 
 			dev = make_dev_cred(&ttyil_cdevsw,
@@ -1241,6 +1257,7 @@
 			    UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
 			dev_depends(tp->t_dev, dev);
 			dev->si_drv1 = tp;
+			wakeup(&dev->si_drv1);
 			dev->si_drv2 = &tp->t_termios_lock_out;
 		}
 	}
@@ -1817,9 +1834,6 @@
 {
 	struct tty *tp;
 	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_cap;
-#endif
 	struct cdev *dev;
 	struct cdevsw *cdp;
 	struct filedesc *fdp;
@@ -1838,10 +1852,9 @@
 	}
 
 #ifdef CAPABILITIES
-	fp_cap = fp;
-	error = cap_funwrap(fp_cap, CAP_TTYHOOK, &fp);
+	error = cap_funwrap(fp, CAP_TTYHOOK, &fp);
 	if (error)
-		return (error);
+		goto done1;
 #endif
 
 	/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_mqueue.c
--- a/head/sys/kern/uipc_mqueue.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_mqueue.c	Wed Jul 25 16:40:53 2012 +0300
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_mqueue.c 229272 2012-01-02 12:12:10Z ed $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_mqueue.c 234607 2012-04-23 14:10:34Z trasz $");
 
 #include "opt_compat.h"
 
@@ -703,7 +703,7 @@
 {
 	struct vnode *vp = (struct vnode *)context;
 
-	vrecycle(vp, curthread);
+	vrecycle(vp);
 	vdrop(vp);
 }
 
@@ -1065,7 +1065,7 @@
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
-		vrecycle(ap->a_vp, ap->a_td);
+		vrecycle(ap->a_vp);
 	return (0);
 }
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_socket.c
--- a/head/sys/kern/uipc_socket.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_socket.c	Wed Jul 25 16:40:53 2012 +0300
@@ -101,7 +101,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 233850 2012-04-03 18:38:00Z np $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_socket.c 238085 2012-07-03 19:08:02Z trociny $");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
@@ -635,7 +635,7 @@
 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 	if (so->so_options & SO_ACCEPTCONN) {
 		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
-		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_comp populated"));
+		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
 	}
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_syscalls.c
--- a/head/sys/kern/uipc_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 233004 2012-03-15 14:13:38Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_syscalls.c 236891 2012-06-11 16:08:03Z pjd $");
 
 #include "opt_capsicum.h"
 #include "opt_inet.h"
@@ -134,8 +134,7 @@
 	int error;
 #endif
 
-	fp = NULL;
-	if ((fdp == NULL) || ((fp = fget_unlocked(fdp, fd)) == NULL))
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		return (EBADF);
 #ifdef CAPABILITIES
 	/*
@@ -179,7 +178,6 @@
 		int	protocol;
 	} */ *uap;
 {
-	struct filedesc *fdp;
 	struct socket *so;
 	struct file *fp;
 	int fd, error;
@@ -191,7 +189,6 @@
 	if (error)
 		return (error);
 #endif
-	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd, 0);
 	if (error)
 		return (error);
@@ -199,7 +196,7 @@
 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error) {
-		fdclose(fdp, fp, fd, td);
+		fdclose(td->td_proc->p_fd, fp, fd, td);
 	} else {
 		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
 		td->td_retval[0] = fd;
@@ -1962,6 +1959,7 @@
 	 * and takes care of the overall progress.
 	 */
 	for (off = uap->offset, rem = uap->nbytes; ; ) {
+		struct mbuf *mtail = NULL;
 		int loopbytes = 0;
 		int space = 0;
 		int done = 0;
@@ -2181,10 +2179,13 @@
 			m0->m_len = xfsize;
 
 			/* Append to mbuf chain. */
-			if (m != NULL)
-				m_cat(m, m0);
+			if (mtail != NULL)
+				mtail->m_next = m0;
+			else if (m != NULL)
+				m_last(m)->m_next = m0;
 			else
 				m = m0;
+			mtail = m0;
 
 			/* Keep track of bits processed. */
 			loopbytes += xfsize;
@@ -2309,25 +2310,23 @@
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
-	struct filedesc *fdp;
 	struct file *nfp = NULL;
 	int error;
 	struct socket *head, *so;
 	int fd;
 	u_int fflag;
 
-	fdp = td->td_proc->p_fd;
 	AUDIT_ARG_FD(uap->sd);
 	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
 	if (error)
 		goto done2;
 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
-		goto done2;
+		goto done;
 	}
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error)
-		goto done2;
+		goto done;
 	/*
 	 * At this point we know we do have a assoc to pull
 	 * we proceed to get the fd setup. This may block
@@ -2374,7 +2373,7 @@
 	 * out from under us.
 	 */
 	if (error)
-		fdclose(fdp, nfp, fd, td);
+		fdclose(td->td_proc->p_fd, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/uipc_usrreq.c
--- a/head/sys/kern/uipc_usrreq.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/uipc_usrreq.c	Wed Jul 25 16:40:53 2012 +0300
@@ -57,7 +57,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 232317 2012-02-29 21:38:31Z trociny $");
+__FBSDID("$FreeBSD: head/sys/kern/uipc_usrreq.c 237036 2012-06-13 22:12:10Z pjd $");
 
 #include "opt_ddb.h"
 
@@ -1872,7 +1872,7 @@
 			FILEDESC_SLOCK(fdescp);
 			for (i = 0; i < oldfds; i++) {
 				fd = *fdp++;
-				if ((unsigned)fd >= fdescp->fd_nfiles ||
+				if (fd < 0 || fd >= fdescp->fd_nfiles ||
 				    fdescp->fd_ofiles[fd] == NULL) {
 					FILEDESC_SUNLOCK(fdescp);
 					error = EBADF;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_bio.c
--- a/head/sys/kern/vfs_bio.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_bio.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 232351 2012-03-01 18:45:25Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 236487 2012-06-02 19:39:12Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -2640,8 +2640,8 @@
 	if (bp != NULL) {
 		int lockflags;
 		/*
-		 * Buffer is in-core.  If the buffer is not busy, it must
-		 * be on a queue.
+		 * Buffer is in-core.  If the buffer is not busy nor managed,
+		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
@@ -2671,9 +2671,13 @@
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
-		BO_LOCK(bo);
-		bremfree(bp);
-		BO_UNLOCK(bo);
+		if (bp->b_flags & B_MANAGED)
+			MPASS(bp->b_qindex == QUEUE_NONE);
+		else {
+			BO_LOCK(bo);
+			bremfree(bp);
+			BO_UNLOCK(bo);
+		}
 
 		/*
 		 * check for size inconsistancies for non-VMIO case.
@@ -3991,7 +3995,9 @@
 	}
 
 	db_printf("buf at %p\n", bp);
-	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
+	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
+	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
+	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_default.c
--- a/head/sys/kern/vfs_default.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_default.c	Wed Jul 25 16:40:53 2012 +0300
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_default.c 234386 2012-04-17 16:28:22Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_default.c 236825 2012-06-09 22:26:53Z mckusick $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -343,8 +343,8 @@
 		if (error)
 			goto out;
 
-		if ((dp->d_type != DT_WHT) &&
-		    !strcmp(dp->d_name, dirname)) {
+		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+		    strcmp(dp->d_name, dirname) == 0) {
 			found = 1;
 			goto out;
 		}
@@ -646,8 +646,17 @@
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
-			continue;
+		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+			if (ap->a_waitfor != MNT_WAIT)
+				continue;
+			if (BUF_LOCK(bp,
+			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
+			    BO_MTX(bo)) != 0) {
+				BO_LOCK(bo);
+				goto loop1;
+			}
+			BO_LOCK(bo);
+		}
 		BO_UNLOCK(bo);
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_subr.c
--- a/head/sys/kern/vfs_subr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_subr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 234483 2012-04-20 07:00:28Z mckusick $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 236503 2012-06-03 08:01:12Z avg $");
 
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
@@ -73,9 +73,7 @@
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
-#ifdef SW_WATCHDOG
 #include <sys/watchdog.h>
-#endif
 
 #include <machine/stdarg.h>
 
@@ -1027,6 +1025,7 @@
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
+	rangelock_init(&vp->v_rl);
 
 	*vpp = vp;
 	return (0);
@@ -1327,8 +1326,7 @@
  * sync activity.
  */
 int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
-    off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
@@ -1869,10 +1867,10 @@
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
-#ifdef SW_WATCHDOG
+
 			if (first_printf == 0)
 				wdog_kern_pat(WD_LASTVAL);
-#endif
+
 		}
 		if (!LIST_EMPTY(gslp)) {
 			mtx_unlock(&sync_mtx);
@@ -2469,6 +2467,7 @@
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
+	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	mtx_destroy(BO_MTX(bo));
@@ -2660,7 +2659,7 @@
  * Recycle an unused vnode to the front of the free list.
  */
 int
-vrecycle(struct vnode *vp, struct thread *td)
+vrecycle(struct vnode *vp)
 {
 	int recycled;
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_syscalls.c
--- a/head/sys/kern/vfs_syscalls.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_syscalls.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_syscalls.c 234489 2012-04-20 10:08:30Z jh $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_syscalls.c 238029 2012-07-02 21:01:03Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -1093,8 +1093,7 @@
 	struct file *fp;
 	struct vnode *vp;
 	int cmode;
-	struct file *nfp;
-	int type, indx = -1, error, error_open;
+	int type, indx = -1, error;
 	struct flock lf;
 	struct nameidata nd;
 	int vfslocked;
@@ -1111,19 +1110,22 @@
 	if (flags & O_EXEC) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
-	} else if ((flags & O_ACCMODE) == O_ACCMODE)
+	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
-	else
+	} else {
 		flags = FFLAGS(flags);
+	}
 
 	/*
-	 * allocate the file descriptor, but don't install a descriptor yet
+	 * Allocate the file descriptor, but don't install a descriptor yet.
 	 */
-	error = falloc_noinstall(td, &nfp);
+	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);
-	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
-	fp = nfp;
+	/*
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
+	 */
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
@@ -1141,36 +1143,24 @@
 			goto success;
 
 		/*
-		 * handle special fdopen() case.  bleh.  dupfdopen() is
-		 * responsible for dropping the old contents of ofiles[indx]
-		 * if it succeeds.
+		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
-		if ((nd.ni_strictrelative == 0) &&
+		if (nd.ni_strictrelative == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
-		    (td->td_dupfd >= 0)) {
-			/* XXX from fdopen */
-			error_open = error;
-			if ((error = finstall(td, fp, &indx, flags)) != 0)
-				goto bad_unlocked;
-			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
-			    flags, error_open)) == 0)
+		    td->td_dupfd >= 0) {
+			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+			    &indx);
+			if (error == 0)
 				goto success;
 		}
-		/*
-		 * Clean up the descriptor, but only if another thread hadn't
-		 * replaced or closed it.
-		 */
-		if (indx != -1)
-			fdclose(fdp, fp, indx, td);
-		fdrop(fp, td);
 
 		if (error == ERESTART)
 			error = EINTR;
-		return (error);
+		goto bad_unlocked;
 	}
 	td->td_dupfd = 0;
 	vfslocked = NDHASGIANT(&nd);
@@ -1206,7 +1196,7 @@
 		if ((flags & FNONBLOCK) == 0)
 			type |= F_WAIT;
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0)
+		    type)) != 0)
 			goto bad;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
@@ -1247,10 +1237,8 @@
 bad:
 	VFS_UNLOCK_GIANT(vfslocked);
 bad_unlocked:
-	if (indx != -1)
-		fdclose(fdp, fp, indx, td);
+	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	fdrop(fp, td);
-	td->td_retval[0] = -1;
 	return (error);
 }
 
@@ -1993,7 +1981,7 @@
 	struct file *fp;
 	struct vnode *vp;
 	struct vattr vattr;
-	off_t offset, size;
+	off_t foffset, offset, size;
 	int error, noneg;
 	int vfslocked;
 
@@ -2005,18 +1993,19 @@
 		return (ESPIPE);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	noneg = (vp->v_type != VCHR);
 	offset = uap->offset;
 	switch (uap->whence) {
 	case L_INCR:
 		if (noneg &&
-		    (fp->f_offset < 0 ||
-		    (offset > 0 && fp->f_offset > OFF_MAX - offset))) {
+		    (foffset < 0 ||
+		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
-		offset += fp->f_offset;
+		offset += foffset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
@@ -2056,12 +2045,12 @@
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
-	fp->f_offset = offset;
 	VFS_KNOTE_UNLOCKED(vp, 0);
-	*(off_t *)(td->td_retval) = fp->f_offset;
+	*(off_t *)(td->td_retval) = offset;
 drop:
 	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
@@ -3994,6 +3983,7 @@
 	caddr_t dirbuf;
 	int error, eofflag, readcnt, vfslocked;
 	long loff;
+	off_t foffset;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
@@ -4006,10 +3996,12 @@
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
@@ -4022,12 +4014,13 @@
 	auio.uio_td = td;
 	auio.uio_resid = uap->count;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
-	loff = auio.uio_offset = fp->f_offset;
+	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, FOF_NOUPDATE);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -4036,7 +4029,7 @@
 		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
 			    NULL, NULL);
-			fp->f_offset = auio.uio_offset;
+			foffset = auio.uio_offset;
 		} else
 #	endif
 	{
@@ -4048,7 +4041,7 @@
 		kiov.iov_base = dirbuf;
 		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
 			    NULL, NULL);
-		fp->f_offset = kuio.uio_offset;
+		foffset = kuio.uio_offset;
 		if (error == 0) {
 			readcnt = uap->count - kuio.uio_resid;
 			edp = (struct dirent *)&dirbuf[readcnt];
@@ -4086,6 +4079,7 @@
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
+		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -4097,13 +4091,14 @@
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
-		fp->f_offset = 0;
+		foffset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
+	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
 	if (error == 0)
@@ -4136,7 +4131,8 @@
 	long base;
 	int error;
 
-	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
+	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+	    NULL, UIO_USERSPACE);
 	if (error)
 		return (error);
 	if (uap->basep != NULL)
@@ -4146,7 +4142,7 @@
 
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
-    long *basep)
+    long *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
@@ -4155,6 +4151,7 @@
 	int vfslocked;
 	long loff;
 	int error, eofflag;
+	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
@@ -4168,6 +4165,7 @@
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
+	foffset = foffset_lock(fp, 0);
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
@@ -4180,18 +4178,18 @@
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
-	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
-	loff = auio.uio_offset = fp->f_offset;
+	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
-	fp->f_offset = auio.uio_offset;
+	foffset = auio.uio_offset;
 	if (error) {
 		VOP_UNLOCK(vp, 0);
 		VFS_UNLOCK_GIANT(vfslocked);
@@ -4205,7 +4203,7 @@
 		VREF(vp);
 		fp->f_vnode = vp;
 		fp->f_data = vp;
-		fp->f_offset = 0;
+		foffset = 0;
 		vput(tvp);
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
@@ -4213,8 +4211,11 @@
 	VOP_UNLOCK(vp, 0);
 	VFS_UNLOCK_GIANT(vfslocked);
 	*basep = loff;
+	if (residp != NULL)
+		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
+	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
 }
@@ -4334,12 +4335,10 @@
 	struct file *fp;
 #ifdef CAPABILITIES
 	struct file *fp_fromcap;
+	int error;
 #endif
-	int error;
-
-	error = 0;
-	fp = NULL;
-	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
+
+	if (fdp == NULL || (fp = fget_unlocked(fdp, fd)) == NULL)
 		return (EBADF);
 #ifdef CAPABILITIES
 	/*
@@ -4481,24 +4480,19 @@
 		int flags;
 	} */ *uap;
 {
-	struct proc *p = td->td_proc;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
-	struct vattr vat;
-	struct vattr *vap = &vat;
 	struct flock lf;
 	struct file *fp;
-	register struct filedesc *fdp = p->p_fd;
 	int fmode, error, type;
-	accmode_t accmode;
-	struct file *nfp;
 	int vfslocked;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error)
 		return (error);
+	indx = -1;
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
@@ -4514,109 +4508,42 @@
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
-	if (error)
-		goto out;
+	if (error) {
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (error);
+	}
+
+	error = falloc_noinstall(td, &fp);
+	if (error) {
+		vput(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (error);
+	}
 	/*
-	 * from now on we have to make sure not
-	 * to forget about the vnode
-	 * any error that causes an abort must vput(vp)
-	 * just set error = err and 'goto bad;'.
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
 	 */
 
-	/*
-	 * from vn_open
-	 */
-	if (vp->v_type == VLNK) {
-		error = EMLINK;
+#ifdef INVARIANTS
+	td->td_dupfd = -1;
+#endif
+	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+	if (error) {
+		KASSERT(fp->f_ops == &badfileops,
+		    ("VOP_OPEN in fhopen() set f_ops"));
+		KASSERT(td->td_dupfd < 0,
+		    ("fhopen() encountered fdopen()"));
+
+		vput(vp);
 		goto bad;
 	}
-	if (vp->v_type == VSOCK) {
-		error = EOPNOTSUPP;
-		goto bad;
-	}
-	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
-		error = ENOTDIR;
-		goto bad;
-	}
-	accmode = 0;
-	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR) {
-			error = EISDIR;
-			goto bad;
-		}
-		error = vn_writechk(vp);
-		if (error)
-			goto bad;
-		accmode |= VWRITE;
-	}
-	if (fmode & FREAD)
-		accmode |= VREAD;
-	if ((fmode & O_APPEND) && (fmode & FWRITE))
-		accmode |= VAPPEND;
-#ifdef MAC
-	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
-	if (error)
-		goto bad;
+#ifdef INVARIANTS
+	td->td_dupfd = 0;
 #endif
-	if (accmode) {
-		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
-		if (error)
-			goto bad;
-	}
-	if (fmode & O_TRUNC) {
-		vfs_ref(mp);
-		VOP_UNLOCK(vp, 0);				/* XXX */
-		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
-			vrele(vp);
-			vfs_rel(mp);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
-		vfs_rel(mp);
-#ifdef MAC
-		/*
-		 * We don't yet have fp->f_cred, so use td->td_ucred, which
-		 * should be right.
-		 */
-		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
-		if (error == 0) {
-#endif
-			VATTR_NULL(vap);
-			vap->va_size = 0;
-			error = VOP_SETATTR(vp, vap, td->td_ucred);
-#ifdef MAC
-		}
-#endif
-		vn_finished_write(mp);
-		if (error)
-			goto bad;
-	}
-	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
-	if (error)
-		goto bad;
-
-	if (fmode & FWRITE) {
-		vp->v_writecount++;
-		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
-		    __func__, vp, vp->v_writecount);
-	}
-
-	/*
-	 * end of vn_open code
-	 */
-
-	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
-		if (fmode & FWRITE) {
-			vp->v_writecount--;
-			CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
-			    __func__, vp, vp->v_writecount);
-		}
-		goto bad;
-	}
-	/* An extra reference on `nfp' has been held for us by falloc(). */
-	fp = nfp;
-	nfp->f_vnode = vp;
-	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
+	fp->f_vnode = vp;
+	fp->f_seqcount = 1;
+	finit(fp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
+	VOP_UNLOCK(vp, 0);
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
@@ -4628,36 +4555,22 @@
 		type = F_FLOCK;
 		if ((fmode & FNONBLOCK) == 0)
 			type |= F_WAIT;
-		VOP_UNLOCK(vp, 0);
 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0) {
-			/*
-			 * The lock request failed.  Normally close the
-			 * descriptor but handle the case where someone might
-			 * have dup()d or close()d it when we weren't looking.
-			 */
-			fdclose(fdp, fp, indx, td);
-
-			/*
-			 * release our private reference
-			 */
-			fdrop(fp, td);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		    type)) != 0)
+			goto bad;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 	}
-
-	VOP_UNLOCK(vp, 0);
+	if (fmode & O_TRUNC) {
+		error = fo_truncate(fp, 0, td->td_ucred, td);
+		if (error)
+			goto bad;
+	}
+
+	error = finstall(td, fp, &indx, fmode);
+bad:
+	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
-	return (0);
-
-bad:
-	vput(vp);
-out:
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -4679,7 +4592,22 @@
 	} */ *uap;
 {
 	struct stat sb;
-	fhandle_t fh;
+	struct fhandle fh;
+	int error;
+
+	error = copyin(uap->u_fhp, &fh, sizeof(fh));
+	if (error != 0)
+		return (error);
+	error = kern_fhstat(td, fh, &sb);
+	if (error != 0)
+		return (error);
+	error = copyout(&sb, uap->sb, sizeof(sb));
+	return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
 	struct mount *mp;
 	struct vnode *vp;
 	int vfslocked;
@@ -4688,9 +4616,6 @@
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error)
 		return (error);
-	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
-	if (error)
-		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	vfslocked = VFS_LOCK_GIANT(mp);
@@ -4700,12 +4625,9 @@
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
-	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
+	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
-		return (error);
-	error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
@@ -4960,6 +4882,8 @@
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
+			new->fa_prevstart = 0;
+			new->fa_prevend = 0;
 			fp->f_advice = new;
 			new = fa;
 		}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/kern/vfs_vnops.c
--- a/head/sys/kern/vfs_vnops.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/kern/vfs_vnops.c	Wed Jul 25 16:40:53 2012 +0300
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/kern/vfs_vnops.c 232701 2012-03-08 20:27:20Z jhb $");
+__FBSDID("$FreeBSD: head/sys/kern/vfs_vnops.c 238029 2012-07-02 21:01:03Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -56,6 +56,7 @@
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
@@ -65,10 +66,15 @@
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
 #include <vm/vm_object.h>
+#include <vm/vm_page.h>
 
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
+static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
@@ -77,8 +83,8 @@
 static fo_close_t	vn_closefile;
 
 struct 	fileops vnops = {
-	.fo_read = vn_read,
-	.fo_write = vn_write,
+	.fo_read = vn_io_fault,
+	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
@@ -102,7 +108,8 @@
 }
 
 /*
- * Common code for vnode open operations.
+ * Common code for vnode open operations via a name lookup.
+ * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  * 
  * Note that this does NOT free nameidata for the successful case,
@@ -118,7 +125,6 @@
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
-	accmode_t accmode;
 	int vfslocked, mpsafe;
 
 	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
@@ -199,24 +205,44 @@
 		vfslocked = NDHASGIANT(ndp);
 		vp = ndp->ni_vp;
 	}
-	if (vp->v_type == VLNK) {
-		error = EMLINK;
+	error = vn_open_vnode(vp, fmode, cred, td, fp);
+	if (error)
 		goto bad;
-	}
-	if (vp->v_type == VSOCK) {
-		error = EOPNOTSUPP;
-		goto bad;
-	}
-	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
-		error = ENOTDIR;
-		goto bad;
-	}
+	*flagp = fmode;
+	if (!mpsafe)
+		VFS_UNLOCK_GIANT(vfslocked);
+	return (0);
+bad:
+	NDFREE(ndp, NDF_ONLY_PNBUF);
+	vput(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	*flagp = fmode;
+	ndp->ni_vp = NULL;
+	return (error);
+}
+
+/*
+ * Common code for vnode open operations once a vnode is located.
+ * Check permissions, and call the VOP_OPEN routine.
+ */
+int
+vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
+    struct thread *td, struct file *fp)
+{
+	accmode_t accmode;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	if (vp->v_type == VLNK)
+		return (EMLINK);
+	if (vp->v_type == VSOCK)
+		return (EOPNOTSUPP);
+	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
+		return (ENOTDIR);
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR) {
-			error = EISDIR;
-			goto bad;
-		}
+		if (vp->v_type == VDIR)
+			return (EISDIR);
 		accmode |= VWRITE;
 	}
 	if (fmode & FREAD)
@@ -228,40 +254,30 @@
 #ifdef MAC
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error)
-		goto bad;
+		return (error);
 #endif
 	if ((fmode & O_CREAT) == 0) {
 		if (accmode & VWRITE) {
 			error = vn_writechk(vp);
 			if (error)
-				goto bad;
+				return (error);
 		}
 		if (accmode) {
 		        error = VOP_ACCESS(vp, accmode, cred, td);
 			if (error)
-				goto bad;
+				return (error);
 		}
 	}
 	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
-		goto bad;
+		return (error);
 
 	if (fmode & FWRITE) {
 		vp->v_writecount++;
 		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
-	*flagp = fmode;
-	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
-	if (!mpsafe)
-		VFS_UNLOCK_GIANT(vfslocked);
+	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (0);
-bad:
-	NDFREE(ndp, NDF_ONLY_PNBUF);
-	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	*flagp = fmode;
-	ndp->ni_vp = NULL;
-	return (error);
 }
 
 /*
@@ -367,47 +383,19 @@
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
-vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
-    aresid, td)
-	enum uio_rw rw;
-	struct vnode *vp;
-	void *base;
-	int len;
-	off_t offset;
-	enum uio_seg segflg;
-	int ioflg;
-	struct ucred *active_cred;
-	struct ucred *file_cred;
-	ssize_t *aresid;
-	struct thread *td;
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+    struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
+	void *rl_cookie;
 	int error, lock_flags;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
-	if ((ioflg & IO_NODELOCKED) == 0) {
-		mp = NULL;
-		if (rw == UIO_WRITE) { 
-			if (vp->v_type != VCHR &&
-			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
-			    != 0)
-				return (error);
-			if (MNT_SHARED_WRITES(mp) ||
-			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
-				lock_flags = LK_SHARED;
-			} else {
-				lock_flags = LK_EXCLUSIVE;
-			}
-			vn_lock(vp, lock_flags | LK_RETRY);
-		} else
-			vn_lock(vp, LK_SHARED | LK_RETRY);
-
-	}
-	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
@@ -418,6 +406,33 @@
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
+
+	if ((ioflg & IO_NODELOCKED) == 0) {
+		if (rw == UIO_READ) {
+			rl_cookie = vn_rangelock_rlock(vp, offset,
+			    offset + len);
+		} else {
+			rl_cookie = vn_rangelock_wlock(vp, offset,
+			    offset + len);
+		}
+		mp = NULL;
+		if (rw == UIO_WRITE) { 
+			if (vp->v_type != VCHR &&
+			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+			    != 0)
+				goto out;
+			if (MNT_SHARED_WRITES(mp) ||
+			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
+				lock_flags = LK_SHARED;
+			else
+				lock_flags = LK_EXCLUSIVE;
+		} else
+			lock_flags = LK_SHARED;
+		vn_lock(vp, lock_flags | LK_RETRY);
+	} else
+		rl_cookie = NULL;
+
+	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
@@ -429,7 +444,7 @@
 	}
 #endif
 	if (error == 0) {
-		if (file_cred)
+		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
@@ -444,10 +459,13 @@
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
-		if (rw == UIO_WRITE && vp->v_type != VCHR)
+		VOP_UNLOCK(vp, 0);
+		if (mp != NULL)
 			vn_finished_write(mp);
-		VOP_UNLOCK(vp, 0);
 	}
+ out:
+	if (rl_cookie != NULL)
+		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
@@ -509,6 +527,110 @@
 	return (error);
 }
 
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+	struct mtx *mtxp;
+	off_t res;
+
+	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+	/*
+	 * Caller only wants the current f_offset value.  Assume that
+	 * the long and shorter integer types reads are atomic.
+	 */
+	if ((flags & FOF_NOLOCK) != 0)
+		return (fp->f_offset);
+#endif
+
+	/*
+	 * According to McKusick the vn lock was protecting f_offset here.
+	 * It is now protected by the FOFFSET_LOCKED flag.
+	 */
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if ((flags & FOF_NOLOCK) == 0) {
+		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+			    "vofflock", 0);
+		}
+		fp->f_vnread_flags |= FOFFSET_LOCKED;
+	}
+	res = fp->f_offset;
+	mtx_unlock(mtxp);
+	return (res);
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
+	struct mtx *mtxp;
+
+	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+	if ((flags & FOF_NOLOCK) != 0) {
+		if ((flags & FOF_NOUPDATE) == 0)
+			fp->f_offset = val;
+		if ((flags & FOF_NEXTOFF) != 0)
+			fp->f_nextoff = val;
+		return;
+	}
+#endif
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if ((flags & FOF_NOUPDATE) == 0)
+		fp->f_offset = val;
+	if ((flags & FOF_NEXTOFF) != 0)
+		fp->f_nextoff = val;
+	if ((flags & FOF_NOLOCK) == 0) {
+		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
+		    ("Lost FOFFSET_LOCKED"));
+		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+			wakeup(&fp->f_vnread_flags);
+		fp->f_vnread_flags = 0;
+	}
+	mtx_unlock(mtxp);
+}
+
+void
+foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+	if ((flags & FOF_OFFSET) == 0)
+		uio->uio_offset = foffset_lock(fp, flags);
+}
+
+void
+foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+	if ((flags & FOF_OFFSET) == 0)
+		foffset_unlock(fp, uio->uio_offset, flags);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+	struct mtx *mtxp;
+	int ret;
+
+	ret = POSIX_FADV_NORMAL;
+	if (fp->f_advice == NULL)
+		return (ret);
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	if (uio->uio_offset >= fp->f_advice->fa_start &&
+	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+		ret = fp->f_advice->fa_advice;
+	mtx_unlock(mtxp);
+	return (ret);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -521,44 +643,22 @@
 	struct thread *td;
 {
 	struct vnode *vp;
+	struct mtx *mtxp;
 	int error, ioflag;
-	struct mtx *mtxp;
 	int advice, vfslocked;
-	off_t offset;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
-	mtxp = NULL;
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	advice = POSIX_FADV_NORMAL;
+	advice = get_advice(fp, uio);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	/*
-	 * According to McKusick the vn lock was protecting f_offset here.
-	 * It is now protected by the FOFFSET_LOCKED flag.
-	 */
-	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if ((flags & FOF_OFFSET) == 0) {
-			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
-				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-				    "vnread offlock", 0);
-			}
-			fp->f_vnread_flags |= FOFFSET_LOCKED;
-			uio->uio_offset = fp->f_offset;
-		}
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
@@ -578,20 +678,42 @@
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0) {
-		fp->f_offset = uio->uio_offset;
-		mtx_lock(mtxp);
-		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-			wakeup(&fp->f_vnread_flags);
-		fp->f_vnread_flags = 0;
-		mtx_unlock(mtxp);
-	}
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
-	    offset != uio->uio_offset)
-		error = VOP_ADVISE(vp, offset, uio->uio_offset - 1,
-		    POSIX_FADV_DONTNEED);
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
+		 * case of using POSIX_FADV_NOREUSE with sequential
+		 * access, track the previous implicit DONTNEED
+		 * request and grow this request to include the
+		 * current read(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continously grow to
+		 * cover all of the previously read regions of the
+		 * file.  This allows filesystem blocks that are
+		 * accessed by multiple calls to read(2) to be flushed
+		 * once the last read(2) finishes.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -609,12 +731,14 @@
 {
 	struct vnode *vp;
 	struct mount *mp;
+	struct mtx *mtxp;
 	int error, ioflag, lock_flags;
-	struct mtx *mtxp;
 	int advice, vfslocked;
+	off_t offset, start, end;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
+	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type == VREG)
@@ -633,6 +757,8 @@
 	if (vp->v_type != VCHR &&
 	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto unlock;
+
+	advice = get_advice(fp, uio);
  
 	if ((MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
@@ -643,74 +769,360 @@
 	}
 
 	vn_lock(vp, lock_flags | LK_RETRY);
-	if ((flags & FOF_OFFSET) == 0)
-		uio->uio_offset = fp->f_offset;
-	advice = POSIX_FADV_NORMAL;
-	if (fp->f_advice != NULL) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		if (fp->f_advice != NULL &&
-		    uio->uio_offset >= fp->f_advice->fa_start &&
-		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-			advice = fp->f_advice->fa_advice;
-		mtx_unlock(mtxp);
-	}
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
+	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
-	case POSIX_FADV_NOREUSE:
-		/*
-		 * Request the underlying FS to discard the buffers
-		 * and pages after the I/O is complete.
-		 */
-		ioflag |= IO_DIRECT;
-		break;
 	}
+	offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0)
-		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
 	if (vp->v_type != VCHR)
 		vn_finished_write(mp);
+	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+	    offset != uio->uio_offset) {
+		/*
+		 * Use POSIX_FADV_DONTNEED to flush clean pages and
+		 * buffers for the backing file after a
+		 * POSIX_FADV_NOREUSE write(2).  To optimize the
+		 * common case of using POSIX_FADV_NOREUSE with
+		 * sequential access, track the previous implicit
+		 * DONTNEED request and grow this request to include
+		 * the current write(2) in addition to the previous
+		 * DONTNEED.  With purely sequential access this will
+		 * cause the DONTNEED requests to continuously grow to
+		 * cover all of the previously written regions of the
+		 * file.
+		 *
+		 * Note that the blocks just written are almost
+		 * certainly still dirty, so this only works when
+		 * VOP_ADVISE() calls from subsequent writes push out
+		 * the data written by this write(2) once the backing
+		 * buffers are clean.  However, as compared to forcing
+		 * IO_DIRECT, this gives much saner behavior.  Write
+		 * clustering is still allowed, and clean pages are
+		 * merely moved to the cache page queue rather than
+		 * outright thrown away.  This means a subsequent
+		 * read(2) can still avoid hitting the disk if the
+		 * pages have not been reclaimed.
+		 *
+		 * This does make POSIX_FADV_NOREUSE largely useless
+		 * with non-sequential access.  However, sequential
+		 * access is the more common use case and the flag is
+		 * merely advisory.
+		 */
+		start = offset;
+		end = uio->uio_offset - 1;
+		mtxp = mtx_pool_find(mtxpool_sleep, fp);
+		mtx_lock(mtxp);
+		if (fp->f_advice != NULL &&
+		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+				start = fp->f_advice->fa_prevstart;
+			else if (fp->f_advice->fa_prevstart != 0 &&
+			    fp->f_advice->fa_prevstart == end + 1)
+				end = fp->f_advice->fa_prevend;
+			fp->f_advice->fa_prevstart = start;
+			fp->f_advice->fa_prevend = end;
+		}
+		mtx_unlock(mtxp);
+		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+	}
+	
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+    &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static unsigned long vn_io_faults_cnt;
+SYSCTL_LONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+    &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that the thread A reads from the vnode vp1 into userspace
+ * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
+ * currently not resident, then system ends up with the call chain
+ *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ() or VOP_WRITE().
+ * Instead, it first tries to do the whole range i/o with pagefaults
+ * disabled. If all pages in the i/o buffer are resident and mapped,
+ * VOP will succeed (ignoring the genuine filesystem errors).
+ * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
+ * i/o in chunks, with all pages in the chunk prefaulted and held
+ * using vm_fault_quick_hold_pages().
+ *
+ * Filesystems using this deadlock avoidance scheme should use the
+ * array of the held pages from uio, saved in the curthread->td_ma,
+ * instead of doing uiomove().  A helper function
+ * vn_io_fault_uiomove() converts uiomove request into
+ * uiomove_fromphys() over td_ma array.
+ *
+ * Since vnode locks do not cover the whole i/o anymore, rangelocks
+ * make the current i/o request atomic with respect to other i/os and
+ * truncations.
+ */
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
+{
+	vm_page_t ma[io_hold_cnt + 2];
+	struct uio *uio_clone, short_uio;
+	struct iovec short_iovec[1];
+	fo_rdwr_t *doio;
+	struct vnode *vp;
+	void *rl_cookie;
+	struct mount *mp;
+	vm_page_t *prev_td_ma;
+	int cnt, error, save, saveheld, prev_td_ma_cnt;
+	vm_offset_t addr, end;
+	vm_prot_t prot;
+	size_t len, resid;
+	ssize_t adv;
+
+	if (uio->uio_rw == UIO_READ)
+		doio = vn_read;
+	else
+		doio = vn_write;
+	vp = fp->f_vnode;
+	foffset_lock_uio(fp, uio, flags);
+
+	if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
+	    ((mp = vp->v_mount) != NULL &&
+	    (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
+	    !vn_io_fault_enable) {
+		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+		goto out_last;
+	}
+
+	/*
+	 * The UFS follows IO_UNIT directive and replays back both
+	 * uio_offset and uio_resid if an error is encountered during the
+	 * operation.  But, since the iovec may be already advanced,
+	 * uio is still in an inconsistent state.
+	 *
+	 * Cache a copy of the original uio, which is advanced to the redo
+	 * point using UIO_NOCOPY below.
+	 */
+	uio_clone = cloneuio(uio);
+	resid = uio->uio_resid;
+
+	short_uio.uio_segflg = UIO_USERSPACE;
+	short_uio.uio_rw = uio->uio_rw;
+	short_uio.uio_td = uio->uio_td;
+
+	if (uio->uio_rw == UIO_READ) {
+		prot = VM_PROT_WRITE;
+		rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+		    uio->uio_offset + uio->uio_resid);
+	} else {
+		prot = VM_PROT_READ;
+		if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
+			/* For appenders, punt and lock the whole range. */
+			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+		else
+			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+			    uio->uio_offset + uio->uio_resid);
+	}
+
+	save = vm_fault_disable_pagefaults();
+	error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+	if (error != EFAULT)
+		goto out;
+
+	atomic_add_long(&vn_io_faults_cnt, 1);
+	uio_clone->uio_segflg = UIO_NOCOPY;
+	uiomove(NULL, resid - uio->uio_resid, uio_clone);
+	uio_clone->uio_segflg = uio->uio_segflg;
+
+	saveheld = curthread_pflags_set(TDP_UIOHELD);
+	prev_td_ma = td->td_ma;
+	prev_td_ma_cnt = td->td_ma_cnt;
+
+	while (uio_clone->uio_resid != 0) {
+		len = uio_clone->uio_iov->iov_len;
+		if (len == 0) {
+			KASSERT(uio_clone->uio_iovcnt >= 1,
+			    ("iovcnt underflow"));
+			uio_clone->uio_iov++;
+			uio_clone->uio_iovcnt--;
+			continue;
+		}
+
+		addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
+		end = round_page(addr + len);
+		cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+		/*
+		 * A perfectly misaligned address and length could cause
+		 * both the start and the end of the chunk to use partial
+		 * page.  +2 accounts for such a situation.
+		 */
+		if (cnt > io_hold_cnt + 2) {
+			len = io_hold_cnt * PAGE_SIZE;
+			KASSERT(howmany(round_page(addr + len) -
+			    trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
+			    ("cnt overflow"));
+		}
+		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+		    addr, len, prot, ma, io_hold_cnt + 2);
+		if (cnt == -1) {
+			error = EFAULT;
+			break;
+		}
+		short_uio.uio_iov = &short_iovec[0];
+		short_iovec[0].iov_base = (void *)addr;
+		short_uio.uio_iovcnt = 1;
+		short_uio.uio_resid = short_iovec[0].iov_len = len;
+		short_uio.uio_offset = uio_clone->uio_offset;
+		td->td_ma = ma;
+		td->td_ma_cnt = cnt;
+
+		error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+		    td);
+		vm_page_unhold_pages(ma, cnt);
+		adv = len - short_uio.uio_resid;
+
+		uio_clone->uio_iov->iov_base =
+		    (char *)uio_clone->uio_iov->iov_base + adv;
+		uio_clone->uio_iov->iov_len -= adv;
+		uio_clone->uio_resid -= adv;
+		uio_clone->uio_offset += adv;
+
+		uio->uio_resid -= adv;
+		uio->uio_offset += adv;
+
+		if (error != 0 || adv == 0)
+			break;
+	}
+	td->td_ma = prev_td_ma;
+	td->td_ma_cnt = prev_td_ma_cnt;
+	curthread_pflags_restore(saveheld);
+out:
+	vm_fault_enable_pagefaults(save);
+	vn_rangelock_unlock(vp, rl_cookie);
+	free(uio_clone, M_IOV);
+out_last:
+	foffset_unlock_uio(fp, uio, flags);
+	return (error);
+}
+
+/*
+ * Helper function to perform the requested uiomove operation using
+ * the held pages for io->uio_iov[0].iov_base buffer instead of
+ * copyin/copyout.  Access to the pages with uiomove_fromphys()
+ * instead of iov_base prevents page faults that could occur due to
+ * pmap_collect() invalidating the mapping created by
+ * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
+ * object cleanup revoking the write access from page mappings.
+ *
+ * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
+ * instead of plain uiomove().
+ */
+int
+vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
+{
+	struct uio transp_uio;
+	struct iovec transp_iov[1];
+	struct thread *td;
+	size_t adv;
+	int error, pgadv;
+
+	td = curthread;
+	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+	    uio->uio_segflg != UIO_USERSPACE)
+		return (uiomove(data, xfersize, uio));
+
+	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+	transp_iov[0].iov_base = data;
+	transp_uio.uio_iov = &transp_iov[0];
+	transp_uio.uio_iovcnt = 1;
+	if (xfersize > uio->uio_resid)
+		xfersize = uio->uio_resid;
+	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
+	transp_uio.uio_offset = 0;
+	transp_uio.uio_segflg = UIO_SYSSPACE;
+	/*
+	 * Since transp_iov points to data, and td_ma page array
+	 * corresponds to original uio->uio_iov, we need to invert the
+	 * direction of the i/o operation as passed to
+	 * uiomove_fromphys().
+	 */
+	switch (uio->uio_rw) {
+	case UIO_WRITE:
+		transp_uio.uio_rw = UIO_READ;
+		break;
+	case UIO_READ:
+		transp_uio.uio_rw = UIO_WRITE;
+		break;
+	}
+	transp_uio.uio_td = uio->uio_td;
+	error = uiomove_fromphys(td->td_ma,
+	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
+	    xfersize, &transp_uio);
+	adv = xfersize - transp_uio.uio_resid;
+	pgadv =
+	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
+	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
+	td->td_ma += pgadv;
+	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+	    pgadv));
+	td->td_ma_cnt -= pgadv;
+	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
+	uio->uio_iov->iov_len -= adv;
+	uio->uio_resid -= adv;
+	uio->uio_offset += adv;
+	return (error);
+}
+
 /*
  * File table truncate routine.
  */
 static int
-vn_truncate(fp, length, active_cred, td)
-	struct file *fp;
-	off_t length;
-	struct ucred *active_cred;
-	struct thread *td;
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct vattr vattr;
 	struct mount *mp;
 	struct vnode *vp;
+	void *rl_cookie;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
+
+	/*
+	 * Lock the whole range for truncation.  Otherwise split i/o
+	 * might happen partly before and partly after the truncation.
+	 */
+	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
+	if (error)
+		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
@@ -730,7 +1142,9 @@
 out:
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
+out1:
 	VFS_UNLOCK_GIANT(vfslocked);
+	vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
@@ -1466,3 +1880,56 @@
 	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_UNLOCK(object);
 }
+
+int
+vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
+{
+	struct vattr va;
+	daddr_t bn, bnp;
+	uint64_t bsize;
+	off_t noff;
+	int error;
+
+	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
+	    ("Wrong command %lu", cmd));
+
+	if (vn_lock(vp, LK_SHARED) != 0)
+		return (EBADF);
+	if (vp->v_type != VREG) {
+		error = ENOTTY;
+		goto unlock;
+	}
+	error = VOP_GETATTR(vp, &va, cred);
+	if (error != 0)
+		goto unlock;
+	noff = *off;
+	if (noff >= va.va_size) {
+		error = ENXIO;
+		goto unlock;
+	}
+	bsize = vp->v_mount->mnt_stat.f_iosize;
+	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
+		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
+		if (error == EOPNOTSUPP) {
+			error = ENOTTY;
+			goto unlock;
+		}
+		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
+		    (bnp != -1 && cmd == FIOSEEKDATA)) {
+			noff = bn * bsize;
+			if (noff < *off)
+				noff = *off;
+			goto unlock;
+		}
+	}
+	if (noff > va.va_size)
+		noff = va.va_size;
+	/* noff == va.va_size. There is an implicit hole at the end of file. */
+	if (cmd == FIOSEEKDATA)
+		error = ENXIO;
+unlock:
+	VOP_UNLOCK(vp, 0);
+	if (error == 0)
+		*off = noff;
+	return (error);
+}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/icmp_var.h
--- a/head/sys/netinet/icmp_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/icmp_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)icmp_var.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/icmp_var.h 237230 2012-06-18 17:11:24Z tuexen $
  */
 
 #ifndef _NETINET_ICMP_VAR_H_
@@ -102,7 +102,8 @@
 #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
 #define BANDLIM_RST_OPENPORT 4   /* No connection, listener */
 #define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_MAX 5
+#define BANDLIM_SCTP_OOTB 6
+#define BANDLIM_MAX 6
 #endif
 
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/if_ether.c
--- a/head/sys/netinet/if_ether.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/if_ether.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/if_ether.c 230442 2012-01-22 02:13:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/if_ether.c 237263 2012-06-19 07:34:13Z np $");
 
 #include "opt_inet.h"
 
@@ -180,6 +180,17 @@
 		    callout_active(&lle->la_timer)) {
 			callout_stop(&lle->la_timer);
 			LLE_REMREF(lle);
+
+			if (lle->la_flags != LLE_DELETED) {
+				int evt;
+
+				if (lle->la_flags & LLE_VALID)
+					evt = LLENTRY_EXPIRED;
+				else
+					evt = LLENTRY_TIMEDOUT;
+				EVENTHANDLER_INVOKE(lle_event, lle, evt);
+			}
+
 			pkts_dropped = llentry_free(lle);
 			ARPSTAT_ADD(dropped, pkts_dropped);
 			ARPSTAT_INC(timeouts);
@@ -726,7 +737,7 @@
 		(void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
 		la->la_flags |= LLE_VALID;
 
-		EVENTHANDLER_INVOKE(arp_update_event, la);
+		EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 		if (!(la->la_flags & LLE_STATIC)) {
 			int canceled;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/if_ether.h
--- a/head/sys/netinet/if_ether.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/if_ether.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.h	8.3 (Berkeley) 5/2/95
- * $FreeBSD: head/sys/netinet/if_ether.h 229810 2012-01-08 13:34:00Z glebius $
+ * $FreeBSD: head/sys/netinet/if_ether.h 237263 2012-06-19 07:34:13Z np $
  */
 
 #ifndef _NETINET_IF_ETHER_H_
@@ -122,8 +122,14 @@
 void	arp_ifscrub(struct ifnet *, uint32_t);
 
 #include <sys/eventhandler.h>
-typedef void (*llevent_arp_update_fn)(void *, struct llentry *);
-EVENTHANDLER_DECLARE(arp_update_event, llevent_arp_update_fn);
+enum {
+	LLENTRY_RESOLVED,
+	LLENTRY_TIMEDOUT,
+	LLENTRY_DELETED,
+	LLENTRY_EXPIRED,
+};
+typedef void (*lle_event_fn)(void *, struct llentry *, int);
+EVENTHANDLER_DECLARE(lle_event, lle_event_fn);
 
 #endif
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/igmp.c
--- a/head/sys/netinet/igmp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/igmp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -48,7 +48,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 229621 2012-01-05 19:00:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 238084 2012-07-03 19:04:18Z trociny $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -2285,13 +2285,11 @@
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
-	if (ifp != NULL) {
-		/*
-		 * Sanity check that netinet's notion of ifp is the
-		 * same as net's.
-		 */
-		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
-	}
+	/*
+	 * Sanity check that netinet's notion of ifp is the
+	 * same as net's.
+	 */
+	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in.c
--- a/head/sys/netinet/in.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in.c	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/in.c 234087 2012-04-10 06:52:39Z glebius $");
+__FBSDID("$FreeBSD: head/sys/netinet/in.c 237263 2012-06-19 07:34:13Z np $");
 
 #include "opt_mpath.h"
 
@@ -1469,7 +1469,7 @@
 		if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) {
 			LLE_WLOCK(lle);
 			lle->la_flags = LLE_DELETED;
-			EVENTHANDLER_INVOKE(arp_update_event, lle);
+			EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 			LLE_WUNLOCK(lle);
 #ifdef DIAGNOSTIC
 			log(LOG_INFO, "ifaddr cache = %p  is deleted\n", lle);	
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in.h
--- a/head/sys/netinet/in.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in.h	8.3 (Berkeley) 1/3/94
- * $FreeBSD: head/sys/netinet/in.h 226402 2011-10-15 18:41:25Z glebius $
+ * $FreeBSD: head/sys/netinet/in.h 236959 2012-06-12 14:02:38Z tuexen $
  */
 
 #ifndef _NETINET_IN_H_
@@ -241,6 +241,7 @@
 #define	IPPROTO_PIM		103		/* Protocol Independent Mcast */
 #define	IPPROTO_CARP		112		/* CARP */
 #define	IPPROTO_PGM		113		/* PGM */
+#define	IPPROTO_MPLS		137		/* MPLS-in-IP */
 #define	IPPROTO_PFSYNC		240		/* PFSYNC */
 /* 255: Reserved */
 /* BSD Private, local use, namespace incursion, no longer used */
@@ -461,6 +462,7 @@
 #define	IP_RECVTTL		65   /* bool; receive IP TTL w/dgram */
 #define	IP_MINTTL		66   /* minimum TTL for packet or drop */
 #define	IP_DONTFRAG		67   /* don't fragment packet */
+#define	IP_RECVTOS		68   /* bool; receive IP TOS w/dgram */
 
 /* IPv4 Source Filter Multicast API [RFC3678] */
 #define	IP_ADD_SOURCE_MEMBERSHIP	70   /* join a source-specific group */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_pcb.c
--- a/head/sys/netinet/in_pcb.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_pcb.c	Wed Jul 25 16:40:53 2012 +0300
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/in_pcb.c 230442 2012-01-22 02:13:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/in_pcb.c 236959 2012-06-12 14:02:38Z tuexen $");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
@@ -2295,6 +2295,10 @@
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
+	if (inp_flags & INP_RECVTOS) {
+		db_printf("%sINP_RECVTOS", comma ? ", " : "");
+		comma = 1;
+	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_pcb.h
--- a/head/sys/netinet/in_pcb.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_pcb.h	Wed Jul 25 16:40:53 2012 +0300
@@ -32,7 +32,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD: head/sys/netinet/in_pcb.h 233096 2012-03-17 21:51:39Z rmh $
+ * $FreeBSD: head/sys/netinet/in_pcb.h 236959 2012-06-12 14:02:38Z tuexen $
  */
 
 #ifndef _NETINET_IN_PCB_H_
@@ -509,6 +509,7 @@
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
+#define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
@@ -528,7 +529,7 @@
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
-				 INP_RECVIF|INP_RECVTTL|\
+				 INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/in_var.h
--- a/head/sys/netinet/in_var.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/in_var.h	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)in_var.h	8.2 (Berkeley) 1/9/95
- * $FreeBSD: head/sys/netinet/in_var.h 229815 2012-01-08 17:20:29Z glebius $
+ * $FreeBSD: head/sys/netinet/in_var.h 238572 2012-07-18 08:41:00Z glebius $
  */
 
 #ifndef _NETINET_IN_VAR_H_
@@ -161,14 +161,16 @@
 #define IFP_TO_IA(ifp, ia)						\
 	/* struct ifnet *ifp; */					\
 	/* struct in_ifaddr *ia; */					\
-{									\
+do {									\
+	IN_IFADDR_RLOCK();						\
 	for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead);			\
 	    (ia) != NULL && (ia)->ia_ifp != (ifp);			\
 	    (ia) = TAILQ_NEXT((ia), ia_link))				\
 		continue;						\
 	if ((ia) != NULL)						\
 		ifa_ref(&(ia)->ia_ifa);					\
-}
+	IN_IFADDR_RUNLOCK();						\
+} while (0)
 #endif
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip.h
--- a/head/sys/netinet/ip.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip.h	Wed Jul 25 16:40:53 2012 +0300
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ip.h	8.2 (Berkeley) 6/1/94
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/ip.h 235036 2012-05-04 21:00:32Z delphij $
  */
 
 #ifndef _NETINET_IP_H_
@@ -92,6 +92,31 @@
 #define	IPTOS_PREC_ROUTINE		0x00
 
 /*
+ * Definitions for DiffServ Codepoints as per RFC2474
+ */
+#define	IPTOS_DSCP_CS0		0x00
+#define	IPTOS_DSCP_CS1		0x20
+#define	IPTOS_DSCP_AF11		0x28
+#define	IPTOS_DSCP_AF12		0x30
+#define	IPTOS_DSCP_AF13		0x38
+#define	IPTOS_DSCP_CS2		0x40
+#define	IPTOS_DSCP_AF21		0x48
+#define	IPTOS_DSCP_AF22		0x50
+#define	IPTOS_DSCP_AF23		0x58
+#define	IPTOS_DSCP_CS3		0x60
+#define	IPTOS_DSCP_AF31		0x68
+#define	IPTOS_DSCP_AF32		0x70
+#define	IPTOS_DSCP_AF33		0x78
+#define	IPTOS_DSCP_CS4		0x80
+#define	IPTOS_DSCP_AF41		0x88
+#define	IPTOS_DSCP_AF42		0x90
+#define	IPTOS_DSCP_AF43		0x98
+#define	IPTOS_DSCP_CS5		0xa0
+#define	IPTOS_DSCP_EF		0xb8
+#define	IPTOS_DSCP_CS6		0xc0
+#define	IPTOS_DSCP_CS7		0xe0
+
+/*
  * ECN (Explicit Congestion Notification) codepoints in RFC3168 mapped to the
  * lower 2 bits of the TOS field.
  */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_carp.c
--- a/head/sys/netinet/ip_carp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_carp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_carp.c 234130 2012-04-11 12:26:30Z glebius $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_carp.c 236310 2012-05-30 13:51:00Z glebius $");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
@@ -696,7 +696,7 @@
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
-	bcopy(&sc, (caddr_t)(mtag + 1), sizeof(struct carp_softc *));
+	bcopy(&sc, mtag + 1, sizeof(sc));
 	m_tag_prepend(m, mtag);
 
 	return (0);
@@ -1061,13 +1061,12 @@
 			IF_ADDR_RUNLOCK(ifp);
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
-			    sizeof(struct ifnet *), M_NOWAIT);
+			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
-			bcopy(&ifp, (caddr_t)(mtag + 1),
-			    sizeof(struct ifnet *));
+			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
@@ -1391,7 +1390,7 @@
 	if (mtag == NULL)
 		return (0);
 
-	bcopy(mtag + 1, &sc, sizeof(struct carp_softc *));
+	bcopy(mtag + 1, &sc, sizeof(sc));
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	switch (ifp->if_type) {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_fw.h
--- a/head/sys/netinet/ip_fw.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_fw.h	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/netinet/ip_fw.h 233478 2012-03-25 20:37:59Z melifaro $
+ * $FreeBSD: head/sys/netinet/ip_fw.h 234946 2012-05-03 08:56:43Z melifaro $
  */
 
 #ifndef _IPFW2_H
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_icmp.c
--- a/head/sys/netinet/ip_icmp.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_icmp.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_icmp.c 229749 2012-01-07 00:11:36Z eadler $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_icmp.c 237230 2012-06-18 17:11:24Z tuexen $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -965,7 +965,8 @@
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
-		{ "icmp6 unreach response" }
+		{ "icmp6 unreach response" },
+		{ "sctp ootb response" }
 	};
 
 	/*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_input.c
--- a/head/sys/netinet/ip_input.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_input.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 229621 2012-01-05 19:00:36Z jhb $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_input.c 238092 2012-07-04 07:37:53Z glebius $");
 
 #include "opt_bootp.h"
 #include "opt_ipfw.h"
@@ -1495,8 +1495,7 @@
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
+	RO_RTFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
@@ -1684,6 +1683,12 @@
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
+	if (inp->inp_flags & INP_RECVTOS) {
+		*mp = sbcreatecontrol((caddr_t) &ip->ip_tos,
+		    sizeof(u_char), IP_RECVTOS, IPPROTO_IP);
+		if (*mp)
+			mp = &(*mp)->m_next;
+	}
 }
 
 /*
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_mroute.c
--- a/head/sys/netinet/ip_mroute.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_mroute.c	Wed Jul 25 16:40:53 2012 +0300
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_mroute.c 232517 2012-03-04 18:59:38Z zec $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_mroute.c 238016 2012-07-02 19:44:18Z glebius $");
 
 #include "opt_inet.h"
 #include "opt_mrouting.h"
@@ -924,7 +924,6 @@
     vifp->v_pkt_out   = 0;
     vifp->v_bytes_in  = 0;
     vifp->v_bytes_out = 0;
-    bzero(&vifp->v_route, sizeof(vifp->v_route));
 
     /* Adjust numvifs up if the vifi is higher than numvifs */
     if (V_numvifs <= vifcp->vifc_vifi)
@@ -1702,7 +1701,7 @@
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
-	error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
+	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
 	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
 	    (ptrdiff_t)(vifp - V_viftable), error);
 }
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_mroute.h
--- a/head/sys/netinet/ip_mroute.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_mroute.h	Wed Jul 25 16:40:53 2012 +0300
@@ -31,7 +31,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)ip_mroute.h	8.1 (Berkeley) 6/10/93
- * $FreeBSD$
+ * $FreeBSD: head/sys/netinet/ip_mroute.h 238016 2012-07-02 19:44:18Z glebius $
  */
 
 #ifndef _NETINET_IP_MROUTE_H_
@@ -262,7 +262,6 @@
     u_long		v_pkt_out;	/* # pkts out on interface           */
     u_long		v_bytes_in;	/* # bytes in on interface	     */
     u_long		v_bytes_out;	/* # bytes out on interface	     */
-    struct route	v_route;	/* cached route */
 };
 
 #ifdef _KERNEL
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ip_output.c
--- a/head/sys/netinet/ip_output.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ip_output.c	Wed Jul 25 16:40:53 2012 +0300
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 227207 2011-11-06 10:47:20Z trociny $");
+__FBSDID("$FreeBSD: head/sys/netinet/ip_output.c 238573 2012-07-18 08:58:30Z glebius $");
 
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
@@ -105,6 +105,10 @@
  * ip_len and ip_off are in host format.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
+ * If route ro is present and has ro_rt initialized, route lookup would be
+ * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
+ * then result of route lookup is stored in ro->ro_rt.
+ *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
@@ -119,9 +123,8 @@
 	int mtu;
 	int n;	/* scratchpad */
 	int error = 0;
-	int nortfree = 0;
 	struct sockaddr_in *dst;
-	struct in_ifaddr *ia = NULL;
+	struct in_ifaddr *ia;
 	int isbroadcast, sw_csum;
 	struct route iproute;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
@@ -146,24 +149,23 @@
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
+	}
 
 #ifdef FLOWTABLE
-		{
-			struct flentry *fle;
+	if (ro->ro_rt == NULL) {
+		struct flentry *fle;
 			
-			/*
-			 * The flow table returns route entries valid for up to 30
-			 * seconds; we rely on the remainder of ip_output() taking no
-			 * longer than that long for the stability of ro_rt.  The
-			 * flow ID assignment must have happened before this point.
-			 */
-			if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) {
-				flow_to_route(fle, ro);
-				nortfree = 1;
-			}
-		}
+		/*
+		 * The flow table returns route entries valid for up to 30
+		 * seconds; we rely on the remainder of ip_output() taking no
+		 * longer than that long for the stability of ro_rt. The
+		 * flow ID assignment must have happened before this point.
+		 */
+		fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET);
+		if (fle != NULL)
+			flow_to_route(fle, ro);
+	}
 #endif
-	}
 
 	if (opt) {
 		int len = 0;
@@ -196,6 +198,7 @@
 
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 again:
+	ia = NULL;
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
@@ -209,10 +212,9 @@
 		    !RT_LINK_IS_UP(rte->rt_ifp) ||
 			  dst->sin_family != AF_INET ||
 			  dst->sin_addr.s_addr != ip->ip_dst.s_addr)) {
-		if (!nortfree)
-			RTFREE(rte);
-		rte = ro->ro_rt = (struct rtentry *)NULL;
-		ro->ro_lle = (struct llentry *)NULL;
+		RO_RTFREE(ro);
+		ro->ro_lle = NULL;
+		rte = NULL;
 	}
 #ifdef IPFIREWALL_FORWARD
 	if (rte == NULL && fwd_tag == NULL) {
@@ -532,8 +534,11 @@
 #endif
 			error = netisr_queue(NETISR_IP, m);
 			goto done;
-		} else
+		} else {
+			if (ia != NULL)
+				ifa_free(&ia->ia_ifa);
 			goto again;	/* Redo the routing table lookup. */
+		}
 	}
 
 #ifdef IPFIREWALL_FORWARD
@@ -563,6 +568,8 @@
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m_tag_delete(m, fwd_tag);
+		if (ia != NULL)
+			ifa_free(&ia->ia_ifa);
 		goto again;
 	}
 #endif /* IPFIREWALL_FORWARD */
@@ -672,9 +679,8 @@
 		IPSTAT_INC(ips_fragmented);
 
 done:
-	if (ro == &iproute && ro->ro_rt && !nortfree) {
-		RTFREE(ro->ro_rt);
-	}
+	if (ro == &iproute)
+		RO_RTFREE(ro);
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
@@ -984,6 +990,7 @@
 		case IP_FAITH:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
+		case IP_RECVTOS:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
@@ -1047,6 +1054,9 @@
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
+			case IP_RECVTOS:
+				OPTSET(INP_RECVTOS);
+				break;
 			}
 			break;
 #undef OPTSET
@@ -1156,6 +1166,7 @@
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
+		case IP_RECVTOS:
 			switch (sopt->sopt_name) {
 
 			case IP_TOS:
@@ -1214,6 +1225,9 @@
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
+			case IP_RECVTOS:
+				optval = OPTBIT(INP_RECVTOS);
+				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_dummynet.c
--- a/head/sys/netinet/ipfw/ip_dummynet.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_dummynet.c	Wed Jul 25 16:40:53 2012 +0300
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 222560 2011-06-01 12:33:05Z ae $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 238063 2012-07-03 08:42:48Z issyl0 $");
 
 /*
  * Configuration and internal object management for dummynet.
@@ -97,7 +97,7 @@
 	struct dn_alg *d;
 
 	SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
-		if (d->type == type || (name && !strcmp(d->name, name)))
+		if (d->type == type || (name && !strcasecmp(d->name, name)))
 			return d;
 	}
 	return NULL; /* not found */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_log.c
--- a/head/sys/netinet/ipfw/ip_fw_log.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_log.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 227085 2011-11-04 16:24:19Z bz $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 238277 2012-07-09 07:16:19Z hrs $");
 
 /*
  * Logging support for ipfw
@@ -44,8 +44,11 @@
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
+#include <net/if_clone.h>
 #include <net/vnet.h>
 #include <net/if_types.h>	/* for IFT_ETHER */
 #include <net/bpf.h>		/* for BPF */
@@ -90,6 +93,15 @@
 }
 #else /* !WITHOUT_BPF */
 static struct ifnet *log_if;	/* hook to attach to bpf */
+static struct rwlock log_if_lock;
+#define	LOGIF_LOCK_INIT(x)	rw_init(&log_if_lock, "ipfw log_if lock")
+#define	LOGIF_LOCK_DESTROY(x)	rw_destroy(&log_if_lock)
+#define	LOGIF_RLOCK(x)		rw_rlock(&log_if_lock)
+#define	LOGIF_RUNLOCK(x)	rw_runlock(&log_if_lock)
+#define	LOGIF_WLOCK(x)		rw_wlock(&log_if_lock)
+#define	LOGIF_WUNLOCK(x)	rw_wunlock(&log_if_lock)
+
+#define	IPFWNAME	"ipfw"
 
 /* we use this dummy function for all ifnet callbacks */
 static int
@@ -116,37 +128,105 @@
 static const u_char ipfwbroadcastaddr[6] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
+static int
+ipfw_log_clone_match(struct if_clone *ifc, const char *name)
+{
+
+	return (strncmp(name, IPFWNAME, sizeof(IPFWNAME) - 1) == 0);
+}
+
+static int
+ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len,
+    caddr_t params)
+{
+	int error;
+	int unit;
+	struct ifnet *ifp;
+
+	error = ifc_name2unit(name, &unit);
+	if (error)
+		return (error);
+
+	error = ifc_alloc_unit(ifc, &unit);
+	if (error)
+		return (error);
+
+	ifp = if_alloc(IFT_ETHER);
+	if (ifp == NULL) {
+		ifc_free_unit(ifc, unit);
+		return (ENOSPC);
+	}
+	ifp->if_dname = IPFWNAME;
+	ifp->if_dunit = unit;
+	snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", IPFWNAME, unit);
+	strlcpy(name, ifp->if_xname, len);
+	ifp->if_mtu = 65536;
+	ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
+	ifp->if_init = (void *)log_dummy;
+	ifp->if_ioctl = log_dummy;
+	ifp->if_start = ipfw_log_start;
+	ifp->if_output = ipfw_log_output;
+	ifp->if_addrlen = 6;
+	ifp->if_hdrlen = 14;
+	ifp->if_broadcastaddr = ipfwbroadcastaddr;
+	ifp->if_baudrate = IF_Mbps(10);
+
+	LOGIF_WLOCK();
+	if (log_if == NULL)
+		log_if = ifp;
+	else {
+		LOGIF_WUNLOCK();
+		if_free(ifp);
+		ifc_free_unit(ifc, unit);
+		return (EEXIST);
+	}
+	LOGIF_WUNLOCK();
+	if_attach(ifp);
+	bpfattach(ifp, DLT_EN10MB, 14);
+
+	return (0);
+}
+
+static int
+ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
+{
+	int unit;
+
+	if (ifp == NULL)
+		return (0);
+
+	LOGIF_WLOCK();
+	if (log_if != NULL && ifp == log_if)
+		log_if = NULL;
+	else {
+		LOGIF_WUNLOCK();
+		return (EINVAL);
+	}
+	LOGIF_WUNLOCK();
+
+	unit = ifp->if_dunit;
+	bpfdetach(ifp);
+	if_detach(ifp);
+	if_free(ifp);
+	ifc_free_unit(ifc, unit);
+
+	return (0);
+}
+
+static struct if_clone ipfw_log_cloner = IFC_CLONE_INITIALIZER(
+    IPFWNAME, NULL, IF_MAXUNIT,
+    NULL, ipfw_log_clone_match, ipfw_log_clone_create, ipfw_log_clone_destroy);
+
 void
 ipfw_log_bpf(int onoff)
 {
-	struct ifnet *ifp;
 
 	if (onoff) {
-		if (log_if)
-			return;
-		ifp = if_alloc(IFT_ETHER);
-		if (ifp == NULL)
-			return;
-		if_initname(ifp, "ipfw", 0);
-		ifp->if_mtu = 65536;
-		ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
-		ifp->if_init = (void *)log_dummy;
-		ifp->if_ioctl = log_dummy;
-		ifp->if_start = ipfw_log_start;
-		ifp->if_output = ipfw_log_output;
-		ifp->if_addrlen = 6;
-		ifp->if_hdrlen = 14;
-		if_attach(ifp);
-		ifp->if_broadcastaddr = ipfwbroadcastaddr;
-		ifp->if_baudrate = IF_Mbps(10);
-		bpfattach(ifp, DLT_EN10MB, 14);
-		log_if = ifp;
+		LOGIF_LOCK_INIT();
+		if_clone_attach(&ipfw_log_cloner);
 	} else {
-		if (log_if) {
-			ether_ifdetach(log_if);
-			if_free(log_if);
-		}
-		log_if = NULL;
+		if_clone_detach(&ipfw_log_cloner);
+		LOGIF_LOCK_DESTROY();
 	}
 }
 #endif /* !WITHOUT_BPF */
@@ -166,9 +246,11 @@
 
 	if (V_fw_verbose == 0) {
 #ifndef WITHOUT_BPF
-
-		if (log_if == NULL || log_if->if_bpf == NULL)
+		LOGIF_RLOCK();
+		if (log_if == NULL || log_if->if_bpf == NULL) {
+			LOGIF_RUNLOCK();
 			return;
+		}
 
 		if (args->eh) /* layer2, use orig hdr */
 			BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m);
@@ -177,6 +259,7 @@
 			 * more info in the header.
 			 */
 			BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m);
+		LOGIF_RUNLOCK();
 #endif /* !WITHOUT_BPF */
 		return;
 	}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_private.h
--- a/head/sys/netinet/ipfw/ip_fw_private.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_private.h	Wed Jul 25 16:40:53 2012 +0300
@@ -22,7 +22,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 233478 2012-03-25 20:37:59Z melifaro $
+ * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 234946 2012-05-03 08:56:43Z melifaro $
  */
 
 #ifndef _IPFW2_PRIVATE_H
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/ipfw/ip_fw_table.c
--- a/head/sys/netinet/ipfw/ip_fw_table.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/ipfw/ip_fw_table.c	Wed Jul 25 16:40:53 2012 +0300
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 233478 2012-03-25 20:37:59Z melifaro $");
+__FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 238265 2012-07-08 21:13:04Z melifaro $");
 
 /*
  * Lookup table support for ipfw
@@ -153,6 +153,9 @@
 	case IPFW_TABLE_CIDR:
 		if (plen == sizeof(in_addr_t)) {
 #ifdef INET
+			/* IPv4 case */
+			if (mlen > 32)
+				return (EINVAL);
 			ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
 			ent->value = value;
 			/* Set 'total' structure length */
@@ -341,9 +344,12 @@
 		struct xaddr_iface ifname, ifmask;
 		memset(&ifname, 0, sizeof(ifname));
 
+		/* Include last \0 into comparison */
+		mlen++;
+
 		/* Set 'total' structure length */
-		KEY_LEN(ifname) = mlen;
-		KEY_LEN(ifmask) = mlen;
+		KEY_LEN(ifname) = KEY_LEN_IFACE + mlen;
+		KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen;
 		/* Assume direct match */
 		/* FIXME: Add interface pattern matching */
 #if 0
@@ -565,7 +571,8 @@
 		break;
 
 	case IPFW_TABLE_INTERFACE:
-		KEY_LEN(iface) = strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE);
+		KEY_LEN(iface) = KEY_LEN_IFACE +
+		    strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1;
 		/* Assume direct match */
 		/* FIXME: Add interface pattern matching */
 		xent = (struct table_xentry *)(rnh->rnh_lookup(&iface, NULL, rnh));
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/libalias/alias_sctp.h
--- a/head/sys/netinet/libalias/alias_sctp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/libalias/alias_sctp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -45,7 +45,7 @@
  *
  */
 
-/* $FreeBSD: head/sys/netinet/libalias/alias_sctp.h 222809 2011-06-07 06:57:22Z ae $ */
+/* $FreeBSD: head/sys/netinet/libalias/alias_sctp.h 235644 2012-05-19 05:14:24Z marcel $ */
 
 #ifndef _ALIAS_SCTP_H_
 #define _ALIAS_SCTP_H_
@@ -92,7 +92,6 @@
 #ifndef _KERNEL
 #include <stdlib.h>
 #include <stdio.h>
-#include <curses.h>
 #endif //#ifdef _KERNEL
 
 
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/libalias/libalias.3
--- a/head/sys/netinet/libalias/libalias.3	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/libalias/libalias.3	Wed Jul 25 16:40:53 2012 +0300
@@ -23,9 +23,9 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: head/sys/netinet/libalias/libalias.3 223773 2011-07-04 23:00:26Z gjb $
+.\" $FreeBSD: head/sys/netinet/libalias/libalias.3 237015 2012-06-13 18:57:27Z joel $
 .\"
-.Dd July 04, 2011
+.Dd July 4, 2011
 .Dt LIBALIAS 3
 .Os
 .Sh NAME
@@ -201,11 +201,10 @@
 If this mode bit is set, traffic on the local network which does not
 originate from unregistered address spaces will be ignored.
 Standard Class A, B and C unregistered addresses are:
-.Bd -literal -offset indent
+.Pp
 10.0.0.0     ->  10.255.255.255   (Class A subnet)
 172.16.0.0   ->  172.31.255.255   (Class B subnets)
 192.168.0.0  ->  192.168.255.255  (Class C subnets)
-.Ed
 .Pp
 This option is useful in the case that the packet aliasing host has both
 registered and unregistered subnets on different interfaces.
@@ -499,14 +498,13 @@
 New traffic generated by any of the local machines, designated in the
 several function calls, will be aliased to the same address.
 Consider the following example:
-.Bd -literal -offset indent
+.Pp
 LibAliasRedirectAddr(la, inet_aton("192.168.0.2"),
                         inet_aton("141.221.254.101"));
 LibAliasRedirectAddr(la, inet_aton("192.168.0.3"),
                         inet_aton("141.221.254.101"));
 LibAliasRedirectAddr(la, inet_aton("192.168.0.4"),
                         inet_aton("141.221.254.101"));
-.Ed
 .Pp
 Any outgoing connections such as
 .Xr telnet 1
@@ -919,7 +917,7 @@
 .An Paolo Pisati Aq piso at FreeBSD.org
 made the library modular, moving support for all
 protocols (except for IP, TCP and UDP) to external modules.
-.Sh ACKNOWLEDGMENTS
+.Sh ACKNOWLEDGEMENTS
 Listed below, in approximate chronological order, are individuals who
 have provided valuable comments and/or debugging assistance.
 .Bd -ragged -offset indent
@@ -1277,10 +1275,10 @@
 .Ed
 .Bl -inset
 .It Va name
-is the name of the module
+is the name of the module.
 .It Va handle
 is a pointer to the module obtained through
-.Xr dlopen 3
+.Xr dlopen 3 .
 .El
 Whenever a module is loaded in userland, an entry is added to
 .Va dll_chain ,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp.h
--- a/head/sys/netinet/sctp.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -29,14 +29,14 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
-/* $KAME: sctp.h,v 1.18 2005/03/06 16:04:16 itojun Exp $	 */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp.h 233660 2012-03-29 13:36:53Z rrs $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp.h 235990 2012-05-25 11:14:08Z tuexen $");
 
 #ifndef _NETINET_SCTP_H_
 #define _NETINET_SCTP_H_
 
+
 #include <sys/types.h>
 
 
@@ -265,8 +265,6 @@
 #define SCTP_PEELOFF                    0x0000800a
 /* the real worker for sctp_getaddrlen() */
 #define SCTP_GET_ADDR_LEN               0x0000800b
-/* temporary workaround for Apple listen() issue, no args used */
-#define SCTP_LISTEN_FIX			0x0000800c
 /* Debug things that need to be purged */
 #define SCTP_SET_INITIAL_DBG_SEQ	0x00009f00
 
@@ -511,35 +509,38 @@
 /*
  * PCB Features (in sctp_features bitmask)
  */
-#define SCTP_PCB_FLAGS_DO_NOT_PMTUD     0x00000001
-#define SCTP_PCB_FLAGS_EXT_RCVINFO      0x00000002	/* deprecated */
-#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT  0x00000004
-#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE  0x00000008
-#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS	0x00000010
-#define SCTP_PCB_FLAGS_DO_ASCONF	0x00000020
-#define SCTP_PCB_FLAGS_AUTO_ASCONF	0x00000040
-#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE 0x00000080
+#define SCTP_PCB_FLAGS_DO_NOT_PMTUD      0x00000001
+#define SCTP_PCB_FLAGS_EXT_RCVINFO       0x00000002	/* deprecated */
+#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT   0x00000004
+#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE   0x00000008
+#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS  0x00000010
+#define SCTP_PCB_FLAGS_DO_ASCONF         0x00000020
+#define SCTP_PCB_FLAGS_AUTO_ASCONF       0x00000040
+#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE  0x00000080
 /* socket options */
-#define SCTP_PCB_FLAGS_NODELAY		0x00000100
-#define SCTP_PCB_FLAGS_AUTOCLOSE	0x00000200
-#define SCTP_PCB_FLAGS_RECVDATAIOEVNT	0x00000400	/* deprecated */
-#define SCTP_PCB_FLAGS_RECVASSOCEVNT	0x00000800
-#define SCTP_PCB_FLAGS_RECVPADDREVNT	0x00001000
-#define SCTP_PCB_FLAGS_RECVPEERERR	0x00002000
-#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT	0x00004000
-#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT	0x00008000
-#define SCTP_PCB_FLAGS_ADAPTATIONEVNT	0x00010000
-#define SCTP_PCB_FLAGS_PDAPIEVNT	0x00020000
-#define SCTP_PCB_FLAGS_AUTHEVNT		0x00040000
-#define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x00080000
-#define SCTP_PCB_FLAGS_NO_FRAGMENT	0x00100000
-#define SCTP_PCB_FLAGS_EXPLICIT_EOR     0x00400000
-#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4	0x00800000
-#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS	0x01000000
-#define SCTP_PCB_FLAGS_PORTREUSE        0x02000000
-#define SCTP_PCB_FLAGS_DRYEVNT          0x04000000
-#define SCTP_PCB_FLAGS_RECVRCVINFO      0x08000000
-#define SCTP_PCB_FLAGS_RECVNXTINFO      0x10000000
+#define SCTP_PCB_FLAGS_NODELAY           0x00000100
+#define SCTP_PCB_FLAGS_AUTOCLOSE         0x00000200
+#define SCTP_PCB_FLAGS_RECVDATAIOEVNT    0x00000400	/* deprecated */
+#define SCTP_PCB_FLAGS_RECVASSOCEVNT     0x00000800
+#define SCTP_PCB_FLAGS_RECVPADDREVNT     0x00001000
+#define SCTP_PCB_FLAGS_RECVPEERERR       0x00002000
+#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT  0x00004000	/* deprecated */
+#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT  0x00008000
+#define SCTP_PCB_FLAGS_ADAPTATIONEVNT    0x00010000
+#define SCTP_PCB_FLAGS_PDAPIEVNT         0x00020000
+#define SCTP_PCB_FLAGS_AUTHEVNT          0x00040000
+#define SCTP_PCB_FLAGS_STREAM_RESETEVNT  0x00080000
+#define SCTP_PCB_FLAGS_NO_FRAGMENT       0x00100000
+#define SCTP_PCB_FLAGS_EXPLICIT_EOR      0x00400000
+#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4   0x00800000
+#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS  0x01000000
+#define SCTP_PCB_FLAGS_PORTREUSE         0x02000000
+#define SCTP_PCB_FLAGS_DRYEVNT           0x04000000
+#define SCTP_PCB_FLAGS_RECVRCVINFO       0x08000000
+#define SCTP_PCB_FLAGS_RECVNXTINFO       0x10000000
+#define SCTP_PCB_FLAGS_ASSOC_RESETEVNT   0x20000000
+#define SCTP_PCB_FLAGS_STREAM_CHANGEEVNT 0x40000000
+#define SCTP_PCB_FLAGS_RECVNSENDFAILEVNT 0x80000000
 
 /*-
  * mobility_features parameters (by micchie).Note
@@ -547,14 +548,16 @@
  * sctp_mobility_features flags.. not the sctp_features
  * flags.
  */
-#define SCTP_MOBILITY_BASE		0x00000001
-#define SCTP_MOBILITY_FASTHANDOFF	0x00000002
-#define SCTP_MOBILITY_PRIM_DELETED	0x00000004
+#define SCTP_MOBILITY_BASE               0x00000001
+#define SCTP_MOBILITY_FASTHANDOFF        0x00000002
+#define SCTP_MOBILITY_PRIM_DELETED       0x00000004
 
 
 #define SCTP_SMALLEST_PMTU 512	/* smallest pmtu allowed when disabling PMTU
 				 * discovery */
 
+#undef SCTP_PACKED
+
 #include <netinet/sctp_uio.h>
 
 /* This dictates the size of the packet
@@ -606,7 +609,4 @@
 #define SCTP_LOG_AT_SEND_2_OUTQ				0x08000000
 #define SCTP_LOG_TRY_ADVANCE				0x10000000
 
-
-#undef SCTP_PACKED
-
 #endif				/* !_NETINET_SCTP_H_ */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_asconf.c
--- a/head/sys/netinet/sctp_asconf.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_asconf.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,9 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_asconf.c,v 1.24 2005/03/06 16:04:16 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.c 238501 2012-07-15 20:16:17Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.c 228907 2011-12-27 10:16:24Z tuexen $");
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
@@ -49,63 +48,10 @@
  * SCTP_DEBUG_ASCONF1: protocol info, general info and errors
  * SCTP_DEBUG_ASCONF2: detailed info
  */
-#ifdef SCTP_DEBUG
-#endif				/* SCTP_DEBUG */
 
 
-static void
-sctp_asconf_get_source_ip(struct mbuf *m, struct sockaddr *sa)
-{
-	struct ip *iph;
-
-#ifdef INET
-	struct sockaddr_in *sin;
-
-#endif
-#ifdef INET6
-	struct sockaddr_in6 *sin6;
-
-#endif
-
-	iph = mtod(m, struct ip *);
-	switch (iph->ip_v) {
-#ifdef INET
-	case IPVERSION:
-		{
-			/* IPv4 source */
-			sin = (struct sockaddr_in *)sa;
-			bzero(sin, sizeof(*sin));
-			sin->sin_family = AF_INET;
-			sin->sin_len = sizeof(struct sockaddr_in);
-			sin->sin_port = 0;
-			sin->sin_addr.s_addr = iph->ip_src.s_addr;
-			break;
-		}
-#endif
-#ifdef INET6
-	case (IPV6_VERSION >> 4):
-		{
-			/* IPv6 source */
-			struct ip6_hdr *ip6;
-
-			sin6 = (struct sockaddr_in6 *)sa;
-			bzero(sin6, sizeof(*sin6));
-			sin6->sin6_family = AF_INET6;
-			sin6->sin6_len = sizeof(struct sockaddr_in6);
-			sin6->sin6_port = 0;
-			ip6 = mtod(m, struct ip6_hdr *);
-			sin6->sin6_addr = ip6->ip6_src;
-			break;
-		}
-#endif				/* INET6 */
-	default:
-		break;
-	}
-	return;
-}
-
 /*
- * draft-ietf-tsvwg-addip-sctp
+ * RFC 5061
  *
  * An ASCONF parameter queue exists per asoc which holds the pending address
  * operations.  Lists are updated upon receipt of ASCONF-ACK.
@@ -197,12 +143,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_add_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int send_hb, int response_required)
 {
 	struct sctp_nets *net;
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -282,11 +228,10 @@
 
 	/* if 0.0.0.0/::0, add the source address instead */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		sa = (struct sockaddr *)&sa_source;
-		sctp_asconf_get_source_ip(m, sa);
+		sa = src;
 		SCTPDBG(SCTP_DEBUG_ASCONF1,
 		    "process_asconf_add_ip: using source addr ");
-		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
 	}
 	/* add the address */
 	if (bad_address) {
@@ -346,11 +291,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_delete_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph,
+sctp_process_asconf_delete_ip(struct sockaddr *src,
+    struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int response_required)
 {
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -368,9 +314,6 @@
 
 #endif
 
-	/* get the source IP address for src and 0.0.0.0/::0 delete checks */
-	sctp_asconf_get_source_ip(m, (struct sockaddr *)&sa_source);
-
 	aparam_length = ntohs(aph->ph.param_length);
 	ph = (struct sctp_paramhdr *)(aph + 1);
 	param_type = ntohs(ph->param_type);
@@ -427,7 +370,7 @@
 	}
 
 	/* make sure the source address is not being deleted */
-	if (sctp_cmpaddr(sa, (struct sockaddr *)&sa_source)) {
+	if (sctp_cmpaddr(sa, src)) {
 		/* trying to delete the source address! */
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n");
 		m_reply = sctp_asconf_error_response(aph->correlation_id,
@@ -437,8 +380,7 @@
 	}
 	/* if deleting 0.0.0.0/::0, delete all addresses except src addr */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		result = sctp_asconf_del_remote_addrs_except(stcb,
-		    (struct sockaddr *)&sa_source);
+		result = sctp_asconf_del_remote_addrs_except(stcb, src);
 
 		if (result) {
 			/* src address did not exist? */
@@ -478,12 +420,12 @@
 }
 
 static struct mbuf *
-sctp_process_asconf_set_primary(struct mbuf *m,
+sctp_process_asconf_set_primary(struct sockaddr *src,
     struct sctp_asconf_paramhdr *aph,
     struct sctp_tcb *stcb, int response_required)
 {
 	struct mbuf *m_reply = NULL;
-	struct sockaddr_storage sa_source, sa_store;
+	struct sockaddr_storage sa_store;
 	struct sctp_paramhdr *ph;
 	uint16_t param_type, param_length, aparam_length;
 	struct sockaddr *sa;
@@ -553,11 +495,10 @@
 
 	/* if 0.0.0.0/::0, use the source address instead */
 	if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) {
-		sa = (struct sockaddr *)&sa_source;
-		sctp_asconf_get_source_ip(m, sa);
+		sa = src;
 		SCTPDBG(SCTP_DEBUG_ASCONF1,
 		    "process_asconf_set_primary: using source addr ");
-		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa);
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
 	}
 	/* set the primary address */
 	if (sctp_set_primary_addr(stcb, sa, NULL) == 0) {
@@ -629,6 +570,7 @@
  */
 void
 sctp_handle_asconf(struct mbuf *m, unsigned int offset,
+    struct sockaddr *src,
     struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb,
     int first)
 {
@@ -765,13 +707,13 @@
 		switch (param_type) {
 		case SCTP_ADD_IP_ADDRESS:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_add_ip(m, aph, stcb,
+			m_result = sctp_process_asconf_add_ip(src, aph, stcb,
 			    (cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error);
 			cnt++;
 			break;
 		case SCTP_DEL_IP_ADDRESS:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_delete_ip(m, aph, stcb,
+			m_result = sctp_process_asconf_delete_ip(src, aph, stcb,
 			    error);
 			break;
 		case SCTP_ERROR_CAUSE_IND:
@@ -779,7 +721,7 @@
 			break;
 		case SCTP_SET_PRIM_ADDR:
 			asoc->peer_supports_asconf = 1;
-			m_result = sctp_process_asconf_set_primary(m, aph,
+			m_result = sctp_process_asconf_set_primary(src, aph,
 			    stcb, error);
 			break;
 		case SCTP_NAT_VTAGS:
@@ -859,70 +801,16 @@
 		 * this could happen if the source address was just newly
 		 * added
 		 */
-		struct ip *iph;
-		struct sctphdr *sh;
-		struct sockaddr_storage from_store;
-		struct sockaddr *from = (struct sockaddr *)&from_store;
-
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n");
-		/* pullup already done, IP options already stripped */
-		iph = mtod(m, struct ip *);
-		switch (iph->ip_v) {
-#ifdef INET
-		case IPVERSION:
-			{
-				struct sockaddr_in *from4;
-
-				sh = (struct sctphdr *)((caddr_t)iph + sizeof(*iph));
-				from4 = (struct sockaddr_in *)&from_store;
-				bzero(from4, sizeof(*from4));
-				from4->sin_family = AF_INET;
-				from4->sin_len = sizeof(struct sockaddr_in);
-				from4->sin_addr.s_addr = iph->ip_src.s_addr;
-				from4->sin_port = sh->src_port;
-				break;
-			}
+		SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: ");
+		SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, src);
+		/* look up the from address */
+		stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, src);
+#ifdef SCTP_DEBUG
+		if (stcb->asoc.last_control_chunk_from == NULL) {
+			SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n");
+		}
 #endif
-#ifdef INET6
-		case IPV6_VERSION >> 4:
-			{
-				struct ip6_hdr *ip6;
-				struct sockaddr_in6 *from6;
-
-				ip6 = mtod(m, struct ip6_hdr *);
-				sh = (struct sctphdr *)((caddr_t)ip6 + sizeof(*ip6));
-				from6 = (struct sockaddr_in6 *)&from_store;
-				bzero(from6, sizeof(*from6));
-				from6->sin6_family = AF_INET6;
-				from6->sin6_len = sizeof(struct sockaddr_in6);
-				from6->sin6_addr = ip6->ip6_src;
-				from6->sin6_port = sh->src_port;
-				/*
-				 * Get the scopes in properly to the sin6
-				 * addr's
-				 */
-				/* we probably don't need these operations */
-				(void)sa6_recoverscope(from6);
-				sa6_embedscope(from6,
-				    MODULE_GLOBAL(ip6_use_defzone));
-
-				break;
-			}
-#endif
-		default:
-			/* unknown address type */
-			from = NULL;
-		}
-		if (from != NULL) {
-			SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: ");
-			SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, from);
-			/* look up the from address */
-			stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, from);
-#ifdef SCTP_DEBUG
-			if (stcb->asoc.last_control_chunk_from == NULL)
-				SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n");
-#endif
-		}
 	}
 }
 
@@ -1789,8 +1677,7 @@
 	 */
 	if (serial_num == (asoc->asconf_seq_out + 1)) {
 		SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n");
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_CAUSE_ILLEGAL_ASCONF_ACK, NULL, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, SCTP_SO_NOT_LOCKED);
 		*abort_no_unlock = 1;
 		return;
 	}
@@ -2860,13 +2747,14 @@
 	struct sctp_paramhdr tmp_param, *ph;
 	uint16_t plen, ptype;
 	struct sctp_ifa *sctp_ifa;
-	struct sctp_ipv6addr_param addr_store;
 
 #ifdef INET6
+	struct sctp_ipv6addr_param addr6_store;
 	struct sockaddr_in6 sin6;
 
 #endif
 #ifdef INET
+	struct sctp_ipv4addr_param addr4_store;
 	struct sockaddr_in sin;
 
 #endif
@@ -2915,7 +2803,7 @@
 				a6p = (struct sctp_ipv6addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv6addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr6_store);
 				if (plen != sizeof(struct sctp_ipv6addr_param) ||
 				    a6p == NULL) {
 					return;
@@ -2934,7 +2822,7 @@
 				/* get the entire IPv4 address param */
 				a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv4addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr4_store);
 				if (plen != sizeof(struct sctp_ipv4addr_param) ||
 				    a4p == NULL) {
 					return;
@@ -3012,16 +2900,17 @@
 {
 	struct sctp_paramhdr tmp_param, *ph;
 	uint16_t plen, ptype;
-	struct sctp_ipv6addr_param addr_store;
 
 #ifdef INET
 	struct sockaddr_in *sin;
 	struct sctp_ipv4addr_param *a4p;
+	struct sctp_ipv6addr_param addr4_store;
 
 #endif
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	struct sctp_ipv6addr_param *a6p;
+	struct sctp_ipv6addr_param addr6_store;
 	struct sockaddr_in6 sin6_tmp;
 
 #endif
@@ -3067,7 +2956,7 @@
 				a6p = (struct sctp_ipv6addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv6addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr6_store);
 				if (a6p == NULL) {
 					return (0);
 				}
@@ -3097,7 +2986,7 @@
 				a4p = (struct sctp_ipv4addr_param *)
 				    sctp_m_getptr(m, offset,
 				    sizeof(struct sctp_ipv4addr_param),
-				    (uint8_t *) & addr_store);
+				    (uint8_t *) & addr4_store);
 				if (a4p == NULL) {
 					return (0);
 				}
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_asconf.h
--- a/head/sys/netinet/sctp_asconf.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_asconf.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_asconf.h,v 1.8 2005/03/06 16:04:16 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_asconf.h 237715 2012-06-28 16:01:08Z tuexen $");
 
 #ifndef _NETINET_SCTP_ASCONF_H_
 #define _NETINET_SCTP_ASCONF_H_
@@ -48,8 +46,8 @@
 extern struct mbuf *sctp_compose_asconf(struct sctp_tcb *, int *, int);
 
 extern void
-sctp_handle_asconf(struct mbuf *, unsigned int, struct sctp_asconf_chunk *,
-    struct sctp_tcb *, int i);
+sctp_handle_asconf(struct mbuf *, unsigned int, struct sockaddr *,
+    struct sctp_asconf_chunk *, struct sctp_tcb *, int);
 
 extern void
 sctp_handle_asconf_ack(struct mbuf *, int, struct sctp_asconf_ack_chunk *,
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_auth.c
--- a/head/sys/netinet/sctp_auth.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_auth.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.c 228907 2011-12-27 10:16:24Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.c 235828 2012-05-23 11:26:28Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp.h>
@@ -284,16 +284,16 @@
 	uint32_t i;
 
 	if (key == NULL) {
-		printf("%s: [Null key]\n", str);
+		SCTP_PRINTF("%s: [Null key]\n", str);
 		return;
 	}
-	printf("%s: len %u, ", str, key->keylen);
+	SCTP_PRINTF("%s: len %u, ", str, key->keylen);
 	if (key->keylen) {
 		for (i = 0; i < key->keylen; i++)
-			printf("%02x", key->key[i]);
-		printf("\n");
+			SCTP_PRINTF("%02x", key->key[i]);
+		SCTP_PRINTF("\n");
 	} else {
-		printf("[Null key]\n");
+		SCTP_PRINTF("[Null key]\n");
 	}
 }
 
@@ -303,16 +303,16 @@
 	uint32_t i;
 
 	if (key == NULL) {
-		printf("%s: [Null key]\n", str);
+		SCTP_PRINTF("%s: [Null key]\n", str);
 		return;
 	}
-	printf("%s: len %u, ", str, key->keylen);
+	SCTP_PRINTF("%s: len %u, ", str, key->keylen);
 	if (key->keylen) {
 		for (i = 0; i < key->keylen; i++)
-			printf("%02x", key->key[i]);
-		printf("\n");
+			SCTP_PRINTF("%02x", key->key[i]);
+		SCTP_PRINTF("\n");
 	} else {
-		printf("[Null key]\n");
+		SCTP_PRINTF("[Null key]\n");
 	}
 }
 
@@ -1801,7 +1801,7 @@
 			 * shared_key_id, (void
 			 * *)stcb->asoc.authinfo.recv_keyid);
 			 */
-			sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY,
+			sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY,
 			    shared_key_id, stcb->asoc.authinfo.recv_keyid,
 			    SCTP_SO_NOT_LOCKED);
 		/* compute a new recv assoc key and cache it */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_auth.h
--- a/head/sys/netinet/sctp_auth.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_auth.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,10 +31,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_auth.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#ifndef __SCTP_AUTH_H__
-#define __SCTP_AUTH_H__
+#ifndef _NETINET_SCTP_AUTH_H_
+#define _NETINET_SCTP_AUTH_H_
 
 
 /* digest lengths */
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_bsd_addr.c
--- a/head/sys/netinet/sctp_bsd_addr.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_bsd_addr.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.c 232866 2012-03-12 15:05:17Z rrs $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.c 237540 2012-06-24 21:25:54Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
@@ -424,11 +422,12 @@
 
 #ifdef SCTP_PACKET_LOGGING
 void
-sctp_packet_log(struct mbuf *m, int length)
+sctp_packet_log(struct mbuf *m)
 {
 	int *lenat, thisone;
 	void *copyto;
 	uint32_t *tick_tock;
+	int length;
 	int total_len;
 	int grabbed_lock = 0;
 	int value, newval, thisend, thisbegin;
@@ -438,6 +437,7 @@
 	 * (value) -ticks of log      (ticks) o -ip packet o -as logged -
 	 * where this started (thisbegin) x <--end points here
 	 */
+	length = SCTP_HEADER_LEN(m);
 	total_len = SCTP_SIZE32((length + (4 * sizeof(int))));
 	/* Log a packet to the buffer. */
 	if (total_len > SCTP_PACKET_LOG_SIZE) {
@@ -483,7 +483,7 @@
 	}
 	/* Sanity check */
 	if (thisend >= SCTP_PACKET_LOG_SIZE) {
-		printf("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n",
+		SCTP_PRINTF("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n",
 		    thisbegin,
 		    thisend,
 		    SCTP_BASE_VAR(packet_log_writers),
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_bsd_addr.h
--- a/head/sys/netinet/sctp_bsd_addr.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_bsd_addr.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,10 +31,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.h 228653 2011-12-17 19:21:40Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_bsd_addr.h 237540 2012-06-24 21:25:54Z tuexen $");
 
-#ifndef __sctp_bsd_addr_h__
-#define __sctp_bsd_addr_h__
+#ifndef _NETINET_SCTP_BSD_ADDR_H_
+#define _NETINET_SCTP_BSD_ADDR_H_
+
 #include <netinet/sctp_pcb.h>
 
 #if defined(_KERNEL) || defined(__Userspace__)
@@ -52,7 +53,7 @@
 
 #ifdef  SCTP_PACKET_LOGGING
 
-void sctp_packet_log(struct mbuf *m, int length);
+void sctp_packet_log(struct mbuf *m);
 int sctp_copy_out_packet_log(uint8_t * target, int length);
 
 #endif
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_cc_functions.c
--- a/head/sys/netinet/sctp_cc_functions.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_cc_functions.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,6 +30,9 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_cc_functions.c 235828 2012-05-23 11:26:28Z tuexen $");
+
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_sysctl.h>
@@ -44,8 +47,6 @@
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctp_dtrace_declare.h>
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_cc_functions.c 228907 2011-12-27 10:16:24Z tuexen $");
 
 #define SHIFT_MPTCP_MULTI_N 40
 #define SHIFT_MPTCP_MULTI_Z 16
@@ -1594,9 +1595,7 @@
 
 	cur_val = net->cwnd >> 10;
 	indx = SCTP_HS_TABLE_SIZE - 1;
-#ifdef SCTP_DEBUG
-	printf("HS CC CAlled.\n");
-#endif
+
 	if (cur_val < sctp_cwnd_adjust[0].cwnd) {
 		/* normal mode */
 		if (net->net_ack > net->mtu) {
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_constants.h
--- a/head/sys/netinet/sctp_constants.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_constants.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_constants.h,v 1.17 2005/03/06 16:04:17 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_constants.h 233660 2012-03-29 13:36:53Z rrs $");
-
-#ifndef __sctp_constants_h__
-#define __sctp_constants_h__
+#ifndef _NETINET_SCTP_CONSTANTS_H_
+#define _NETINET_SCTP_CONSTANTS_H_
 
 /* IANA assigned port number for SCTP over UDP encapsulation */
 /* For freebsd we cannot bind the port at
@@ -348,7 +346,7 @@
 #define SCTP_NO_FR_UNLESS_SEGMENT_SMALLER 1
 
 /* default max I can burst out after a fast retransmit, 0 disables it */
-#define SCTP_DEF_MAX_BURST 0
+#define SCTP_DEF_MAX_BURST 4
 #define SCTP_DEF_HBMAX_BURST 4
 #define SCTP_DEF_FRMAX_BURST 4
 
@@ -460,18 +458,6 @@
 #define SCTP_HAS_NAT_SUPPORT            0xc007
 #define SCTP_NAT_VTAGS                  0xc008
 
-/* Notification error codes */
-#define SCTP_NOTIFY_DATAGRAM_UNSENT	0x0001
-#define SCTP_NOTIFY_DATAGRAM_SENT	0x0002
-#define SCTP_FAILED_THRESHOLD		0x0004
-#define SCTP_HEARTBEAT_SUCCESS		0x0008
-#define SCTP_RESPONSE_TO_USER_REQ	0x0010
-#define SCTP_INTERNAL_ERROR		0x0020
-#define SCTP_SHUTDOWN_GUARD_EXPIRES	0x0040
-#define SCTP_RECEIVED_SACK		0x0080
-#define SCTP_PEER_FAULTY		0x0100
-#define SCTP_ICMP_REFUSED		0x0200
-
 /* bits for TOS field */
 #define SCTP_ECT0_BIT		0x02
 #define SCTP_ECT1_BIT		0x01
@@ -755,35 +741,29 @@
 #define SCTP_NOTIFY_ASSOC_DOWN                   2
 #define SCTP_NOTIFY_INTERFACE_DOWN               3
 #define SCTP_NOTIFY_INTERFACE_UP                 4
-#define SCTP_NOTIFY_DG_FAIL                      5
-#define SCTP_NOTIFY_STRDATA_ERR                  6
-#define SCTP_NOTIFY_ASSOC_ABORTED                7
-#define SCTP_NOTIFY_PEER_OPENED_STREAM           8
-#define SCTP_NOTIFY_STREAM_OPENED_OK             9
+#define SCTP_NOTIFY_SENT_DG_FAIL                 5
+#define SCTP_NOTIFY_UNSENT_DG_FAIL               6
+#define SCTP_NOTIFY_SPECIAL_SP_FAIL              7
+#define SCTP_NOTIFY_ASSOC_LOC_ABORTED            8
+#define SCTP_NOTIFY_ASSOC_REM_ABORTED            9
 #define SCTP_NOTIFY_ASSOC_RESTART               10
-#define SCTP_NOTIFY_HB_RESP                     11
-#define SCTP_NOTIFY_ASCONF_SUCCESS              12
-#define SCTP_NOTIFY_ASCONF_FAILED               13
-#define SCTP_NOTIFY_PEER_SHUTDOWN               14
-#define SCTP_NOTIFY_ASCONF_ADD_IP               15
-#define SCTP_NOTIFY_ASCONF_DELETE_IP            16
-#define SCTP_NOTIFY_ASCONF_SET_PRIMARY          17
-#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 18
-#define SCTP_NOTIFY_INTERFACE_CONFIRMED         20
-#define SCTP_NOTIFY_STR_RESET_RECV              21
-#define SCTP_NOTIFY_STR_RESET_SEND              22
-#define SCTP_NOTIFY_STR_RESET_FAILED_OUT        23
-#define SCTP_NOTIFY_STR_RESET_FAILED_IN         24
-#define SCTP_NOTIFY_AUTH_NEW_KEY                25
-#define SCTP_NOTIFY_AUTH_FREE_KEY               26
-#define SCTP_NOTIFY_SPECIAL_SP_FAIL             27
-#define SCTP_NOTIFY_NO_PEER_AUTH                28
-#define SCTP_NOTIFY_SENDER_DRY                  29
-#define SCTP_NOTIFY_STR_RESET_ADD_OK            30
-#define SCTP_NOTIFY_STR_RESET_ADD_FAIL          31
-#define SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK   32
-#define SCTP_NOTIFY_MAX                         32
-
+#define SCTP_NOTIFY_PEER_SHUTDOWN               11
+#define SCTP_NOTIFY_ASCONF_ADD_IP               12
+#define SCTP_NOTIFY_ASCONF_DELETE_IP            13
+#define SCTP_NOTIFY_ASCONF_SET_PRIMARY          14
+#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 15
+#define SCTP_NOTIFY_INTERFACE_CONFIRMED         16
+#define SCTP_NOTIFY_STR_RESET_RECV              17
+#define SCTP_NOTIFY_STR_RESET_SEND              18
+#define SCTP_NOTIFY_STR_RESET_FAILED_OUT        19
+#define SCTP_NOTIFY_STR_RESET_FAILED_IN         20
+#define SCTP_NOTIFY_STR_RESET_DENIED_OUT        21
+#define SCTP_NOTIFY_STR_RESET_DENIED_IN         22
+#define SCTP_NOTIFY_AUTH_NEW_KEY                23
+#define SCTP_NOTIFY_AUTH_FREE_KEY               24
+#define SCTP_NOTIFY_NO_PEER_AUTH                25
+#define SCTP_NOTIFY_SENDER_DRY                  26
+#define SCTP_NOTIFY_REMOTE_ERROR                27
 
 /* This is the value for messages that are NOT completely
  * copied down where we will start to split the message.
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_crc32.c
--- a/head/sys/netinet/sctp_crc32.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_crc32.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,11 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $	 */
-
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.c 235828 2012-05-23 11:26:28Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp.h>
@@ -124,7 +121,9 @@
 sctp_delayed_cksum(struct mbuf *m, uint32_t offset)
 {
 #if defined(SCTP_WITH_NO_CSUM)
+#ifdef INVARIANTS
 	panic("sctp_delayed_cksum() called when using no SCTP CRC.");
+#endif
 #else
 	uint32_t checksum;
 
@@ -134,7 +133,7 @@
 	offset += offsetof(struct sctphdr, checksum);
 
 	if (offset + sizeof(uint32_t) > (uint32_t) (m->m_len)) {
-		printf("sctp_delayed_cksum(): m->len: %d,  off: %d.\n",
+		SCTP_PRINTF("sctp_delayed_cksum(): m->len: %d,  off: %d.\n",
 		    (uint32_t) m->m_len, offset);
 		/*
 		 * XXX this shouldn't happen, but if it does, the correct
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_crc32.h
--- a/head/sys/netinet/sctp_crc32.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_crc32.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_crc32.h,v 1.5 2004/08/17 04:06:16 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_crc32.h 228653 2011-12-17 19:21:40Z tuexen $");
-
-#ifndef __crc32c_h__
-#define __crc32c_h__
+#ifndef _NETINET_SCTP_CRC32_H_
+#define _NETINET_SCTP_CRC32_H_
 
 #if defined(_KERNEL)
 #if !defined(SCTP_WITH_NO_CSUM)
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_dtrace_declare.h
--- a/head/sys/netinet/sctp_dtrace_declare.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_dtrace_declare.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,6 @@
 /*-
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,9 +28,13 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_declare.h 228653 2011-12-17 19:21:40Z tuexen $");
-#ifndef __sctp_dtrace_declare_h__
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_declare.h 235828 2012-05-23 11:26:28Z tuexen $");
+
+#ifndef _NETINET_SCTP_DTRACE_DECLARE_H_
+#define _NETINET_SCTP_DTRACE_DECLARE_H_
+
 #include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_dtrace_define.h
--- a/head/sys/netinet/sctp_dtrace_define.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_dtrace_define.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,6 +1,6 @@
 /*-
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,9 +28,13 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_define.h 228653 2011-12-17 19:21:40Z tuexen $");
-#ifndef __sctp_dtrace_define_h__
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_dtrace_define.h 235828 2012-05-23 11:26:28Z tuexen $");
+
+#ifndef _NETINET_SCTP_DTRACE_DEFINE_H_
+#define _NETINET_SCTP_DTRACE_DEFINE_H_
+
 #include "opt_kdtrace.h"
 #include <sys/kernel.h>
 #include <sys/sdt.h>
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_header.h
--- a/head/sys/netinet/sctp_header.h	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_header.h	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,13 +30,11 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_header.h,v 1.14 2005/03/06 16:04:17 itojun Exp $	 */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_header.h 235828 2012-05-23 11:26:28Z tuexen $");
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_header.h 233660 2012-03-29 13:36:53Z rrs $");
-
-#ifndef __sctp_header_h__
-#define __sctp_header_h__
+#ifndef _NETINET_SCTP_HEADER_H_
+#define _NETINET_SCTP_HEADER_H_
 
 #include <sys/time.h>
 #include <netinet/sctp.h>
@@ -499,12 +497,13 @@
 	uint16_t reserved;
 }                          SCTP_PACKED;
 
-#define SCTP_STREAM_RESET_NOTHING   0x00000000	/* Nothing for me to do */
-#define SCTP_STREAM_RESET_PERFORMED 0x00000001	/* Did it */
-#define SCTP_STREAM_RESET_REJECT    0x00000002	/* refused to do it */
-#define SCTP_STREAM_RESET_ERROR_STR 0x00000003	/* bad Stream no */
-#define SCTP_STREAM_RESET_TRY_LATER 0x00000004	/* collision, try again */
-#define SCTP_STREAM_RESET_BAD_SEQNO 0x00000005	/* bad str-reset seq no */
+#define SCTP_STREAM_RESET_RESULT_NOTHING_TO_DO   0x00000000	/* XXX: unused */
+#define SCTP_STREAM_RESET_RESULT_PERFORMED       0x00000001
+#define SCTP_STREAM_RESET_RESULT_DENIED          0x00000002
+#define SCTP_STREAM_RESET_RESULT_ERR__WRONG_SSN  0x00000003	/* XXX: unused */
+#define SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS 0x00000004
+#define SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO   0x00000005
+#define SCTP_STREAM_RESET_RESULT_IN_PROGRESS     0x00000006	/* XXX: unused */
 
 /*
  * convience structures, note that if you are making a request for specific
diff -r 58bcef12a717 -r fc630f3c8529 head/sys/netinet/sctp_indata.c
--- a/head/sys/netinet/sctp_indata.c	Wed Jul 25 16:32:50 2012 +0300
+++ b/head/sys/netinet/sctp_indata.c	Wed Jul 25 16:40:53 2012 +0300
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
- * Copyright (c) 2008-2011, by Randall Stewart. All rights reserved.
- * Copyright (c) 2008-2011, by Michael Tuexen. All rights reserved.
+ * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
+ * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -30,10 +30,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/* $KAME: sctp_indata.c,v 1.36 2005/03/06 16:04:17 itojun Exp $	 */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: head/sys/netinet/sctp_indata.c 234459 2012-04-19 12:43:19Z tuexen $");
+__FBSDID("$FreeBSD: head/sys/netinet/sctp_indata.c 237715 2012-06-28 16:01:08Z tuexen $");
 
 #include <netinet/sctp_os.h>
 #include <netinet/sctp_var.h>
@@ -328,7 +326,7 @@
 	}
 	SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn);
 	if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) {
-		printf("gap:%x tsn:%x\n", gap, tsn);
+		SCTP_PRINTF("gap:%x tsn:%x\n", gap, tsn);
 		sctp_print_mapping_array(asoc);
 #ifdef INVARIANTS
 		panic("Things are really messed up now!!");
@@ -607,9 +605,7 @@
 			*ippp = ((control->sinfo_stream << 16) | control->sinfo_ssn);
 		}
 		stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+		sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return;
 
@@ -892,8 +888,7 @@
 
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 			} else if (asoc->fragmented_delivery_inprogress &&
 			    (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) {
@@ -924,8 +919,7 @@
 					*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 			} else if (asoc->fragmented_delivery_inprogress) {
 				/*
@@ -961,8 +955,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 				} else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) !=
 					    SCTP_DATA_UNORDERED &&
@@ -995,8 +988,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 				}
 			}
@@ -1090,8 +1082,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1127,9 +1118,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1166,9 +1155,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1202,9 +1189,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1247,9 +1232,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1289,9 +1272,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1328,9 +1309,7 @@
 
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1367,9 +1346,7 @@
 						*ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return;
 				}
@@ -1531,7 +1508,7 @@
 		struct mbuf *op_err;
 
 		op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC);
-		sctp_abort_an_association(stcb->sctp_ep, stcb, 0, op_err, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return (0);
 	}
@@ -1552,7 +1529,7 @@
 		 */
 		if (stcb->sctp_socket->so_rcv.sb_cc) {
 			/* some to read, wake-up */
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 			struct socket *so;
 
 			so = SCTP_INP_SO(stcb->sctp_ep);
@@ -1568,7 +1545,7 @@
 			}
 #endif
 			sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket);
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 			SCTP_SOCKET_UNLOCK(so, 1);
 #endif
 		}
@@ -1678,8 +1655,7 @@
 
 		}
 		stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14;
-		sctp_abort_an_association(stcb->sctp_ep, stcb,
-		    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+		sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 		*abort_flag = 1;
 		return (0);
 	}
@@ -1942,9 +1918,7 @@
 					*ippp = ((strmno << 16) | strmseq);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
-				sctp_abort_an_association(stcb->sctp_ep, stcb,
-				    SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+				sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 				*abort_flag = 1;
 				return (0);
 			} else {
@@ -1980,9 +1954,7 @@
 						*ippp = ((strmno << 16) | strmseq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return (0);
 				}
@@ -2027,9 +1999,7 @@
 						*ippp = ((strmno << 16) | strmseq);
 					}
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17;
-					sctp_abort_an_association(stcb->sctp_ep,
-					    stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
-
+					sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 					*abort_flag = 1;
 					return (0);
 				}
@@ -2308,7 +2278,7 @@
 #ifdef INVARIANTS
 		for (i = 0; i < asoc->mapping_array_size; i++) {
 			if ((asoc->mapping_array[i]) || (asoc->nr_mapping_array[i])) {
-				printf("Error Mapping array's not clean at clear\n");
+				SCTP_PRINTF("Error Mapping array's not clean at clear\n");
 				sctp_print_mapping_array(asoc);
 			}
 		}
@@ -2330,7 +2300,7 @@
 #ifdef INVARIANTS
 			panic("impossible slide");
 #else
-			printf("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
+			SCTP_PRINTF("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n",
 			    lgap, slide_end, slide_from, at);
 			return;
 #endif
@@ -2339,7 +2309,7 @@
 #ifdef INVARIANTS
 			panic("would overrun buffer");
 #else
-			printf("Gak, would have overrun map end:%d slide_end:%d\n",
+			SCTP_PRINTF("Gak, would have overrun map end:%d slide_end:%d\n",
 			    asoc->mapping_array_size, slide_end);
 			slide_end = asoc->mapping_array_size;
 #endif
@@ -2546,8 +2516,11 @@
 
 int
 sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
-    struct sctphdr *sh, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
-    struct sctp_nets *net, uint32_t * high_tsn)
+    struct sockaddr *src, struct sockaddr *dst,
+    struct sctphdr *sh, struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t * high_tsn,
+    uint8_t use_mflowid, uint32_t mflowid,
+    uint32_t vrf_id, uint16_t port)
 {
 	struct sctp_data_chunk *ch, chunk_buf;
 	struct sctp_association *asoc;
@@ -2654,8 +2627,10 @@
 
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19;
-				sctp_abort_association(inp, stcb, m, iphlen, sh,
-				    op_err, 0, net->port);
+				sctp_abort_association(inp, stcb, m, iphlen,
+				    src, dst, sh, op_err,
+				    use_mflowid, mflowid,
+				    vrf_id, port);
 				return (2);
 			}
 #ifdef SCTP_AUDITING_ENABLED
@@ -2719,7 +2694,12 @@
 					struct mbuf *op_err;
 
 					op_err = sctp_generate_invmanparam(SCTP_CAUSE_PROTOCOL_VIOLATION);
-					sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, 0, net->port);
+					sctp_abort_association(inp, stcb,
+					    m, iphlen,
+					    src, dst,
+					    sh, op_err,
+					    use_mflowid, mflowid,
+					    vrf_id, port);
 					return (2);
 				}
 				break;
@@ -2784,7 +2764,7 @@
 		/*
 		 * we need to report rwnd overrun drops.
 		 */
-		sctp_send_packet_dropped(stcb, net, *mm, iphlen, 0);
+		sctp_send_packet_dropped(stcb, net, *mm, length, iphlen, 0);
 	}
 	if (num_chunks) {
 		/*
@@ -3222,8 +3202,7 @@
 				if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
 					/* Yes so drop it */
 					if (tp1->data != NULL) {
-						(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+						(void)sctp_release_pr_sctp_chunk(stcb, tp1, 1,
 						    SCTP_SO_NOT_LOCKED);
 					}
 					continue;
@@ -3480,8 +3459,7 @@
 				if (tp1->snd_count > tp1->rec.data.timetodrop.tv_sec) {
 					/* Yes, so drop it */
 					if (tp1->data != NULL) {
-						(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+						(void)sctp_release_pr_sctp_chunk(stcb, tp1, 1,
 						    SCTP_SO_NOT_LOCKED);
 					}
 					/* Make sure to flag we had a FR */
@@ -3489,7 +3467,10 @@
 					continue;
 				}
 			}
-			/* printf("OK, we are now ready to FR this guy\n"); */
+			/*
+			 * SCTP_PRINTF("OK, we are now ready to FR this
+			 * guy\n");
+			 */
 			if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
 				sctp_log_fr(tp1->rec.data.TSN_seq, tp1->snd_count,
 				    0, SCTP_FR_MARKED);
@@ -3557,7 +3538,7 @@
 			tot_retrans++;
 			/* mark the sending seq for possible subsequent FR's */
 			/*
-			 * printf("Marking TSN for FR new value %x\n",
+			 * SCTP_PRINTF("Marking TSN for FR new value %x\n",
 			 * (uint32_t)tpi->rec.data.TSN_seq);
 			 */
 			if (TAILQ_EMPTY(&asoc->send_queue)) {
@@ -3657,8 +3638,7 @@
 				/* Yes so drop it */
 				if (tp1->data) {
 					(void)sctp_release_pr_sctp_chunk(stcb, tp1,
-					    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
-					    SCTP_SO_NOT_LOCKED);
+					    1, SCTP_SO_NOT_LOCKED);
 				}
 			} else {
 				/*
@@ -3709,11 +3689,10 @@
 
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		if (chk->sent < SCTP_DATAGRAM_RESEND) {
-			printf("Chk TSN:%u size:%d inflight cnt:%d\n",
+			SCTP_PRINTF("Chk TSN:%u size:%d inflight cnt:%d\n",
 			    chk->rec.data.TSN_seq,
 			    chk->send_size,
-			    chk->snd_count
-			    );
+			    chk->snd_count);
 			inflight++;
 		} else if (chk->sent == SCTP_DATAGRAM_RESEND) {
 			resend++;
@@ -3730,7 +3709,7 @@
 #ifdef INVARIANTS
 		panic("Flight size-express incorrect? \n");
 #else
-		printf("asoc->total_flight:%d cnt:%d\n",
+		SCTP_PRINTF("asoc->total_flight:%d cnt:%d\n",
 		    entry_flight, entry_cnt);
 
 		SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n",
@@ -3876,7 +3855,7 @@
 				*ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25);
 			}
 			stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25;
-			sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED);
+			sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED);
 			return;
 #endif
 		}
@@ -3895,7 +3874,7 @@
 		TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) {
 			if (SCTP_TSN_GE(cumack, tp1->rec.data.TSN_seq)) {
 				if (tp1->sent == SCTP_DATAGRAM_UNSENT) {
-					printf("Warning, an unsent is now acked?\n");
+					SCTP_PRINTF("Warning, an unsent is now acked?\n");
 				}
 				if (tp1->sent < SCTP_DATAGRAM_ACKED) {
 					/*
@@ -4005,7 +3984,7 @@
 	}
 	/* sa_ignore NO_NULL_CHK */
 	if (stcb->sctp_socket) {
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		struct socket *so;
 
 #endif
@@ -4014,7 +3993,7 @@
 			/* sa_ignore NO_NULL_CHK */
 			sctp_wakeup_log(stcb, 1, SCTP_WAKESND_FROM_SACK);
 		}
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		so = SCTP_INP_SO(stcb->sctp_ep);
 		atomic_add_int(&stcb->asoc.refcnt, 1);
 		SCTP_TCB_UNLOCK(stcb);
@@ -4028,7 +4007,7 @@
 		}
 #endif
 		sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket);
-#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
 		SCTP_SOCKET_UNLOCK(so, 1);
 #endif
 	} else {
@@ -4050,7 +4029,7 @@
 					/* addr came good */
 					net->dest_state |= SCTP_ADDR_REACHABLE;
 					sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb,
-					    SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED);
+					    0, (void *)net, SCTP_SO_NOT_LOCKED);
 				}
 				if (net == stcb->asoc.primary_destination) {
 					if (stcb->asoc.alternate) {
@@ -4238,7 +4217,7 @@
 					*ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_24);
 				}
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24;
-				sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED);