Rewrite the final loop in vm_phys_enqueue_contig as a new function, vm_phys_enq_beg.
On amd64, the compiled loop shrinks from 91 bytes to 70 bytes.
Old:
    1e20: 48 89 d1                  movq    %rdx, %rcx
    1e23: 48 29 f1                  subq    %rsi, %rcx
    1e26: 48 c1 f9 03               sarq    $0x3, %rcx
    1e2a: 48 0f af cf               imulq   %rdi, %rcx
    1e2e: 48 0f bd c9               bsrq    %rcx, %rcx
    1e32: 88 4e 5c                  movb    %cl, 0x5c(%rsi)
    1e35: 4c 8d 46 10               leaq    0x10(%rsi), %r8
    1e39: 48 c7 46 10 00 00 00 00   movq    $0x0, 0x10(%rsi)
    1e41: 41 89 c9                  movl    %ecx, %r9d
    1e44: 4f 8d 0c 49               leaq    (%r9,%r9,2), %r9
    1e48: 4e 8b 54 c8 08            movq    0x8(%rax,%r9,8), %r10
    1e4d: 4c 89 56 18               movq    %r10, 0x18(%rsi)
    1e51: 4e 8b 54 c8 08            movq    0x8(%rax,%r9,8), %r10
    1e56: 49 89 32                  movq    %rsi, (%r10)
    1e59: 4e 89 44 c8 08            movq    %r8, 0x8(%rax,%r9,8)
    1e5e: 42 ff 44 c8 10            incl    0x10(%rax,%r9,8)
    1e63: 41 b8 01 00 00 00         movl    $0x1, %r8d
    1e69: 41 d3 e0                  shll    %cl, %r8d
    1e6c: 49 63 c8                  movslq  %r8d, %rcx
    1e6f: 48 6b c9 68               imulq   $0x68, %rcx, %rcx
    1e73: 48 01 ce                  addq    %rcx, %rsi
    1e76: 48 39 d6                  cmpq    %rdx, %rsi
    1e79: 72 a5                     jb      0x1e20 <vm_phys_enqueue_contig+0x130>
New:
    1e20: 0f bd ca                  bsrl    %edx, %ecx
    1e23: 88 4e 5c                  movb    %cl, 0x5c(%rsi)
    1e26: 48 8d 7e 10               leaq    0x10(%rsi), %rdi
    1e2a: 48 c7 46 10 00 00 00 00   movq    $0x0, 0x10(%rsi)
    1e32: 4c 8d 04 49               leaq    (%rcx,%rcx,2), %r8
    1e36: 4e 8b 4c c0 08            movq    0x8(%rax,%r8,8), %r9
    1e3b: 4c 89 4e 18               movq    %r9, 0x18(%rsi)
    1e3f: 4e 8b 4c c0 08            movq    0x8(%rax,%r8,8), %r9
    1e44: 49 89 31                  movq    %rsi, (%r9)
    1e47: 4a 89 7c c0 08            movq    %rdi, 0x8(%rax,%r8,8)
    1e4c: 42 ff 44 c0 10            incl    0x10(%rax,%r8,8)
    1e51: bf 01 00 00 00            movl    $0x1, %edi
    1e56: d3 e7                     shll    %cl, %edi
    1e58: 48 63 cf                  movslq  %edi, %rcx
    1e5b: 48 6b f9 68               imulq   $0x68, %rcx, %rdi
    1e5f: 48 01 fe                  addq    %rdi, %rsi
    1e62: 29 ca                     subl    %ecx, %edx
    1e64: 75 ba                     jne     0x1e20 <vm_phys_enqueue_contig+0x130>