Page MenuHomeFreeBSD

D45943.id140964.diff
No OneTemporary

D45943.id140964.diff

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -16,10 +16,12 @@
strcmp \
strcpy \
strlen \
- strncmp \
strnlen \
strrchr
+
+MDSRCS+= \
+ strncmp.S
#
# Add the above functions. Generate an asm file that includes the needed
# Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strncmp.S b/lib/libc/aarch64/string/strncmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strncmp.S
@@ -0,0 +1,567 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+ .text
+
+ENTRY(strncmp)
+
+ bic x8, x0, #0xf // x8 is x0 but aligned to the boundary
+ and x9, x0, #0xf // x9 is the offset
+ bic x10, x1, #0xf // x10 is x1 but aligned to the boundary
+ and x11, x1, #0xf // x11 is the offset
+
+ subs x2, x2, #1
+ b.mi .Lempty
+
+ mov x13, #-1 // save constants for later
+ mov x16, #0xf
+
+ /*
+ * Check if either string is located at end of page to avoid crossing
+ * into unmapped page. If so, we load 16 bytes from the nearest
+ * alignment boundary and shift based on the offset.
+ */
+
+ add x3, x0, #16 // end of head
+ add x4, x1, #16
+ eor x3, x3, x0
+ eor x4, x4, x1 // bits that changed
+ orr x3, x3, x4 // in either str1 or str2
+ cmp x2,#16
+ b.lt .Llt16
+ tbz w3, #PAGE_SHIFT, .Lbegin
+
+ ldr q0, [x8] // load aligned head
+ ldr q1, [x10]
+
+ lsl x14, x9, #2
+ lsl x15, x11, #2
+ lsl x3, x13, x14 // string head
+ lsl x4, x13, x15
+
+ cmeq v5.16b, v0.16b, #0
+ cmeq v6.16b, v1.16b, #0
+
+ shrn v5.8b, v5.8h, #4
+ shrn v6.8b, v6.8h, #4
+ fmov x5, d5
+ fmov x6, d6
+
+ adrp x14, shift_data
+ add x14, x14, :lo12:shift_data
+
+ /* heads may cross page boundary, avoid unmapped loads */
+ tst x5, x3
+ b.eq 0f
+
+ ldr q4, [x14, x9] // load permutation table
+ tbl v0.16b, {v0.16b}, v4.16b
+
+ b 1f
+ .p2align 4
+0:
+ ldr q0, [x0] // load true head
+1:
+ tst x6, x4
+ b.eq 0f
+
+ ldr q4, [x14, x11]
+ tbl v4.16b, {v1.16b}, v4.16b
+
+ b 1f
+
+ .p2align 4
+.Lbegin:
+ ldr q0, [x0] // load true heads
+0:
+ ldr q4, [x1]
+1:
+ cmeq v2.16b, v0.16b, #0 // NUL byte present?
+ cmeq v4.16b, v0.16b, v4.16b // which bytes match?
+
+ orn v2.16b, v2.16b, v4.16b // mismatch or NUL byte?
+
+ shrn v2.8b, v2.8h, #4
+ fmov x5, d2
+
+ cbnz x5, .Lhead_mismatch
+ /* load head and second chunk */
+ ldr q2, [x8, #16] // load second chunk
+ ldr q3, [x10, #16]
+
+ add x2, x2, x11
+ sub x2, x2, #16 // account for length of RSI chunk?
+
+ subs x9, x9, x11 // is a&0xf >= b&0xf
+ b.lt .Lswapped // if not swap operands
+ b .Lnormal
+
+ .p2align 4
+.Llt16:
+ /*
+ * Check if either string is located at end of page to avoid crossing
+ * into unmapped page. If so, we load 16 bytes from the nearest
+ * alignment boundary and shift based on the offset.
+ */
+ tbz w3, #PAGE_SHIFT, 2f
+
+ ldr q0, [x8] // load aligned head
+ ldr q1, [x10]
+
+ lsl x14, x9, #2
+ lsl x15, x11, #2
+ lsl x3, x13, x14 // string head
+ lsl x4, x13, x15
+
+ /* Introduce a null byte match if the limit is within the aligned chunk */
+ add x14, x2, x9
+ add x15, x2, x11
+ lsl x14, x14, #2
+ lsl x15, x15, #2
+ lsl x14, x16, x14
+ lsl x15, x16, x15
+
+ cmeq v5.16b, v0.16b, #0
+ cmeq v6.16b, v1.16b, #0
+
+ shrn v5.8b, v5.8h, #4
+ shrn v6.8b, v6.8h, #4
+ fmov x5, d5
+ fmov x6, d6
+
+ orr x5, x5, x14 // insert null match if limit reached
+ orr x6, x6, x15
+
+ adrp x14, shift_data
+ add x14, x14, :lo12:shift_data
+
+ /* heads may cross page boundary, avoid unmapped loads */
+ tst x5, x3
+ b.eq 0f
+
+ ldr q4, [x14, x9] // load permutation table
+ tbl v0.16b, {v0.16b}, v4.16b
+
+ b 1f
+ .p2align 4
+0:
+ ldr q0, [x0] // load true head
+1:
+ tst x6, x4
+ b.eq 0f
+
+ ldr q4, [x14, x11]
+ tbl v4.16b, {v1.16b}, v4.16b
+
+ b 1f
+
+ .p2align 4
+2:
+ ldr q0, [x0] // load true heads
+0:
+ ldr q4, [x1]
+1:
+
+ cmeq v2.16b, v0.16b, #0 // NUL byte present?
+ cmeq v4.16b, v0.16b, v4.16b // which bytes match?
+
+ bic v2.16b, v4.16b, v2.16b // match and not NUL byte
+
+ shrn v2.8b, v2.8h, #4
+ fmov x5, d2
+ lsl x4, x2, #2
+ lsl x4, x13, x4
+ orn x5, x4, x5 // mismatch or NUL byte?
+
+.Lhead_mismatch:
+ rbit x3, x5
+ clz x3, x3 // index of mismatch
+ lsr x3, x3, #2
+ ldrb w4, [x0, x3]
+ ldrb w5, [x1, x3]
+ sub w0, w4, w5
+ ret
+
+ .p2align 4
+.Lnormal:
+ sub x12, x10, x9
+ ldr q0, [x12, #16]!
+ sub x10, x10, x8
+ sub x11, x10, x9
+
+ cmeq v1.16b, v3.16b, #0 // NUL present?
+ cmeq v0.16b, v0.16b, v2.16b // Mismatch between chunks?
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ add x8, x8, #32 // advance to next iteration
+
+ lsl x4, x2, #2
+ lsl x4, x13, x4
+ orr x3, x6, x4 // introduce a null byte match
+ cmp x2, #16 // does the buffer end within x2
+ csel x6, x3, x6, lt
+ cbnz x6, .Lnulfound2 // NUL or end of buffer found?
+ mvn x5, x5
+ cbnz x5, .Lmismatch2
+ sub x2, x2, #16
+ cmp x2, #32 // end of buffer within first main loop iteration?
+ b.lt .Ltail
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * X1 doesn't end within region 2, then we compare chunk B between the
+ * two strings. As X1 is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that x0 has not ended yet.
+ */
+ .p2align 4
+0:
+ ldr q0, [x8, x11]
+ ldr q1, [x8, x10]
+ ldr q2, [x8]
+
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+ cbnz x6, .Lnulfound
+ mvn x5, x5 // any mismatches?
+ cbnz x5, .Lmismatch
+
+ add x8, x8, #16
+
+ /* main loop unrolled twice */
+ ldr q0, [x8, x11]
+ ldr q1, [x8, x10]
+ ldr q2, [x8]
+
+ add x8, x8, #16
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+ cbnz x6, .Lnulfound2
+ mvn x5, x5
+ cbnz x5, .Lmismatch2
+ sub x2, x2, #32
+ cmp x2, #32 // end of buffer within next iteration
+ b.ge 0b // if yes, process tail
+
+ /* end of buffer will occur in next 32 bytes */
+.Ltail:
+ ldr q0, [x8, x11]
+ ldr q1, [x8, x10]
+ ldr q2, [x8]
+
+ cmeq v1.16b, v1.16b, #0 // end of string?
+ cmeq v0.16b, v0.16b, v2.16b // do the chunks match?
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ /*
+ * If x2 <= 16 then we introduce a NUL byte in the
+ * result from CMEQ to avoid comparing further!
+ */
+
+ lsl x4, x2, #2
+ lsl x4, x13, x4
+ orr x3, x6, x4 // introduce a null byte match
+ cmp x2, #16 // does the buffer end within x2
+ csel x6, x3, x6, lt
+
+ cbnz x6, .Lnulfound // NUL or end of string found
+ mvn x5, x5
+ cbnz x5, .Lmismatch
+
+ add x8, x8, #16
+
+ /* main loop unrolled twice */
+ ldr q0, [x8, x11]
+ ldr q1, [x8, x10]
+ ldr q2, [x8]
+
+ add x8, x8, #16
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ ubfiz x4, x2, #2, #4 // (x2 - 16) << 2
+ lsl x4, x13, x4 // take first half into account
+ orr x6, x6, x4 // introduce a null byte match
+
+.Lnulfound2:
+ sub x8, x8, #16
+
+.Lnulfound:
+ mov x4, x6
+
+ ubfiz x7, x9, #2, #4
+ lsl x6, x6, x7 // adjust NUL mask to indices
+
+ orn x5, x6, x5
+ cbnz x5, .Lmismatch
+
+ /*
+ * (x0) == (x1) and NUL is past the string.
+ * Compare (x1) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ ldr q0, [x8, x9]
+ ldr q1, [x8, x10]
+
+ cmeq v1.16b, v0.16b, v1.16b
+ shrn v1.8b, v1.8h, #4
+ fmov x5, d1
+
+ orn x5, x4, x5
+
+ rbit x3, x5
+ clz x3, x3
+ lsr x5, x3, #2
+
+ add x10, x10, x8 // restore x10 pointer
+ add x8, x8, x9 // point to corresponding chunk in x0
+
+ ldrb w4, [x8, x5]
+ ldrb w5, [x10, x5]
+ sub w0, w4, w5
+ ret
+
+ .p2align 4
+.Lmismatch2:
+ sub x8, x8, #16 // roll back second increment
+.Lmismatch:
+ rbit x3, x5
+ clz x3, x3 // index of mismatch
+ lsr x3, x3, #2
+ add x11, x8, x11
+
+ ldrb w4, [x8, x3]
+ ldrb w5, [x11, x3]
+ sub w0, w4, w5 // difference of the mismatching chars
+ ret
+
+ /*
+ * If (a&0xf) < (b&0xf), we do the same thing but with swapped
+ * operands. I found that this performs slightly better than
+ * using conditional moves to do the swap branchless.
+ */
+ .p2align 4
+.Lswapped:
+ add x12, x8, x9
+ ldr q0, [x12, #16]!
+ sub x8, x8, x10
+ add x11, x8, x9
+ add x2,x2,x9
+ neg x9, x9
+
+ cmeq v1.16b, v2.16b, #0
+ cmeq v0.16b, v0.16b, v3.16b
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ add x10, x10, #32
+
+ lsl x4, x2, #2
+ lsl x4, x13, x4
+ orr x3,x6,x4 // introduce a null byte match
+ cmp x2,#16
+ csel x6, x3, x6, lt
+ cbnz x6, .Lnulfound2s
+ mvn x5, x5
+ cbnz x5, .Lmismatch2s
+
+ sub x2, x2, #16
+ cmp x2, #32
+ b.lt .Ltails
+
+ /*
+ * During the main loop, the layout of the two strings is something like:
+ *
+ * v ------1------ v ------2------ v
+ * X1: AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
+ * X0: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
+ *
+ * where v indicates the alignment boundaries and corresponding chunks
+ * of the strings have the same letters. Chunk A has been checked in
+ * the previous iteration. This iteration, we first check that string
+ * X0 doesn't end within region 2, then we compare chunk B between the
+ * two strings. As X0 is known not to hold a NUL byte in regions 1
+ * and 2 at this point, this also ensures that X1 has not ended yet.
+ */
+ .p2align 4
+0:
+ ldr q0, [x10, x11]
+ ldr q1, [x10, x8]
+ ldr q2, [x10]
+
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+ cbnz x6, .Lnulfounds
+ mvn x5, x5
+ cbnz x5, .Lmismatchs
+
+ add x10, x10, #16
+
+ /* main loop unrolled twice */
+ ldr q0, [x10, x11]
+ ldr q1, [x10, x8]
+ ldr q2, [x10]
+
+ add x10, x10, #16
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+ cbnz x6, .Lnulfound2s
+ mvn x5, x5
+ cbnz x5, .Lmismatch2s
+ sub x2, x2, #32
+ cmp x2, #32
+ b.ge 0b
+
+.Ltails:
+ ldr q0, [x10, x11]
+ ldr q1, [x10, x8]
+ ldr q2, [x10]
+
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ /*
+ * If x2 <= 16 then we introduce a NUL byte in the
+ * result from CMEQ to avoid comparing further!
+ */
+
+ lsl x4, x2, #2
+ lsl x4, x13, x4
+ orr x3, x6, x4 // introduce a null byte match
+ cmp x2, #16
+ csel x6, x3, x6, lt
+
+ cbnz x6, .Lnulfounds
+ mvn x5, x5
+ cbnz x5, .Lmismatchs
+
+ add x10, x10, #16
+
+ ldr q0, [x10, x11]
+ ldr q1, [x10, x8]
+ ldr q2, [x10]
+
+ add x10, x10, #16
+ cmeq v1.16b, v1.16b, #0
+ cmeq v0.16b, v0.16b, v2.16b
+
+ shrn v1.8b, v1.8h, #4
+ shrn v0.8b, v0.8h, #4
+ fmov x6, d1
+ fmov x5, d0
+
+ ubfiz x4, x2, #2, #4
+ lsl x4, x13, x4
+ orr x6, x6, x4 // introduce a null byte match
+
+.Lnulfound2s:
+ sub x10, x10, #16
+.Lnulfounds:
+ mov x4, x6
+
+ ubfiz x7, x9, #2, #4
+ lsl x6, x6, x7
+
+ orn x5, x6, x5
+
+ cbnz x5, .Lmismatchs
+
+ ldr q0, [x10, x9]
+ ldr q1, [x10, x8]
+
+ cmeq v1.16b, v0.16b, v1.16b
+ shrn v1.8b, v1.8h, #4
+ fmov x5, d1
+
+ orn x5, x4, x5
+
+ rbit x3, x5
+ clz x3, x3
+ lsr x5, x3, #2
+
+ add x11, x10, x8
+ add x10, x10, x9
+
+ ldrb w4, [x10, x5]
+ ldrb w5, [x11, x5]
+ sub w0, w5, w4
+ ret
+
+ .p2align 4
+.Lmismatch2s:
+ sub x10, x10, #16
+.Lmismatchs:
+ rbit x3, x5
+ clz x3, x3
+ lsr x3, x3, #2
+ add x11, x10, x11
+
+ ldrb w4, [x10, x3]
+ ldrb w5, [x11, x3]
+ sub w0, w5, w4
+ ret
+
+ .p2align 4
+.Lempty:
+ eor x0, x0, x0
+ ret
+
+END(strncmp)
+
+ .section .rodata
+ .p2align 4
+shift_data:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .fill 16, 1, -1
+ .size shift_data, .-shift_data

File Metadata

Mime Type
text/plain
Expires
Mon, Jan 27, 5:56 PM (3 h, 30 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16202394
Default Alt Text
D45943.id140964.diff (12 KB)

Event Timeline