Page MenuHomeFreeBSD

D45621.diff
No OneTemporary

D45621.diff

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
AARCH64_STRING_FUNCS= \
memchr \
- memcmp \
memcpy \
memmove \
memrchr \
@@ -20,6 +19,11 @@
strnlen \
strrchr
+
+# memcmp is now provided by a machine-dependent local implementation
+# (memcmp.S) rather than the Arm Optimized Routines version above;
+# bcmp.S assembles the same code under the bcmp name.
+MDSRCS+= \
+ memcmp.S \
+ bcmp.S
+
#
# Add the above functions. Generate an asm file that includes the needed
# Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/bcmp.S b/lib/libc/aarch64/string/bcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/bcmp.S
@@ -0,0 +1,8 @@
+/*-
+ * Written by Mateusz Guzik <mjg@freebsd.org>
+ * Public domain.
+ */
+
+/*
+ * bcmp(3) only has to distinguish equal from not-equal; the memcmp
+ * implementation already satisfies that contract, so build the same
+ * code under the bcmp name.  Defining BCMP makes memcmp.S remap the
+ * memcmp/__memcmp symbols to bcmp/__bcmp.
+ */
+#define BCMP
+#include "memcmp.S"
+
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,213 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+/*
+ * When built with BCMP defined (see bcmp.S) this same code is
+ * assembled under the bcmp/__bcmp names; bcmp(3) only needs a
+ * zero/non-zero answer, which the memcmp return value provides.
+ */
+#ifdef BCMP
+#define memcmp bcmp
+#define __memcmp __bcmp
+#endif
+
+ .weak memcmp
+ .set memcmp, __memcmp
+ .text
+
+/*
+ * int memcmp(const void *b1, const void *b2, size_t len)
+ *
+ * In:  x0 = b1, x1 = b2, x2 = len
+ * Out: w0 = 0 if the buffers are equal (or len == 0), otherwise the
+ *      byte difference b1[i] - b2[i] at the first mismatching i < len.
+ *
+ * Register roles: x8/x9 = working pointers into b1/b2, x2 = remaining
+ * length, q0/q1 = 32 bytes from b1, q2/q3 = 32 bytes from b2.
+ * Strategy: always compare 32 bytes at a time with ASIMD; short
+ * buffers that would cross a page boundary are loaded ending at the
+ * limit and realigned with a TBL permutation instead.
+ */
+ENTRY(__memcmp)
+
+ mov x8, x0 // store base addresses
+ mov x9, x1
+ cbz x2, .Lnone // 0 length
+
+ /*
+ * Check if buffer is located at end of page to avoid crossing
+ * into unmapped page. If so, we load 32 bytes from the end of the
+ * limit and check the other buffer.
+ */
+
+ cmp x2, #32
+ b.hi .Lbegin
+ add x3, x8, #32
+ add x4, x9, #32
+ eor x3, x3, x8 // PAGE_SIZE bit set iff [x8, x8+32) crosses a page
+ eor x4, x4, x9 // likewise for the second buffer
+
+ tst w3, #PAGE_SIZE
+ b.eq 0f
+
+ mov x3, #32 // x3 = 32 - len, the required shift distance
+ sub x3, x3, x2
+ sub x8, x8, x3 // back up so the 32-byte load ends exactly at x8 + len
+
+ /*
+ * We perform a variable shift in the vector register using TBL,
+ * a suitable permutation is generated by loading a table of bytes
+ * with a desired offset.
+ */
+
+ adrp x0, shift_table
+ add x0, x0, :lo12:shift_table
+ add x0, x0, x3
+ ldp q0, q1, [x8]
+ ldp q4, q5, [x0] // load permutation table
+ tbl v0.16b, {v0.16b, v1.16b}, v4.16b
+ tbl v1.16b, {v0.16b, v1.16b}, v5.16b // safe despite v0 clobber: v5 indices are all >= 16
+ add x8, x8, x3 // reset pointer to beginning of src
+ b 1f
+
+0:
+ ldp q0, q1, [x8]
+
+1:
+ tst w4, #PAGE_SIZE // same page-end treatment for the second buffer
+ b.eq 0f
+
+ mov x3, #32
+ sub x3, x3, x2
+ sub x9, x9, x3
+
+ ldp q2, q3, [x9]
+ adrp x0, shift_table
+ add x0, x0, :lo12:shift_table
+ add x0, x0, x3
+ ldp q4, q5, [x0]
+ tbl v2.16b, {v2.16b, v3.16b}, v4.16b
+ tbl v3.16b, {v2.16b, v3.16b}, v5.16b
+ add x9, x9, x3
+ b 1f
+
+ /*
+ * Compare strings of 1--32 bytes. We do this by loading into two
+ * vector registers and then doing a quick compare with XOR, UMAXP
+ * to determine if the first 32 bytes all match.
+ */
+ .p2align 4
+.Lbegin:
+ ldp q0, q1, [x8]
+0:
+ ldp q2, q3, [x9]
+1:
+
+ /* quick check whether the first 32 bytes all match */
+ eor v4.16b, v0.16b, v2.16b // v4 = b1(0-15) XOR b2(0-15)
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b
+ umaxp v4.16b, v4.16b, v4.16b // fill v4 with max value
+ fmov x6, d4
+ cbz x6, .Lloop // if d4 is 0 then all matched
+
+ cmeq v0.16b, v0.16b, v2.16b // do compare between 0-15 b1 vs b2
+ cmeq v1.16b, v1.16b, v3.16b // do compare between 16-31 b1 vs b2
+ shrn v0.8b, v0.8h, #4 // narrow each 0x00/0xff lane to 4 mask bits
+ shrn v1.8b, v1.8h, #4
+
+ fmov x1, d0 // x1 = equality mask of bytes 0-15
+ fmov x3, d1 // x3 = equality mask of bytes 16-31
+
+ mvn x0, x1 // invert for clz
+ mvn x3, x3
+ rbit x1, x0 // bit-reverse so clz finds the lowest set bit
+ rbit x3, x3
+ clz x1, x1
+ clz x3, x3
+ add x3, x3, #64 // bias the upper-half index past bytes 0-15
+ cmn x0, #0 // Z set iff no mismatch in bytes 0-15
+ csel x0, x3, x1, eq // take x3 if none, else x1
+
+ lsr x0, x0, #2 // 4 mask bits per byte -> byte index
+ cmp x0, x2
+ b.hs .Lnone // mismatch at/past the limit: buffers are equal
+ ldrb w4, [x8, x0]
+ ldrb w5, [x9, x0]
+ sub w0, w4, w5 // get the byte difference
+ ret
+
+ /*
+ * Compare strings of 32+ bytes. We introduce special handling if
+ * there's less than 32 bytes left of the limit.
+ * Invariant on entry: the 32 bytes at [x8]/[x9] already matched
+ * and x2 counts the remaining length starting at x8.
+ */
+ .p2align 4
+.Lloop:
+ cmp x2, #64 // at most 32 unchecked bytes left?
+ b.ls .Llast32
+ sub x2, x2, #32
+ ldp q0, q1, [x8,#32]! // advance and load the next 32 bytes
+ ldp q2, q3, [x9,#32]!
+
+ eor v4.16b, v0.16b, v2.16b
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b
+ umaxp v4.16b, v4.16b, v4.16b
+ fmov x6, d4
+ cbnz x6, .Lmatch
+
+ /* main loop unrolled */
+ cmp x2, #64 // at most 32 unchecked bytes left?
+ b.ls .Llast32
+ sub x2, x2, #32
+ ldp q0, q1, [x8,#32]!
+ ldp q2, q3, [x9,#32]!
+
+ eor v4.16b, v0.16b, v2.16b
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b
+ umaxp v4.16b, v4.16b, v4.16b
+ fmov x6, d4
+ cbz x6, .Lloop
+ b .Lmatch
+
+ /* At most 32 unchecked bytes remain: reload the final 32 bytes
+ * ending exactly at the limit (overlapping data already compared)
+ * to avoid overreading past x8/x9 + limit. */
+ .p2align 4
+.Llast32:
+ sub x2, x2, #32 // x2 = bytes beyond the already-compared window
+ cmp x2, #0
+ b.le .Lnone // nothing left: buffers are equal
+ add x8, x8, x2 // step so [x8, x8+32) ends at the limit
+ add x9, x9, x2
+ mov x2, #32
+ ldp q0, q1, [x8]
+ ldp q2, q3, [x9]
+.Lmatch:
+ /* Locate the first mismatching byte, as in the 1--32 byte path. */
+ cmeq v0.16b, v0.16b, v2.16b
+ cmeq v1.16b, v1.16b, v3.16b
+
+ shrn v0.8b, v0.8h, #4
+ shrn v1.8b, v1.8h, #4
+ fmov x1, d0
+ fmov x3, d1
+
+ mvn x0, x1
+ mvn x3, x3
+ rbit x1, x0
+ rbit x3, x3
+ clz x1, x1
+ clz x3, x3
+ add x3, x3, #64
+ cmn x0, #0 // Z set iff no mismatch in bytes 0-15
+ csel x0, x3, x1, eq
+
+ lsr x0, x0, #2 // 4 mask bits per byte -> byte index
+ cmp x0, x2
+ b.hs .Lnone // mismatch at/past the limit: buffers are equal
+ ldrb w4, [x8, x0]
+ ldrb w5, [x9, x0]
+ sub w0, w4, w5
+ ret
+
+.Lnone:
+ mov x0, #0 // buffers compare equal
+ ret
+
+END(__memcmp)
+
+ .section .rodata
+ .p2align 4
+/*
+ * Sliding permutation table for TBL: a 32-byte read at offset n
+ * (0 <= n < 32) yields indices n..n+31; indices >= 32 are out of
+ * range for a two-register TBL and yield 0 for lanes past the
+ * buffer end.
+ * NOTE(review): with n up to 31 the ldp reads up to 15 bytes past
+ * the 48-byte table; those lanes map past the limit and are
+ * filtered by the index check, but the tail overread touches
+ * adjacent .rodata -- confirm this is intended.
+ */
+shift_table:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ .fill 16, 1, -1
+ .size shift_table, .-shift_table

File Metadata

Mime Type
text/plain
Expires
Fri, Nov 8, 3:02 PM (17 h, 28 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14537164
Default Alt Text
D45621.diff (5 KB)

Event Timeline