D45621: lib/libc/aarch64/string: add memcmp SIMD implementation
diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -5,7 +5,6 @@
AARCH64_STRING_FUNCS= \
memchr \
- memcmp \
memcpy \
memmove \
memrchr \
@@ -20,6 +19,11 @@
strnlen \
strrchr
+
+MDSRCS+= \
+ memcmp.S \
+ bcmp.S
+
#
# Add the above functions. Generate an asm file that includes the needed
# Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/bcmp.S b/lib/libc/aarch64/string/bcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/bcmp.S
@@ -0,0 +1,8 @@
+/*-
+ * Written by Mateusz Guzik <mjg@freebsd.org>
+ * Public domain.
+ */
+
+#define BCMP
+#include "memcmp.S"
+
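For context, bcmp.S above builds the same source a second time under a different name; a rough C analogue of the one-body-two-names pattern (illustrative sketch, not part of the patch):

#include <stddef.h>

/*
 * Any valid memcmp result is also a valid bcmp result (both are zero
 * iff the buffers are equal), so one body can serve both entry points.
 * Assumes a mismatch returns the byte difference, as in the assembly.
 */
#ifdef BCMP
#define	FUNC	bcmp
#else
#define	FUNC	memcmp
#endif

int
FUNC(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;

	for (; n > 0; p1++, p2++, n--)
		if (*p1 != *p2)
			return (*p1 - *p2);
	return (0);
}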
diff --git a/lib/libc/aarch64/string/memcmp.S b/lib/libc/aarch64/string/memcmp.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/memcmp.S
@@ -0,0 +1,213 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#ifdef BCMP
+#define memcmp bcmp
+#define __memcmp __bcmp
+#endif
+
+ .weak memcmp
+ .set memcmp, __memcmp
+ .text
+
+ENTRY(__memcmp)
+
+ mov x8, x0 // store base addresses
+ mov x9, x1
+ cbz x2, .Lnone // 0 length
+
+ /*
+ * Check whether either buffer sits close enough to the end of a
+ * page that a 32-byte load could cross into an unmapped page. If
+ * so, load the 32 bytes ending exactly at the limit instead and
+ * shift them into place.
+ */
+
+ cmp x2, #32
+ b.hi .Lbegin
+ add x3, x8, #32
+ add x4, x9, #32
+ eor x3, x3, x8
+ eor x4, x4, x9
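+
+ /*
+ * The PAGE_SIZE bit is now set in x3 (x4) iff the 32-byte load
+ * from the first (second) buffer would cross a page boundary.
+ */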
+
+ tst w3, #PAGE_SIZE
+ b.eq 0f
+
+ mov x3, #32
+ sub x3, x3, x2
+ sub x8, x8, x3
+
+ /*
+ * Perform a variable shift in the vector registers using TBL; a
+ * suitable permutation is generated by loading the shift table at
+ * the desired byte offset.
+ */
+
+ adrp x0, shift_table
+ add x0, x0, :lo12:shift_table
+ add x0, x0, x3
+ ldp q0, q1, [x8]
+ ldp q4, q5, [x0] // load permutation table
+ tbl v0.16b, {v0.16b, v1.16b}, v4.16b
+ tbl v1.16b, {v0.16b, v1.16b}, v5.16b
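+ /*
+ * Reusing v0 as both destination and table register is fine here:
+ * every index in v5 is either >= 16 (selecting from v1) or out of
+ * range (yielding zero), so the second TBL never reads the
+ * overwritten v0.
+ */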
+ add x8, x8, x3 // reset pointer to beginning of src
+ b 1f
+
+0:
+ ldp q0, q1, [x8]
+
+1:
+ tst w4, #PAGE_SIZE
+ b.eq 0f
+
+ mov x3, #32
+ sub x3, x3, x2
+ sub x9, x9, x3
+
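+ /* Same shifted load as above, applied to the second buffer. */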
+ ldp q2, q3, [x9]
+ adrp x0, shift_table
+ add x0, x0, :lo12:shift_table
+ add x0, x0, x3
+ ldp q4, q5, [x0]
+ tbl v2.16b, {v2.16b, v3.16b}, v4.16b
+ tbl v3.16b, {v2.16b, v3.16b}, v5.16b
+ add x9, x9, x3
+ b 1f
+
+ /*
+ * Compare strings of 1--32 bytes. We do this by loading each buffer
+ * into two vector registers and doing a quick check with XOR and
+ * UMAXP to determine whether the first 32 bytes all match.
+ */
+ .p2align 4
+.Lbegin:
+ ldp q0, q1, [x8]
+0:
+ ldp q2, q3, [x9]
+1:
+
+ /* quick check if no matches in first 32 bytes */
+ eor v4.16b, v0.16b, v2.16b // v4 = b1(0-15) XOR b2(0-15)
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b // pairwise-max reduce 32 -> 16 bytes
+ umaxp v4.16b, v4.16b, v4.16b // reduce again; result fits in d4
+ fmov x6, d4
+ cbz x6, .Lloop // if d4 is 0 then all 32 bytes matched
+
+ cmeq v0.16b, v0.16b, v2.16b // 0xff where bytes 0-15 are equal
+ cmeq v1.16b, v1.16b, v3.16b // 0xff where bytes 16-31 are equal
+ shrn v0.8b, v0.8h, #4 // narrow to 4 mask bits per byte
+ shrn v1.8b, v1.8h, #4
+
+ fmov x1, d0
+ fmov x3, d1
+
+ mvn x0, x1 // invert: 1-bits now mark mismatches
+ mvn x3, x3
+ rbit x1, x0 // rbit + clz counts trailing zeros
+ rbit x3, x3
+ clz x1, x1
+ clz x3, x3
+ add x3, x3, #64 // upper-half bit offsets start at 64
+ cmn x0, #0 // any mismatch in the lower 16 bytes?
+ csel x0, x3, x1, eq // if not, use the upper-half offset
+
+ lsr x0, x0, #2 // 4 mask bits per byte -> byte index
+ cmp x0, x2 // ignore mismatches past the limit
+ b.hs .Lnone
+ ldrb w4, [x8, x0]
+ ldrb w5, [x9, x0]
+ sub w0, w4, w5 // return the byte difference
+ ret
+
+ /*
+ * Compare strings of 32+ bytes. Special handling takes over when
+ * there are fewer than 32 bytes left before the limit.
+ */
+ .p2align 4
+.Lloop:
+ cmp x2, #64 // fewer than 32 bytes left for the next pass?
+ b.ls .Llast32
+ sub x2, x2, #32
+ ldp q0, q1, [x8,#32]!
+ ldp q2, q3, [x9,#32]!
+
+ eor v4.16b, v0.16b, v2.16b
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b
+ umaxp v4.16b, v4.16b, v4.16b
+ fmov x6, d4
+ cbnz x6, .Lmatch
+
+ /* main loop unrolled */
+ cmp x2, #64 // fewer than 32 bytes left for the next pass?
+ b.ls .Llast32
+ sub x2, x2, #32
+ ldp q0, q1, [x8,#32]!
+ ldp q2, q3, [x9,#32]!
+
+ eor v4.16b, v0.16b, v2.16b
+ eor v5.16b, v1.16b, v3.16b
+ umaxp v4.16b, v4.16b, v5.16b
+ umaxp v4.16b, v4.16b, v4.16b
+ fmov x6, d4
+ cbz x6, .Lloop
+ b .Lmatch
+
+ /*
+ * Fewer than 32 bytes are left to compare; load the final 32 bytes
+ * ending exactly at the limit so we never read past the buffers.
+ */
+ .p2align 4
+.Llast32:
+ sub x2, x2, #32
+ cmp x2, #0
+ b.le .Lnone
+ add x8, x8, x2
+ add x9, x9, x2
+ mov x2, #32
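+ /* These loads overlap bytes already compared; that is harmless. */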
+ ldp q0, q1, [x8]
+ ldp q2, q3, [x9]
+.Lmatch:
+ cmeq v0.16b, v0.16b, v2.16b
+ cmeq v1.16b, v1.16b, v3.16b
+
+ shrn v0.8b, v0.8h, #4
+ shrn v1.8b, v1.8h, #4
+ fmov x1, d0
+ fmov x3, d1
+
+ mvn x0, x1
+ mvn x3, x3
+ rbit x1, x0
+ rbit x3, x3
+ clz x1, x1
+ clz x3, x3
+ add x3, x3, #64
+ cmn x0, #0
+ csel x0, x3, x1, eq
+
+ lsr x0, x0, #2
+ cmp x0, x2
+ b.hs .Lnone
+ ldrb w4, [x8, x0]
+ ldrb w5, [x9, x0]
+ sub w0, w4, w5
+ ret
+
+.Lnone:
+ mov x0, #0
+ ret
+
+END(__memcmp)
+
+ .section .rodata
+ .p2align 4
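+ /*
+ * 32 identity indices followed by 16 out-of-range (0xff) entries;
+ * TBL yields zero for the out-of-range indices, so bytes shifted in
+ * past the limit read as zero, and any mismatch they cause is
+ * filtered by the final limit check.
+ */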
+shift_table:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ .fill 16, 1, -1
+ .size shift_table, .-shift_table
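For reference, the index-recovery math used after both CMEQ/SHRN sequences can be written in C; a minimal sketch, assuming eqmask_lo and eqmask_hi are the 64-bit values moved out of d0 and d1 (hypothetical names, not part of the patch):

#include <stdint.h>

/*
 * After CMEQ every equal byte is 0xff; SHRN #4 packs that into 4 mask
 * bits per byte. Inverting marks mismatches, and the count of trailing
 * zero bits divided by 4 is the first mismatching byte index. The
 * assembly uses RBIT + CLZ because AArch64 has no count-trailing-zeros
 * instruction. Assumes at least one byte differs.
 */
static unsigned
first_mismatch_index(uint64_t eqmask_lo, uint64_t eqmask_hi)
{
	uint64_t lo = ~eqmask_lo;	/* 1-bits mark mismatching bytes */
	uint64_t hi = ~eqmask_hi;

	if (lo != 0)
		return (__builtin_ctzll(lo) / 4);	/* bytes 0..15 */
	return (16 + __builtin_ctzll(hi) / 4);		/* bytes 16..31 */
}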