Page MenuHomeFreeBSD

D46243.diff
No OneTemporary

D46243.diff

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -20,6 +20,9 @@
strnlen \
strrchr
+MDSRCS+= \
+ strlcpy.S
+
#
# Add the above functions. Generate an asm file that includes the needed
# Arm Optimized Routines file defining the function name to the libc name.
diff --git a/lib/libc/aarch64/string/strlcpy.S b/lib/libc/aarch64/string/strlcpy.S
new file mode 100644
--- /dev/null
+++ b/lib/libc/aarch64/string/strlcpy.S
@@ -0,0 +1,333 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
+*/
+
+#include <machine/asm.h>
+
+	.weak	strlcpy
+	.set	strlcpy, __strlcpy
+	.text
+
+/*
+ * size_t strlcpy(char *dst, const char *src, size_t dstsize)
+ *
+ * In:	x0 = dst, x1 = src, x2 = dstsize
+ * Out:	x0 = length of src (offset of its NUL terminator from src)
+ *
+ * Copies at most dstsize - 1 bytes and NUL-terminates dst.  With
+ * dstsize == 0 nothing is stored and the call tail-calls strlen(src).
+ *
+ * Register roles once past the entry check:
+ *	x9  = original dst
+ *	x10 = src rounded down to a 16-byte boundary
+ *	x11 = src & 0xf (offset of src within its first chunk)
+ *	x2  = bytes that may still be copied, excluding the NUL
+ *
+ * NUL detection: cmeq sets 0xff in every NUL byte and shrn #4 narrows
+ * that to one nibble per byte, so rbit + clz + lsr #2 yields the byte
+ * index of the first NUL in a 16-byte chunk (16 if there is none).
+ *
+ * Only x0-x17 and v1-v3 are touched; the platform register x18 is
+ * deliberately left alone.
+ */
+ENTRY(__strlcpy)
+	subs	x2, x2, #1		// x2 = dstsize - 1
+	b.lo	.L0			// dstsize == 0: just return strlen(src)
+
+	mov	x9, x0			// stash copy of dst pointer
+	bic	x10, x1, #0xf		// src aligned
+	and	x11, x1, #0xf		// src offset
+
+	ldr	q1, [x10]
+	cmeq	v1.16b, v1.16b, #0	// NUL found in head?
+
+	mov	x8, #-1			// fill register with 0xfff..fff
+	lsl	x12, x11, #2
+	lsl	x8, x8, x12		// mask of bytes in the string
+
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+
+	ands	x5, x5, x8		// ignore bytes preceding src
+	b.ne	.Lhead_nul
+
+	ldr	q3, [x10, #16]		// load second string chunk
+	ldr	q2, [x1]		// load true head
+	mov	x8, #32
+	sub	x8, x8, x11		// string bytes in the first two chunks
+
+	cmeq	v1.16b, v3.16b, #0	// NUL found in second chunk?
+
+	subs	x2, x2, x8
+	b.ls	.Lhead_buf_end		// buffer ends within first two chunks
+
+	/* process second chunk */
+	shrn	v1.8b, v1.8h, #4
+	fmov	x5, d1
+	cbnz	x5, .Lsecond_nul
+
+	/* string didn't end in second chunk and neither did buffer */
+	ldr	q1, [x10, #32]		// load next string chunk
+	str	q2, [x0]		// deposit head into buffer
+	sub	x0, x0, x11		// adjust x0
+	str	q3, [x0, #16]		// deposit second chunk
+	add	x10, x10, #32		// advance src
+	add	x0, x0, #32		// advance dst
+	subs	x2, x2, #16		// enough left for another round?
+	b.ls	1f
+
+	/* main loop unrolled twice */
+	.p2align 4
+0:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+
+	cbnz	x5, 3f
+
+	str	q1, [x0]
+	ldr	q1, [x10, #16]		// load next chunk
+
+	cmp	x2, #16			// more than a full chunk left?
+	b.ls	2f
+
+	add	x10, x10, #32		// advance pointers
+	add	x0, x0, #32
+
+	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x5, d2
+	cbnz	x5, 4f			// process chunk if match
+
+	str	q1, [x0, #-16]
+	ldr	q1, [x10]		// load next chunk
+
+	subs	x2, x2, #32
+	b.hi	0b
+
+1:
+	sub	x10, x10, #16		// undo second advancement
+	add	x2, x2, #16
+	sub	x0, x0, #16
+
+	/* 1--16 bytes left in the buffer but string has not ended yet */
+2:
+	cmeq	v2.16b, v1.16b, #0	// NUL found in current chunk?
+	shrn	v2.8b, v2.8h, #4
+	fmov	x4, d2
+
+	mov	x6, #0xf
+	mov	x7, x4			// keep real NUL mask for the length scan
+
+	lsl	x5, x2, #2		// shift 0xf to the limits position
+	lsl	x5, x6, x5
+	cmp	x2, #16			// don't induce match if limit >= 16
+	csel	x5, x5, xzr, lo
+	orr	x8, x4, x5		// treat limit as if terminator present
+
+	rbit	x8, x8			// simulate x86 tzcnt
+	clz	x8, x8			// index of NUL or limit
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8
+
+	ldr	q1, [x10, x8]		// load tail
+	str	q1, [x0]		// store tail
+	strb	wzr, [x0, #16]		// terminate dst
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x8, x7
+	clz	x8, x8			// index of NUL
+	lsr	x8, x8, #2
+
+	sub	x10, x10, x1
+	add	x0, x10, #32
+	add	x0, x0, x8		// return strlen(src)
+
+	ret
+
+4:
+	sub	x10, x10, #16		// undo second advancement
+	sub	x0, x0, #16		// undo second advancement
+
+	/* string has ended but buffer has not */
+3:
+	rbit	x8, x5
+	clz	x8, x8			// index of NUL
+	lsr	x8, x8, #2
+
+	add	x0, x0, x8		// dst position of the NUL
+	add	x10, x10, x8		// src position of the NUL
+
+	ldr	q1, [x10, #-15]		// final 16 bytes, NUL included
+	str	q1, [x0, #-15]
+	sub	x0, x10, x1		// return strlen(src)
+
+	ret
+
+.Lhead_buf_end:
+	shrn	v1.8b, v1.8h, #4
+	fmov	x8, d1
+
+	add	x2, x2, #32		// limit relative to aligned src
+
+	mov	x7, x8			// keep NUL mask for the length scan
+
+	cmp	x2, #16			// does the limit fall in the head?
+	b.lo	0f
+
+	rbit	x8, x8
+	clz	x8, x8			// index of NUL
+	lsr	x8, x8, #2
+	add	x8, x8, #16
+
+	cmp	x8, x2
+	csel	x8, x8, x2, lo		// copy min(buflen, srclen) bytes
+	b	1f
+0:
+	mov	x8, x2			// head holds no NUL, so the limit wins
+1:
+
+	sub	x8, x8, x11		// x8 = number of bytes to copy
+	strb	wzr, [x9, x8]		// terminate dst
+
+	/* continue to find the end of the string */
+	cbnz	x7, 1f
+
+	/* we opt for a simpler strlen than the one in libc as the
+	 * cmeq, shrn approach is faster for shorter strings.
+	 */
+	.p2align 4
+0:
+	ldr	q1, [x10, #32]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	cbnz	x7, 2f
+
+	ldr	q1, [x10, #48]
+	cmeq	v1.16b, v1.16b, #0	// bytewise compare against NUL
+	shrn	v1.8b, v1.8h, #4
+	fmov	x7, d1
+	add	x10, x10, #32
+	cbz	x7, 0b
+
+1:	sub	x10, x10, #16
+2:	rbit	x6, x7
+	clz	x6, x6			// index of NUL
+	lsr	x6, x6, #2
+
+	sub	x10, x10, x1
+	add	x0, x10, #32
+	add	x0, x0, x6		// return strlen(src)
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	b	.L1732
+
+.Lsecond_nul:
+	add	x2, x2, x8		// restore limit (dstsize - 1)
+
+	rbit	x8, x5
+	clz	x8, x8			// index of NUL
+	lsr	x5, x8, #2
+
+	sub	x8, x11, #16
+	sub	x0, x5, x8		// string length
+
+	cmp	x0, x2			// did we match or hit limit first?
+	csel	x8, x2, x0, hi
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+
+	strb	wzr, [x4]		// terminate dst
+
+	/* copy 16-32 bytes as two overlapping 16-byte halves */
+.L1732:
+	cmp	x8, #16
+	b.lo	.L0816
+	ldp	x16, x17, [x1]
+	ldp	x12, x1, [x5, #-16]
+	stp	x16, x17, [x9]
+	stp	x12, x1, [x4, #-16]
+	ret
+
+.Lhead_nul:
+	rbit	x8, x5
+	clz	x8, x8			// index of NUL
+	lsr	x8, x8, #2
+
+	sub	x0, x8, x11		// string length
+	cmp	x0, x2			// did we match or hit limit first?
+	csel	x8, x2, x0, hi
+
+	add	x4, x9, x8		// dst + cnt
+	add	x5, x1, x8		// src + cnt
+	strb	wzr, [x4]		// terminate dst
+
+	/* Copy 8-15 bytes */
+.L0816:
+	tbz	x8, #3, .L0407
+	ldr	x16, [x1]
+	ldr	x17, [x5, #-8]
+	str	x16, [x9]
+	str	x17, [x4, #-8]
+	ret
+
+	/* Copy 4-7 bytes */
+	.p2align 4
+.L0407:
+	cmp	x8, #3
+	b.ls	.L0203
+	ldr	w16, [x1]
+	ldr	w17, [x5, #-4]		// w17, not w18: x18 is platform-reserved
+	str	w16, [x9]
+	str	w17, [x4, #-4]
+	ret
+
+	/* Copy 2-3 bytes */
+.L0203:
+	tbz	x8, #1, .L0001
+	ldrh	w16, [x1]
+	ldrh	w17, [x5, #-2]
+	strh	w16, [x9]
+	strh	w17, [x4, #-2]
+	ret
+
+	/* Copy 0-1 bytes */
+.L0001:
+	ldrb	w16, [x1]
+	strb	w16, [x9]
+	strb	wzr, [x4]
+	ret
+
+	/* dstsize == 0: nothing may be stored, only the length is needed */
+.L0:
+	mov	x0, x1
+	b	strlen			// tail call; strlen returns to our caller
+END(__strlcpy)

File Metadata

Mime Type
text/plain
Expires
Sun, Nov 17, 5:16 PM (20 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14659215
Default Alt Text
D46243.diff (6 KB)

Event Timeline