strlen.S revision 1.1 1 1.1 christos /*
2 1.1 christos * Written by J.T. Conklin <jtc (at) acorntoolworks.com>
3 1.1 christos * Public domain.
4 1.1 christos */
5 1.1 christos
6 1.1 christos #include <machine/asm.h>
7 1.1 christos
8 1.1 christos #if defined(LIBC_SCCS)
9 1.1 christos RCSID("$NetBSD: strlen.S,v 1.1 2005/12/20 19:28:51 christos Exp $")
10 1.1 christos #endif
11 1.1 christos
12 1.1 christos ENTRY(strlen)
13 1.1 christos movq %rdi,%rax
14 1.1 christos negq %rdi
15 1.1 christos
16 1.1 christos .Lalign:
17 1.1 christos /* Consider unrolling loop? */
18 1.1 christos testb $7,%al
19 1.1 christos je .Lword_aligned
20 1.1 christos cmpb $0,(%rax)
21 1.1 christos jne 1f
22 1.1 christos leaq (%rdi,%rax),%rax
23 1.1 christos ret
24 1.1 christos 1: incq %rax
25 1.1 christos jmp .Lalign
26 1.1 christos
27 1.1 christos /*
28 1.1 christos * There are many well known branch-free sequences which are used
29 1.1 christos * for determining whether a zero-byte is contained within a word.
30 1.1 christos * These sequences are generally much more efficent than loading
31 1.1 christos * and comparing each byte individually.
32 1.1 christos *
33 1.1 christos * The expression [1,2]:
34 1.1 christos *
35 1.1 christos * (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
36 1.1 christos *
37 1.1 christos * evaluates to a non-zero value if any of the bytes in the
38 1.1 christos * original word is zero.
39 1.1 christos *
40 1.1 christos * It also has the useful property that bytes in the result word
41 1.1 christos * that correspond to non-zero bytes in the original word have
42 1.1 christos * the value 0x00, while bytes corresponding to zero bytes have
43 1.1 christos * the value 0x80. This allows calculation of the first (and
44 1.1 christos * last) occurrence of a zero byte within the word (useful for C's
45 1.1 christos * str* primitives) by counting the number of leading (or
46 1.1 christos * trailing) zeros and dividing the result by 8. On machines
47 1.1 christos * without (or with slow) clz() / ctz() instructions, testing
48 1.1 christos * each byte in the result word for zero is necessary.
49 1.1 christos *
50 1.1 christos * This typically takes 4 instructions (5 on machines without
51 1.1 christos * "not-or") not including those needed to load the constant.
52 1.1 christos *
53 1.1 christos *
54 1.1 christos * The expression:
55 1.1 christos *
56 1.1 christos * (2) ((x - 0x01....01) & ~x & 0x80....80)
57 1.1 christos *
58 1.1 christos * evaluates to a non-zero value if any of the bytes in the
59 1.1 christos * original word is zero.
60 1.1 christos *
61 1.1 christos * On little endian machines, the first byte in the result word
62 1.1 christos * that corresponds to a zero byte in the original byte is 0x80,
63 1.1 christos * so clz() can be used as above. On big endian machines, and
64 1.1 christos * little endian machines without (or with a slow) clz() insn,
65 1.1 christos * testing each byte in the original for zero is necessary.
66 1.1 christos *
67 1.1 christos * This typically takes 3 instructions (4 on machines without
68 1.1 christos * "and with complement") not including those needed to load
69 1.1 christos * constants.
70 1.1 christos *
71 1.1 christos *
72 1.1 christos * The expression:
73 1.1 christos *
74 1.1 christos * (3) ((x - 0x01....01) & 0x80....80)
75 1.1 christos *
76 1.1 christos * always evaluates to a non-zero value if any of the bytes in
77 1.1 christos * the original word is zero. However, in rare cases, it also
78 1.1 christos * evaluates to a non-zero value when none of the bytes in the
79 1.1 christos * original word is zero.
80 1.1 christos *
81 1.1 christos * To account for possible false positives, each byte of the
82 1.1 christos * original word must be checked when the expression evaluates to
83 1.1 christos * a non-zero value. However, because it is simpler than those
84 1.1 christos * presented above, code that uses it will be faster as long as
85 1.1 christos * the rate of false positives is low.
86 1.1 christos *
87 1.1 christos * This is likely, because the the false positive can only occur
88 1.1 christos * if the most siginificant bit of a byte within the word is set.
89 1.1 christos * The expression will never fail for typical 7-bit ASCII strings.
90 1.1 christos *
91 1.1 christos * This typically takes 2 instructions not including those needed
92 1.1 christos * to load constants.
93 1.1 christos *
94 1.1 christos *
95 1.1 christos * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
96 1.1 christos *
97 1.1 christos * [2] International Business Machines, "The PowerPC Compiler Writer's
98 1.1 christos * Guide", Warthman Associates, 1996
99 1.1 christos */
100 1.1 christos
101 1.1 christos _ALIGN_TEXT
102 1.1 christos .Lword_aligned:
103 1.1 christos movabsq $0x0101010101010101,%r8
104 1.1 christos movabsq $0x8080808080808080,%r9
105 1.1 christos .Lloop:
106 1.1 christos movq (%rax),%rdx
107 1.1 christos addq $8,%rax
108 1.1 christos subq %r8,%rdx
109 1.1 christos testq %r9,%rdx
110 1.1 christos je .Lloop
111 1.1 christos
112 1.1 christos /*
113 1.1 christos * In rare cases, the above loop may exit prematurely. We must
114 1.1 christos * return to the loop if none of the bytes in the word equal 0.
115 1.1 christos */
116 1.1 christos cmpb $0,-8(%rax) /* 1st byte == 0? */
117 1.1 christos je .Lsub8
118 1.1 christos cmpb $0,-7(%rax) /* 2nd byte == 0? */
119 1.1 christos je .Lsub7
120 1.1 christos cmpb $0,-6(%rax) /* 3rd byte == 0? */
121 1.1 christos je .Lsub6
122 1.1 christos cmpb $0,-5(%rax) /* 4th byte == 0? */
123 1.1 christos je .Lsub5
124 1.1 christos cmpb $0,-4(%rax) /* 5th byte == 0? */
125 1.1 christos je .Lsub4
126 1.1 christos cmpb $0,-3(%rax) /* 6th byte == 0? */
127 1.1 christos je .Lsub3
128 1.1 christos cmpb $0,-2(%rax) /* 7th byte == 0? */
129 1.1 christos je .Lsub2
130 1.1 christos cmpb $0,-1(%rax) /* 8th byte == 0? */
131 1.1 christos jne .Lloop
132 1.1 christos
133 1.1 christos .Lsub1:
134 1.1 christos leaq -1(%rdi,%rax),%rax
135 1.1 christos ret
136 1.1 christos .Lsub2:
137 1.1 christos leaq -2(%rdi,%rax),%rax
138 1.1 christos ret
139 1.1 christos .Lsub3:
140 1.1 christos leaq -3(%rdi,%rax),%rax
141 1.1 christos ret
142 1.1 christos .Lsub4:
143 1.1 christos leaq -4(%rdi,%rax),%rax
144 1.1 christos ret
145 1.1 christos .Lsub5:
146 1.1 christos leaq -5(%rdi,%rax),%rax
147 1.1 christos ret
148 1.1 christos .Lsub6:
149 1.1 christos leaq -6(%rdi,%rax),%rax
150 1.1 christos ret
151 1.1 christos .Lsub7:
152 1.1 christos leaq -7(%rdi,%rax),%rax
153 1.1 christos ret
154 1.1 christos .Lsub8:
155 1.1 christos leaq -8(%rdi,%rax),%rax
156 1.1 christos ret
157