Home | History | Annotate | Line # | Download | only in string
strlen.S revision 1.1
      1  1.1  christos /*
      2  1.1  christos  * Written by J.T. Conklin <jtc (at) acorntoolworks.com>
      3  1.1  christos  * Public domain.
      4  1.1  christos  */
      5  1.1  christos 
      6  1.1  christos #include <machine/asm.h>
      7  1.1  christos 
      8  1.1  christos #if defined(LIBC_SCCS)
      9  1.1  christos 	RCSID("$NetBSD: strlen.S,v 1.1 2005/12/20 19:28:51 christos Exp $")
     10  1.1  christos #endif
     11  1.1  christos 
     12  1.1  christos ENTRY(strlen)
     13  1.1  christos 	movq	%rdi,%rax
     14  1.1  christos 	negq	%rdi
     15  1.1  christos 
     16  1.1  christos .Lalign:
     17  1.1  christos 	/* Consider unrolling loop? */
     18  1.1  christos 	testb	$7,%al
     19  1.1  christos 	je	.Lword_aligned
     20  1.1  christos 	cmpb	$0,(%rax)
     21  1.1  christos 	jne	1f
     22  1.1  christos 	leaq	(%rdi,%rax),%rax
     23  1.1  christos 	ret
     24  1.1  christos 1:	incq	%rax
     25  1.1  christos 	jmp	.Lalign
     26  1.1  christos 
     27  1.1  christos 	/*
     28  1.1  christos 	 * There are many well known branch-free sequences which are used
     29  1.1  christos 	 * for determining whether a zero-byte is contained within a word.
     30  1.1  christos 	 * These sequences are generally much more efficent than loading
     31  1.1  christos 	 * and comparing each byte individually.
     32  1.1  christos 	 *
     33  1.1  christos 	 * The expression [1,2]:
     34  1.1  christos 	 *
     35  1.1  christos 	 * (1)  ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
     36  1.1  christos 	 *
     37  1.1  christos 	 * evaluates to a non-zero value if any of the bytes in the
     38  1.1  christos 	 * original word is zero.
     39  1.1  christos 	 *
     40  1.1  christos 	 * It also has the useful property that bytes in the result word
     41  1.1  christos 	 * that correspond to non-zero bytes in the original word have
     42  1.1  christos 	 * the value 0x00, while bytes corresponding to zero bytes have
     43  1.1  christos 	 * the value 0x80. This allows calculation of the first (and
     44  1.1  christos 	 * last) occurrence of a zero byte within the word (useful for C's
     45  1.1  christos 	 * str* primitives) by counting the number of leading (or
     46  1.1  christos 	 * trailing) zeros and dividing the result by 8.  On machines
     47  1.1  christos 	 * without (or with slow) clz() / ctz() instructions, testing
     48  1.1  christos 	 * each byte in the result word for zero is necessary.
     49  1.1  christos 	 *
     50  1.1  christos 	 * This typically takes 4 instructions (5 on machines without
     51  1.1  christos 	 * "not-or") not including those needed to load the constant.
     52  1.1  christos 	 *
     53  1.1  christos 	 *
     54  1.1  christos 	 * The expression:
     55  1.1  christos 	 *
     56  1.1  christos 	 * (2)  ((x - 0x01....01) & ~x & 0x80....80)
     57  1.1  christos 	 *
     58  1.1  christos 	 * evaluates to a non-zero value if any of the bytes in the
     59  1.1  christos 	 * original word is zero.
     60  1.1  christos 	 *
     61  1.1  christos 	 * On little endian machines, the first byte in the result word
     62  1.1  christos 	 * that corresponds to a zero byte in the original byte is 0x80,
     63  1.1  christos 	 * so clz() can be used as above.  On big endian machines, and
     64  1.1  christos 	 * little endian machines without (or with a slow) clz() insn,
     65  1.1  christos 	 * testing each byte in the original for zero is necessary.
     66  1.1  christos 	 *
     67  1.1  christos 	 * This typically takes 3 instructions (4 on machines without
     68  1.1  christos 	 * "and with complement") not including those needed to load
     69  1.1  christos 	 * constants.
     70  1.1  christos 	 *
     71  1.1  christos 	 *
     72  1.1  christos 	 * The expression:
     73  1.1  christos 	 *
     74  1.1  christos 	 * (3)  ((x - 0x01....01) & 0x80....80)
     75  1.1  christos 	 *
     76  1.1  christos 	 * always evaluates to a non-zero value if any of the bytes in
     77  1.1  christos 	 * the original word is zero.  However, in rare cases, it also
     78  1.1  christos 	 * evaluates to a non-zero value when none of the bytes in the
     79  1.1  christos 	 * original word is zero.
     80  1.1  christos 	 *
     81  1.1  christos 	 * To account for possible false positives, each byte of the
     82  1.1  christos 	 * original word must be checked when the expression evaluates to
     83  1.1  christos 	 * a non-zero value.  However, because it is simpler than those
     84  1.1  christos 	 * presented above, code that uses it will be faster as long as
     85  1.1  christos 	 * the rate of false positives is low.
     86  1.1  christos 	 *
     87  1.1  christos 	 * This is likely, because the the false positive can only occur
     88  1.1  christos 	 * if the most siginificant bit of a byte within the word is set.
     89  1.1  christos 	 * The expression will never fail for typical 7-bit ASCII strings.
     90  1.1  christos 	 *
     91  1.1  christos 	 * This typically takes 2 instructions not including those needed
     92  1.1  christos 	 * to load constants.
     93  1.1  christos 	 *
     94  1.1  christos 	 *
     95  1.1  christos 	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
     96  1.1  christos 	 *
     97  1.1  christos 	 * [2] International Business Machines, "The PowerPC Compiler Writer's
     98  1.1  christos 	 *     Guide", Warthman Associates, 1996
     99  1.1  christos 	 */
    100  1.1  christos 
    101  1.1  christos 	_ALIGN_TEXT
    102  1.1  christos .Lword_aligned:
    103  1.1  christos 	movabsq	$0x0101010101010101,%r8
    104  1.1  christos 	movabsq	$0x8080808080808080,%r9
    105  1.1  christos .Lloop:
    106  1.1  christos 	movq	(%rax),%rdx
    107  1.1  christos 	addq	$8,%rax
    108  1.1  christos 	subq	%r8,%rdx
    109  1.1  christos 	testq	%r9,%rdx
    110  1.1  christos 	je	.Lloop
    111  1.1  christos 
    112  1.1  christos 	/*
    113  1.1  christos 	 * In rare cases, the above loop may exit prematurely. We must
    114  1.1  christos 	 * return to the loop if none of the bytes in the word equal 0.
    115  1.1  christos 	 */
    116  1.1  christos 	cmpb	$0,-8(%rax)		/* 1st byte == 0? */
    117  1.1  christos 	je	.Lsub8
    118  1.1  christos 	cmpb	$0,-7(%rax)		/* 2nd byte == 0? */
    119  1.1  christos 	je	.Lsub7
    120  1.1  christos 	cmpb	$0,-6(%rax)		/* 3rd byte == 0? */
    121  1.1  christos 	je	.Lsub6
    122  1.1  christos 	cmpb	$0,-5(%rax)		/* 4th byte == 0? */
    123  1.1  christos 	je	.Lsub5
    124  1.1  christos 	cmpb	$0,-4(%rax)		/* 5th byte == 0? */
    125  1.1  christos 	je	.Lsub4
    126  1.1  christos 	cmpb	$0,-3(%rax)		/* 6th byte == 0? */
    127  1.1  christos 	je	.Lsub3
    128  1.1  christos 	cmpb	$0,-2(%rax)		/* 7th byte == 0? */
    129  1.1  christos 	je	.Lsub2
    130  1.1  christos 	cmpb	$0,-1(%rax)		/* 8th byte == 0? */
    131  1.1  christos 	jne	.Lloop
    132  1.1  christos 
    133  1.1  christos .Lsub1:
    134  1.1  christos 	leaq	-1(%rdi,%rax),%rax
    135  1.1  christos 	ret
    136  1.1  christos .Lsub2:
    137  1.1  christos 	leaq	-2(%rdi,%rax),%rax
    138  1.1  christos 	ret
    139  1.1  christos .Lsub3:
    140  1.1  christos 	leaq	-3(%rdi,%rax),%rax
    141  1.1  christos 	ret
    142  1.1  christos .Lsub4:
    143  1.1  christos 	leaq	-4(%rdi,%rax),%rax
    144  1.1  christos 	ret
    145  1.1  christos .Lsub5:
    146  1.1  christos 	leaq	-5(%rdi,%rax),%rax
    147  1.1  christos 	ret
    148  1.1  christos .Lsub6:
    149  1.1  christos 	leaq	-6(%rdi,%rax),%rax
    150  1.1  christos 	ret
    151  1.1  christos .Lsub7:
    152  1.1  christos 	leaq	-7(%rdi,%rax),%rax
    153  1.1  christos 	ret
    154  1.1  christos .Lsub8:
    155  1.1  christos 	leaq	-8(%rdi,%rax),%rax
    156  1.1  christos 	ret
    157