/*
 * SPARC assembly routines that transform, rescale and normalize arrays
 * of normal vectors.  Structure offsets, the LDPTR pointer load, the
 * LDMATRIX_* loaders and the M0..M10 register names all come from
 * sparc_matrix.h.
 */
#include "sparc_matrix.h"

	/* %g2 and %g3 are used purely as temporaries in this file. */
	.register %g2, #scratch
	.register %g3, #scratch

	.text

	/*
	 * Offset from %sp of the scratch slot used to move a 32-bit value
	 * from an integer register into a floating point register.  It
	 * skips the 16-register save area at the bottom of the frame:
	 * 16 four-byte slots on 32-bit targets, 16 eight-byte slots plus
	 * the 2047-byte stack bias on 64-bit targets.
	 */
#ifndef __arch64__
#define STACK_VAR_OFF	(4 * 16)
#else
#define STACK_VAR_OFF	(2047 + (8 * 16))
#endif

	/*
	 * Bit pattern of the single precision constant 1.0f.  It is the
	 * dividend for the 1.0 / sqrt(len) computation, which is done
	 * with fsqrts + fdivs: a Newton-Raphson approximation turned out
	 * to be slower (and less accurate) than the direct instructions.
	 */
#define ONE_DOT_ZERO	0x3f800000

/*
 * _mesa_sparc_transform_normalize_normals
 *
 * Multiplies every input normal (ux, uy, uz) by elements 0,1,2 / 4,5,6 /
 * 8,9,10 of mat->inv and then scales the result:
 *   lengths == NULL: scale by 1.0 / sqrt(tx*tx + ty*ty + tz*tz)
 *   lengths != NULL: the matrix is multiplied by 'scale' once up front
 *                    and each result is multiplied by lengths[i]
 *
 * In:       %o0 = mat, %o1 = scale (float bit pattern), %o2 = in,
 *           %o3 = lengths, %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o3, %o4, %o5, %g1, %g2, %g3, %f0-%f8, %f10, %f12,
 *           %f13, %f15, the registers filled by LDMATRIX_*, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * The floating point sequences are hand scheduled (multiplies
 * interleaved with adds); keep their order when editing.
 * A zero squared length is not special-cased in the lengths == NULL path.
 * NOTE(review): scale is read from %o1; under the 64-bit SPARC calling
 * convention a prototyped float argument would normally arrive in an FP
 * register instead -- confirm for __arch64__ builds.
 */
	.globl	_mesa_sparc_transform_normalize_normals
_mesa_sparc_transform_normalize_normals:
	/* Integer -> FP register moves go through memory: borrow 16 bytes
	 * of stack just long enough to store and reload both values.
	 */
	sub	%sp, 16, %sp
	sethi	%hi(ONE_DOT_ZERO), %g2		! low 10 bits of the constant are zero
	st	%o1, [%sp + STACK_VAR_OFF + 4]
	st	%g2, [%sp + STACK_VAR_OFF + 0]
	ld	[%sp + STACK_VAR_OFF + 4], %f15	! f15 = scale
	ld	[%sp + STACK_VAR_OFF + 0], %f12	! f12 = 1.0f
	add	%sp, 16, %sp

	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltnn_done			! count < 1: nothing to transform
	 tst	%o3				! delay slot: is lengths NULL?
	bne	.Ltnn_with_lengths
	 clr	%o4				! delay slot: i = 0

.Ltnn_unit_loop:
	/* lengths == NULL: compute 1/sqrt for every normal */
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	/* 3x3 multiply; results land in f3 (tx), f5 (ty), f7 (tz) */
	fmuls	%f0, M0, %f3			! f3  = ux*m0
	fmuls	%f1, M1, %f4			! f4  = uy*m1
	fmuls	%f0, M4, %f5			! f5  = ux*m4
	fmuls	%f1, M5, %f6			! f6  = uy*m5
	fmuls	%f0, M8, %f7			! f7  = ux*m8
	fmuls	%f1, M9, %f8			! f8  = uy*m9
	fadds	%f3, %f4, %f3			! f3  = ux*m0 + uy*m1
	fmuls	%f2, M2, %f10			! f10 = uz*m2
	fmuls	%f2, M6, %f0			! f0  = uz*m6, ux is dead by now
	fadds	%f5, %f6, %f5			! f5  = ux*m4 + uy*m5
	fmuls	%f2, M10, %f4			! f4  = uz*m10, f4 is reused
	fadds	%f7, %f8, %f7			! f7  = ux*m8 + uy*m9
	fadds	%f3, %f10, %f3			! tx complete
	fadds	%f5, %f0, %f5			! ty complete
	fadds	%f7, %f4, %f7			! tz complete

	/* squared length of (tx, ty, tz) accumulates in f6 */
	fmuls	%f3, %f3, %f6			! f6  = tx*tx
	fmuls	%f5, %f5, %f8			! f8  = ty*ty
	fmuls	%f7, %f7, %f10			! f10 = tz*tz
	fadds	%f6, %f8, %f6
	fadds	%f6, %f10, %f6			! f6  = tx*tx + ty*ty + tz*tz

	fsqrts	%f6, %f6			! f6 = sqrt(squared length)
	fdivs	%f12, %f6, %f6			! f6 = 1.0 / sqrt(squared length)

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3]			! out x
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 4]			! out y
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltnn_unit_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

	retl
	 nop

.Ltnn_with_lengths:
	/* lengths != NULL: fold 'scale' into the nine matrix elements once */
	fmuls	M0, %f15, M0
	fmuls	M1, %f15, M1
	fmuls	M2, %f15, M2
	fmuls	M4, %f15, M4
	fmuls	M5, %f15, M5
	fmuls	M6, %f15, M6
	fmuls	M8, %f15, M8
	fmuls	M9, %f15, M9
	fmuls	M10, %f15, M10

.Ltnn_lengths_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	/* same 3x3 multiply as above, with the lengths[i] fetch slotted in */
	fmuls	%f0, M0, %f3			! f3  = ux*m0
	fmuls	%f1, M1, %f4			! f4  = uy*m1
	fmuls	%f0, M4, %f5			! f5  = ux*m4
	fmuls	%f1, M5, %f6			! f6  = uy*m5
	fmuls	%f0, M8, %f7			! f7  = ux*m8
	fmuls	%f1, M9, %f8			! f8  = uy*m9
	fadds	%f3, %f4, %f3			! f3  = ux*m0 + uy*m1
	fmuls	%f2, M2, %f10			! f10 = uz*m2
	fmuls	%f2, M6, %f0			! f0  = uz*m6, ux is dead by now
	fadds	%f5, %f6, %f5			! f5  = ux*m4 + uy*m5
	fmuls	%f2, M10, %f4			! f4  = uz*m10, f4 is reused
	fadds	%f7, %f8, %f7			! f7  = ux*m8 + uy*m9
	fadds	%f3, %f10, %f3			! tx complete
	ld	[%o3], %f13			! f13 = lengths[i]
	fadds	%f5, %f0, %f5			! ty complete
	add	%o3, 4, %o3			! step the lengths cursor
	fadds	%f7, %f4, %f7			! tz complete

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3]			! out x = tx * lengths[i]
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 4]			! out y = ty * lengths[i]
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 8]			! out z = tz * lengths[i]

	cmp	%o4, %g1
	bl	.Ltnn_lengths_loop		! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltnn_done:
	retl
	 nop

/*
 * _mesa_sparc_transform_normalize_normals_no_rot
 *
 * Variant of the transform + normalize routine that only uses the
 * diagonal elements 0, 5 and 10 of mat->inv:
 *   tx = ux*m0, ty = uy*m5, tz = uz*m10
 * followed by
 *   lengths == NULL: scale by 1.0 / sqrt(tx*tx + ty*ty + tz*tz)
 *   lengths != NULL: the three matrix elements are multiplied by 'scale'
 *                    once and each result is multiplied by lengths[i]
 *
 * In:       %o0 = mat, %o1 = scale (float bit pattern), %o2 = in,
 *           %o3 = lengths, %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o3, %o4, %o5, %g1, %g2, %g3, %f0-%f3, %f5-%f8, %f10,
 *           %f12, %f13, %f15, the registers filled by LDMATRIX_0_5_10,
 *           and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * A zero squared length is not special-cased in the lengths == NULL path.
 * NOTE(review): scale is read from %o1; under the 64-bit SPARC calling
 * convention a prototyped float argument would normally arrive in an FP
 * register instead -- confirm for __arch64__ builds.
 */
	.globl	_mesa_sparc_transform_normalize_normals_no_rot
_mesa_sparc_transform_normalize_normals_no_rot:
	/* Move 1.0f and scale into FP registers via a borrowed stack slot. */
	sub	%sp, 16, %sp
	sethi	%hi(ONE_DOT_ZERO), %g2		! low 10 bits of the constant are zero
	st	%o1, [%sp + STACK_VAR_OFF + 4]
	st	%g2, [%sp + STACK_VAR_OFF + 0]
	ld	[%sp + STACK_VAR_OFF + 4], %f15	! f15 = scale
	ld	[%sp + STACK_VAR_OFF + 0], %f12	! f12 = 1.0f
	add	%sp, 16, %sp

	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_5_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltnn_nr_done			! count < 1: nothing to transform
	 tst	%o3				! delay slot: is lengths NULL?
	bne	.Ltnn_nr_with_lengths
	 clr	%o4				! delay slot: i = 0

.Ltnn_nr_unit_loop:
	/* lengths == NULL: compute 1/sqrt for every normal */
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! tx = ux*m0
	fmuls	%f1, M5, %f5			! ty = uy*m5
	fmuls	%f2, M10, %f7			! tz = uz*m10

	/* squared length of (tx, ty, tz) accumulates in f6 */
	fmuls	%f3, %f3, %f6			! f6  = tx*tx
	fmuls	%f5, %f5, %f8			! f8  = ty*ty
	fmuls	%f7, %f7, %f10			! f10 = tz*tz
	fadds	%f6, %f8, %f6
	fadds	%f6, %f10, %f6			! f6  = tx*tx + ty*ty + tz*tz

	fsqrts	%f6, %f6			! f6 = sqrt(squared length)
	fdivs	%f12, %f6, %f6			! f6 = 1.0 / sqrt(squared length)

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3]			! out x
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 4]			! out y
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltnn_nr_unit_loop		! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

	retl
	 nop

.Ltnn_nr_with_lengths:
	/* lengths != NULL: fold 'scale' into the diagonal once */
	fmuls	M0, %f15, M0
	fmuls	M5, %f15, M5
	fmuls	M10, %f15, M10

.Ltnn_nr_lengths_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! tx = ux*m0
	ld	[%o3], %f13			! f13 = lengths[i]
	fmuls	%f1, M5, %f5			! ty = uy*m5
	add	%o3, 4, %o3			! step the lengths cursor
	fmuls	%f2, M10, %f7			! tz = uz*m10

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3]			! out x = tx * lengths[i]
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 4]			! out y = ty * lengths[i]
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 8]			! out z = tz * lengths[i]

	cmp	%o4, %g1
	bl	.Ltnn_nr_lengths_loop		! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltnn_nr_done:
	retl
	 nop

/*
 * _mesa_sparc_transform_rescale_normals_no_rot
 *
 * Multiplies elements 0, 5 and 10 of mat->inv by 'scale' once, then for
 * every input normal writes
 *   out = (ux*m0, uy*m5, uz*m10)
 * using the pre-scaled elements.
 *
 * In:       %o0 = mat, %o1 = scale (float bit pattern), %o2 = in,
 *           %o3 = lengths (not read here), %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o4, %o5, %g1, %g2, %g3, %f0-%f3, %f5, %f7, %f15, the
 *           registers filled by LDMATRIX_0_5_10, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * NOTE(review): scale is read from %o1; under the 64-bit SPARC calling
 * convention a prototyped float argument would normally arrive in an FP
 * register instead -- confirm for __arch64__ builds.
 */
	.globl	_mesa_sparc_transform_rescale_normals_no_rot
_mesa_sparc_transform_rescale_normals_no_rot:
	/* Move scale into an FP register via a borrowed stack slot. */
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF + 0]
	ld	[%sp + STACK_VAR_OFF + 0], %f15	! f15 = scale
	add	%sp, 16, %sp

	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_5_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltrn_nr_done			! count < 1: nothing to transform
	 clr	%o4				! delay slot: i = 0

	/* fold 'scale' into the diagonal once, outside the loop */
	fmuls	M0, %f15, M0
	fmuls	M5, %f15, M5
	fmuls	M10, %f15, M10

.Ltrn_nr_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! tx = ux*m0
	st	%f3, [%g3]			! out x
	fmuls	%f1, M5, %f5			! ty = uy*m5
	st	%f5, [%g3 + 4]			! out y
	fmuls	%f2, M10, %f7			! tz = uz*m10
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltrn_nr_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltrn_nr_done:
	retl
	 nop

/*
 * _mesa_sparc_transform_rescale_normals
 *
 * Multiplies elements 0,1,2 / 4,5,6 / 8,9,10 of mat->inv by 'scale'
 * once, then for every input normal writes
 *   tx = ux*m0 + uy*m1 + uz*m2
 *   ty = ux*m4 + uy*m5 + uz*m6
 *   tz = ux*m8 + uy*m9 + uz*m10
 * using the pre-scaled elements.
 *
 * In:       %o0 = mat, %o1 = scale (float bit pattern), %o2 = in,
 *           %o3 = lengths (not read here), %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o4, %o5, %g1, %g2, %g3, %f0-%f8, %f10, %f15, the
 *           registers filled by LDMATRIX_*, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * The floating point sequence is hand scheduled (multiplies interleaved
 * with adds and stores); keep its order when editing.
 * NOTE(review): scale is read from %o1; under the 64-bit SPARC calling
 * convention a prototyped float argument would normally arrive in an FP
 * register instead -- confirm for __arch64__ builds.
 */
	.globl	_mesa_sparc_transform_rescale_normals
_mesa_sparc_transform_rescale_normals:
	/* Move scale into an FP register via a borrowed stack slot. */
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF + 0]
	ld	[%sp + STACK_VAR_OFF + 0], %f15	! f15 = scale
	add	%sp, 16, %sp

	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltrn_done			! count < 1: nothing to transform
	 clr	%o4				! delay slot: i = 0

	/* fold 'scale' into the nine matrix elements once, outside the loop */
	fmuls	M0, %f15, M0
	fmuls	M1, %f15, M1
	fmuls	M2, %f15, M2
	fmuls	M4, %f15, M4
	fmuls	M5, %f15, M5
	fmuls	M6, %f15, M6
	fmuls	M8, %f15, M8
	fmuls	M9, %f15, M9
	fmuls	M10, %f15, M10

.Ltrn_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! f3  = ux*m0
	fmuls	%f1, M1, %f4			! f4  = uy*m1
	fmuls	%f0, M4, %f5			! f5  = ux*m4
	fmuls	%f1, M5, %f6			! f6  = uy*m5
	fmuls	%f0, M8, %f7			! f7  = ux*m8
	fmuls	%f1, M9, %f8			! f8  = uy*m9
	fadds	%f3, %f4, %f3			! f3  = ux*m0 + uy*m1
	fmuls	%f2, M2, %f10			! f10 = uz*m2
	fmuls	%f2, M6, %f0			! f0  = uz*m6, ux is dead by now
	fadds	%f5, %f6, %f5			! f5  = ux*m4 + uy*m5
	fmuls	%f2, M10, %f4			! f4  = uz*m10, f4 is reused
	fadds	%f7, %f8, %f7			! f7  = ux*m8 + uy*m9
	fadds	%f3, %f10, %f3			! tx complete
	st	%f3, [%g3]			! out x
	fadds	%f5, %f0, %f5			! ty complete
	st	%f5, [%g3 + 4]			! out y
	fadds	%f7, %f4, %f7			! tz complete
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltrn_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltrn_done:
	retl
	 nop

/*
 * _mesa_sparc_transform_normals_no_rot
 *
 * For every input normal writes
 *   out = (ux*m0, uy*m5, uz*m10)
 * where m0, m5 and m10 are elements 0, 5 and 10 of mat->inv.  No
 * rescaling or normalization is applied.
 *
 * In:       %o0 = mat, %o1 = scale (not read here), %o2 = in,
 *           %o3 = lengths (not read here), %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o4, %o5, %g1, %g2, %g3, %f0-%f3, %f5, %f7, the
 *           registers filled by LDMATRIX_0_5_10, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 */
	.globl	_mesa_sparc_transform_normals_no_rot
_mesa_sparc_transform_normals_no_rot:
	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_5_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltn_nr_done			! count < 1: nothing to transform
	 clr	%o4				! delay slot: i = 0

.Ltn_nr_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! tx = ux*m0
	st	%f3, [%g3]			! out x
	fmuls	%f1, M5, %f5			! ty = uy*m5
	st	%f5, [%g3 + 4]			! out y
	fmuls	%f2, M10, %f7			! tz = uz*m10
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltn_nr_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltn_nr_done:
	retl
	 nop

/*
 * _mesa_sparc_transform_normals
 *
 * For every input normal writes
 *   tx = ux*m0 + uy*m1 + uz*m2
 *   ty = ux*m4 + uy*m5 + uz*m6
 *   tz = ux*m8 + uy*m9 + uz*m10
 * where m0..m10 are the corresponding elements of mat->inv.  No
 * rescaling or normalization is applied.
 *
 * In:       %o0 = mat, %o1 = scale (not read here), %o2 = in,
 *           %o3 = lengths (not read here), %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o0, %o4, %o5, %g1, %g2, %g3, %f0-%f8, %f10, the registers
 *           filled by LDMATRIX_*, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * The floating point sequence is hand scheduled (multiplies interleaved
 * with adds and stores); keep its order when editing.
 */
	.globl	_mesa_sparc_transform_normals
_mesa_sparc_transform_normals:
	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor
	add	%o0, MATRIX_INV, %o0		! o0 = address of the inverse matrix

	LDMATRIX_0_1_2_4_5_6_8_9_10(%o0)

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Ltn_done			! count < 1: nothing to transform
	 clr	%o4				! delay slot: i = 0

.Ltn_loop:
	ld	[%o5], %f0			! f0 = ux
	ld	[%o5 + 4], %f1			! f1 = uy
	ld	[%o5 + 8], %f2			! f2 = uz
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	fmuls	%f0, M0, %f3			! f3  = ux*m0
	fmuls	%f1, M1, %f4			! f4  = uy*m1
	fmuls	%f0, M4, %f5			! f5  = ux*m4
	fmuls	%f1, M5, %f6			! f6  = uy*m5
	fmuls	%f0, M8, %f7			! f7  = ux*m8
	fmuls	%f1, M9, %f8			! f8  = uy*m9
	fadds	%f3, %f4, %f3			! f3  = ux*m0 + uy*m1
	fmuls	%f2, M2, %f10			! f10 = uz*m2
	fmuls	%f2, M6, %f0			! f0  = uz*m6, ux is dead by now
	fadds	%f5, %f6, %f5			! f5  = ux*m4 + uy*m5
	fmuls	%f2, M10, %f4			! f4  = uz*m10, f4 is reused
	fadds	%f7, %f8, %f7			! f7  = ux*m8 + uy*m9
	fadds	%f3, %f10, %f3			! tx complete
	st	%f3, [%g3]			! out x
	fadds	%f5, %f0, %f5			! ty complete
	st	%f5, [%g3 + 4]			! out y
	fadds	%f7, %f4, %f7			! tz complete
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Ltn_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Ltn_done:
	retl
	 nop

/*
 * _mesa_sparc_normalize_normals
 *
 * Scales every input normal (x, y, z) without any matrix multiply:
 *   lengths == NULL: scale by 1.0 / sqrt(x*x + y*y + z*z)
 *   lengths != NULL: scale by lengths[i]
 *
 * In:       %o0 = mat (not read here), %o1 = scale (not read here),
 *           %o2 = in, %o3 = lengths, %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o3, %o4, %o5, %g1, %g2, %g3, %f3, %f5-%f8, %f10, %f12,
 *           %f13, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * A zero squared length is not special-cased in the lengths == NULL path.
 */
	.globl	_mesa_sparc_normalize_normals
_mesa_sparc_normalize_normals:
	/* Move 1.0f into an FP register via a borrowed stack slot. */
	sub	%sp, 16, %sp
	sethi	%hi(ONE_DOT_ZERO), %g2		! low 10 bits of the constant are zero
	st	%g2, [%sp + STACK_VAR_OFF + 0]
	ld	[%sp + STACK_VAR_OFF + 0], %f12	! f12 = 1.0f
	add	%sp, 16, %sp

	ld	[%o2 + V4F_COUNT], %g1		! g1 = number of normals
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = input stride in bytes
	LDPTR	[%o2 + V4F_START], %o5		! o5 = input cursor
	LDPTR	[%o4 + V4F_START], %g3		! g3 = output cursor

	st	%g1, [%o4 + V4F_COUNT]		! publish the count to dest

	cmp	%g1, 1
	bl	.Lnn_done			! count < 1: nothing to do
	 tst	%o3				! delay slot: is lengths NULL?
	bne	.Lnn_lengths_loop
	 clr	%o4				! delay slot: i = 0

.Lnn_unit_loop:
	/* lengths == NULL: compute 1/sqrt for every normal */
	ld	[%o5], %f3			! f3 = x
	ld	[%o5 + 4], %f5			! f5 = y
	ld	[%o5 + 8], %f7			! f7 = z
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor

	/* squared length of (x, y, z) accumulates in f6 */
	fmuls	%f3, %f3, %f6			! f6  = x*x
	fmuls	%f5, %f5, %f8			! f8  = y*y
	fmuls	%f7, %f7, %f10			! f10 = z*z
	fadds	%f6, %f8, %f6
	fadds	%f6, %f10, %f6			! f6  = x*x + y*y + z*z

	fsqrts	%f6, %f6			! f6 = sqrt(squared length)
	fdivs	%f12, %f6, %f6			! f6 = 1.0 / sqrt(squared length)

	fmuls	%f3, %f6, %f3
	st	%f3, [%g3]			! out x
	fmuls	%f5, %f6, %f5
	st	%f5, [%g3 + 4]			! out y
	fmuls	%f7, %f6, %f7
	st	%f7, [%g3 + 8]			! out z

	cmp	%o4, %g1
	bl	.Lnn_unit_loop			! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

	retl
	 nop

.Lnn_lengths_loop:
	/* lengths != NULL: multiply each normal by its lengths[] entry */
	ld	[%o5], %f3			! f3 = x
	ld	[%o5 + 4], %f5			! f5 = y
	ld	[%o5 + 8], %f7			! f7 = z
	ld	[%o3], %f13			! f13 = lengths[i]
	add	%o4, 1, %o4			! ++i
	add	%o5, %g2, %o5			! step the input cursor
	add	%o3, 4, %o3			! step the lengths cursor

	fmuls	%f3, %f13, %f3
	st	%f3, [%g3]			! out x = x * lengths[i]
	fmuls	%f5, %f13, %f5
	st	%f5, [%g3 + 4]			! out y = y * lengths[i]
	fmuls	%f7, %f13, %f7
	st	%f7, [%g3 + 8]			! out z = z * lengths[i]

	cmp	%o4, %g1
	bl	.Lnn_lengths_loop		! loop while i < count
	 add	%g3, 16, %g3			! delay slot: next output slot

.Lnn_done:
	retl
	 nop

/*
 * _mesa_sparc_rescale_normals
 *
 * Multiplies every input normal (x, y, z) by 'scale'; no matrix multiply
 * and no normalization.
 *
 * In:       %o0 = mat (not read here), %o1 = scale (float bit pattern),
 *           %o2 = in, %o3 = lengths (not read here), %o4 = dest
 * Effects:  dest->count = in->count; three floats are written per normal
 *           at a 16-byte pitch starting at dest->start.  Nothing is
 *           written to the output array when in->count < 1.
 * Clobbers: %o4, %o5, %g1, %g2, %g3, %f3, %f5, %f7, %f15, and icc.
 *
 * Leaf routine: returns with retl and never executes save.
 * This routine needs no 1.0f constant, so ONE_DOT_ZERO is not loaded
 * here (%g2 only ever holds the input stride).
 * NOTE(review): scale is read from %o1; under the 64-bit SPARC calling
 * convention a prototyped float argument would normally arrive in an FP
 * register instead -- confirm for __arch64__ builds.
 */
	.globl	_mesa_sparc_rescale_normals
_mesa_sparc_rescale_normals:
	/* o0=mat o1=scale o2=in o3=lengths o4=dest */

	/* Move scale into an FP register via a borrowed stack slot. */
	sub	%sp, 16, %sp
	st	%o1, [%sp + STACK_VAR_OFF+0x0]
	ld	[%sp + STACK_VAR_OFF+0x0], %f15	! f15 = scale
	add	%sp, 16, %sp

	LDPTR	[%o2 + V4F_START], %o5		! o5 = 'from' in->start
	ld	[%o2 + V4F_COUNT], %g1		! g1 = in->count
	ld	[%o2 + V4F_STRIDE], %g2		! g2 = in->stride
	LDPTR	[%o4 + V4F_START], %g3		! g3 = 'out' dest->start

	/* dest->count = in->count */
	st	%g1, [%o4 + V4F_COUNT]

	cmp	%g1, 1
	bl	7f				! count < 1: nothing to rescale
	 clr	%o4				! 'i' for STRIDE_LOOP

1:
	ld	[%o5 + 0x00], %f3		! ux = from[0]
	ld	[%o5 + 0x04], %f5		! uy = from[1]
	ld	[%o5 + 0x08], %f7		! uz = from[2]
	add	%o5, %g2, %o5			! STRIDE_F(from, stride)
	add	%o4, 1, %o4			! i++

	/* f3=tx, f5=ty, f7=tz */

	fmuls	%f3, %f15, %f3
	st	%f3, [%g3 + 0x00]		! out[i][0] = tx * scale
	fmuls	%f5, %f15, %f5
	st	%f5, [%g3 + 0x04]		! out[i][1] = ty * scale
	fmuls	%f7, %f15, %f7
	st	%f7, [%g3 + 0x08]		! out[i][2] = tz * scale

	cmp	%o4, %g1			! continue if (i < count)
	bl	1b
	 add	%g3, 0x10, %g3			! advance out vector pointer

7:	retl
	 nop