stan.sa revision 1.1.1.1 1 * MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP
2 * M68000 Hi-Performance Microprocessor Division
3 * M68040 Software Package
4 *
5 * M68040 Software Package Copyright (c) 1993, 1994 Motorola Inc.
6 * All rights reserved.
7 *
8 * THE SOFTWARE is provided on an "AS IS" basis and without warranty.
9 * To the maximum extent permitted by applicable law,
10 * MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED,
11 * INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
12 * PARTICULAR PURPOSE and any warranty against infringement with
13 * regard to the SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF)
14 * and any accompanying written materials.
15 *
16 * To the maximum extent permitted by applicable law,
17 * IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
18 * (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS
19 * PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR
20 * OTHER PECUNIARY LOSS) ARISING OF THE USE OR INABILITY TO USE THE
21 * SOFTWARE. Motorola assumes no responsibility for the maintenance
22 * and support of the SOFTWARE.
23 *
24 * You are hereby granted a copyright license to use, modify, and
25 * distribute the SOFTWARE so long as this entire notice is retained
26 * without alteration in any modified and/or redistributed versions,
27 * and that such modified versions are clearly identified as such.
28 * No licenses are granted by implication, estoppel or otherwise
29 * under any patents or trademarks of Motorola, Inc.
30
31 *
32 * stan.sa 3.3 7/29/91
33 *
34 * The entry point stan computes the tangent of
35 * an input argument;
36 * stand does the same except for denormalized input.
37 *
38 * Input: Double-extended number X in location pointed to
39 * by address register a0.
40 *
41 * Output: The value tan(X) returned in floating-point register Fp0.
42 *
43 * Accuracy and Monotonicity: The returned result is within 3 ulp in
44 * 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
45 * result is subsequently rounded to double precision. The
46 * result is provably monotonic in double precision.
47 *
48 * Speed: The program sTAN takes approximately 170 cycles for
49 * input argument X such that |X| < 15Pi, which is the the usual
50 * situation.
51 *
52 * Algorithm:
53 *
54 * 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
55 *
56 * 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
57 * k = N mod 2, so in particular, k = 0 or 1.
58 *
59 * 3. If k is odd, go to 5.
60 *
61 * 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
62 * rational function U/V where
63 * U = r + r*s*(P1 + s*(P2 + s*P3)), and
64 * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r.
65 * Exit.
66 *
67 * 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
68 * rational function U/V where
69 * U = r + r*s*(P1 + s*(P2 + s*P3)), and
70 * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
71 * -Cot(r) = -V/U. Exit.
72 *
73 * 6. If |X| > 1, go to 8.
74 *
75 * 7. (|X|<2**(-40)) Tan(X) = X. Exit.
76 *
77 * 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
78 *
79
80 STAN IDNT 2,1 Motorola 040 Floating Point Software Package
81
82 section 8
83
84 include fpsp.h
85
86 BOUNDS1 DC.L $3FD78000,$4004BC7E
87 TWOBYPI DC.L $3FE45F30,$6DC9C883
88
89 TANQ4 DC.L $3EA0B759,$F50F8688
90 TANP3 DC.L $BEF2BAA5,$A8924F04
91
92 TANQ3 DC.L $BF346F59,$B39BA65F,$00000000,$00000000
93
94 TANP2 DC.L $3FF60000,$E073D3FC,$199C4A00,$00000000
95
96 TANQ2 DC.L $3FF90000,$D23CD684,$15D95FA1,$00000000
97
98 TANP1 DC.L $BFFC0000,$8895A6C5,$FB423BCA,$00000000
99
100 TANQ1 DC.L $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000
101
102 INVTWOPI DC.L $3FFC0000,$A2F9836E,$4E44152A,$00000000
103
104 TWOPI1 DC.L $40010000,$C90FDAA2,$00000000,$00000000
105 TWOPI2 DC.L $3FDF0000,$85A308D4,$00000000,$00000000
106
107 *--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
108 *--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
109 *--MOST 69 BITS LONG.
110 xdef PITBL
111 PITBL:
112 DC.L $C0040000,$C90FDAA2,$2168C235,$21800000
113 DC.L $C0040000,$C2C75BCD,$105D7C23,$A0D00000
114 DC.L $C0040000,$BC7EDCF7,$FF523611,$A1E80000
115 DC.L $C0040000,$B6365E22,$EE46F000,$21480000
116 DC.L $C0040000,$AFEDDF4D,$DD3BA9EE,$A1200000
117 DC.L $C0040000,$A9A56078,$CC3063DD,$21FC0000
118 DC.L $C0040000,$A35CE1A3,$BB251DCB,$21100000
119 DC.L $C0040000,$9D1462CE,$AA19D7B9,$A1580000
120 DC.L $C0040000,$96CBE3F9,$990E91A8,$21E00000
121 DC.L $C0040000,$90836524,$88034B96,$20B00000
122 DC.L $C0040000,$8A3AE64F,$76F80584,$A1880000
123 DC.L $C0040000,$83F2677A,$65ECBF73,$21C40000
124 DC.L $C0030000,$FB53D14A,$A9C2F2C2,$20000000
125 DC.L $C0030000,$EEC2D3A0,$87AC669F,$21380000
126 DC.L $C0030000,$E231D5F6,$6595DA7B,$A1300000
127 DC.L $C0030000,$D5A0D84C,$437F4E58,$9FC00000
128 DC.L $C0030000,$C90FDAA2,$2168C235,$21000000
129 DC.L $C0030000,$BC7EDCF7,$FF523611,$A1680000
130 DC.L $C0030000,$AFEDDF4D,$DD3BA9EE,$A0A00000
131 DC.L $C0030000,$A35CE1A3,$BB251DCB,$20900000
132 DC.L $C0030000,$96CBE3F9,$990E91A8,$21600000
133 DC.L $C0030000,$8A3AE64F,$76F80584,$A1080000
134 DC.L $C0020000,$FB53D14A,$A9C2F2C2,$1F800000
135 DC.L $C0020000,$E231D5F6,$6595DA7B,$A0B00000
136 DC.L $C0020000,$C90FDAA2,$2168C235,$20800000
137 DC.L $C0020000,$AFEDDF4D,$DD3BA9EE,$A0200000
138 DC.L $C0020000,$96CBE3F9,$990E91A8,$20E00000
139 DC.L $C0010000,$FB53D14A,$A9C2F2C2,$1F000000
140 DC.L $C0010000,$C90FDAA2,$2168C235,$20000000
141 DC.L $C0010000,$96CBE3F9,$990E91A8,$20600000
142 DC.L $C0000000,$C90FDAA2,$2168C235,$1F800000
143 DC.L $BFFF0000,$C90FDAA2,$2168C235,$1F000000
144 DC.L $00000000,$00000000,$00000000,$00000000
145 DC.L $3FFF0000,$C90FDAA2,$2168C235,$9F000000
146 DC.L $40000000,$C90FDAA2,$2168C235,$9F800000
147 DC.L $40010000,$96CBE3F9,$990E91A8,$A0600000
148 DC.L $40010000,$C90FDAA2,$2168C235,$A0000000
149 DC.L $40010000,$FB53D14A,$A9C2F2C2,$9F000000
150 DC.L $40020000,$96CBE3F9,$990E91A8,$A0E00000
151 DC.L $40020000,$AFEDDF4D,$DD3BA9EE,$20200000
152 DC.L $40020000,$C90FDAA2,$2168C235,$A0800000
153 DC.L $40020000,$E231D5F6,$6595DA7B,$20B00000
154 DC.L $40020000,$FB53D14A,$A9C2F2C2,$9F800000
155 DC.L $40030000,$8A3AE64F,$76F80584,$21080000
156 DC.L $40030000,$96CBE3F9,$990E91A8,$A1600000
157 DC.L $40030000,$A35CE1A3,$BB251DCB,$A0900000
158 DC.L $40030000,$AFEDDF4D,$DD3BA9EE,$20A00000
159 DC.L $40030000,$BC7EDCF7,$FF523611,$21680000
160 DC.L $40030000,$C90FDAA2,$2168C235,$A1000000
161 DC.L $40030000,$D5A0D84C,$437F4E58,$1FC00000
162 DC.L $40030000,$E231D5F6,$6595DA7B,$21300000
163 DC.L $40030000,$EEC2D3A0,$87AC669F,$A1380000
164 DC.L $40030000,$FB53D14A,$A9C2F2C2,$A0000000
165 DC.L $40040000,$83F2677A,$65ECBF73,$A1C40000
166 DC.L $40040000,$8A3AE64F,$76F80584,$21880000
167 DC.L $40040000,$90836524,$88034B96,$A0B00000
168 DC.L $40040000,$96CBE3F9,$990E91A8,$A1E00000
169 DC.L $40040000,$9D1462CE,$AA19D7B9,$21580000
170 DC.L $40040000,$A35CE1A3,$BB251DCB,$A1100000
171 DC.L $40040000,$A9A56078,$CC3063DD,$A1FC0000
172 DC.L $40040000,$AFEDDF4D,$DD3BA9EE,$21200000
173 DC.L $40040000,$B6365E22,$EE46F000,$A1480000
174 DC.L $40040000,$BC7EDCF7,$FF523611,$21E80000
175 DC.L $40040000,$C2C75BCD,$105D7C23,$20D00000
176 DC.L $40040000,$C90FDAA2,$2168C235,$A1800000
177
178 INARG equ FP_SCR4
179
180 TWOTO63 equ L_SCR1
181 ENDFLAG equ L_SCR2
182 N equ L_SCR3
183
184 xref t_frcinx
185 xref t_extdnrm
186
187 xdef stand
188 stand:
189 *--TAN(X) = X FOR DENORMALIZED X
190
191 bra t_extdnrm
192
193 xdef stan
194 stan:
195 FMOVE.X (a0),FP0 ...LOAD INPUT
196
197 MOVE.L (A0),D0
198 MOVE.W 4(A0),D0
199 ANDI.L #$7FFFFFFF,D0
200
201 CMPI.L #$3FD78000,D0 ...|X| >= 2**(-40)?
202 BGE.B TANOK1
203 BRA.W TANSM
204 TANOK1:
205 CMPI.L #$4004BC7E,D0 ...|X| < 15 PI?
206 BLT.B TANMAIN
207 BRA.W REDUCEX
208
209
210 TANMAIN:
211 *--THIS IS THE USUAL CASE, |X| <= 15 PI.
212 *--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
213 FMOVE.X FP0,FP1
214 FMUL.D TWOBYPI,FP1 ...X*2/PI
215
216 *--HIDE THE NEXT TWO INSTRUCTIONS
217 lea.l PITBL+$200,a1 ...TABLE OF N*PI/2, N = -32,...,32
218
219 *--FP1 IS NOW READY
220 FMOVE.L FP1,D0 ...CONVERT TO INTEGER
221
222 ASL.L #4,D0
223 ADDA.L D0,a1 ...ADDRESS N*PIBY2 IN Y1, Y2
224
225 FSUB.X (a1)+,FP0 ...X-Y1
226 *--HIDE THE NEXT ONE
227
228 FSUB.S (a1),FP0 ...FP0 IS R = (X-Y1)-Y2
229
230 ROR.L #5,D0
231 ANDI.L #$80000000,D0 ...D0 WAS ODD IFF D0 < 0
232
233 TANCONT:
234
235 CMPI.L #0,D0
236 BLT.W NODD
237
238 FMOVE.X FP0,FP1
239 FMUL.X FP1,FP1 ...S = R*R
240
241 FMOVE.D TANQ4,FP3
242 FMOVE.D TANP3,FP2
243
244 FMUL.X FP1,FP3 ...SQ4
245 FMUL.X FP1,FP2 ...SP3
246
247 FADD.D TANQ3,FP3 ...Q3+SQ4
248 FADD.X TANP2,FP2 ...P2+SP3
249
250 FMUL.X FP1,FP3 ...S(Q3+SQ4)
251 FMUL.X FP1,FP2 ...S(P2+SP3)
252
253 FADD.X TANQ2,FP3 ...Q2+S(Q3+SQ4)
254 FADD.X TANP1,FP2 ...P1+S(P2+SP3)
255
256 FMUL.X FP1,FP3 ...S(Q2+S(Q3+SQ4))
257 FMUL.X FP1,FP2 ...S(P1+S(P2+SP3))
258
259 FADD.X TANQ1,FP3 ...Q1+S(Q2+S(Q3+SQ4))
260 FMUL.X FP0,FP2 ...RS(P1+S(P2+SP3))
261
262 FMUL.X FP3,FP1 ...S(Q1+S(Q2+S(Q3+SQ4)))
263
264
265 FADD.X FP2,FP0 ...R+RS(P1+S(P2+SP3))
266
267
268 FADD.S #:3F800000,FP1 ...1+S(Q1+...)
269
270 FMOVE.L d1,fpcr ;restore users exceptions
271 FDIV.X FP1,FP0 ;last inst - possible exception set
272
273 bra t_frcinx
274
275 NODD:
276 FMOVE.X FP0,FP1
277 FMUL.X FP0,FP0 ...S = R*R
278
279 FMOVE.D TANQ4,FP3
280 FMOVE.D TANP3,FP2
281
282 FMUL.X FP0,FP3 ...SQ4
283 FMUL.X FP0,FP2 ...SP3
284
285 FADD.D TANQ3,FP3 ...Q3+SQ4
286 FADD.X TANP2,FP2 ...P2+SP3
287
288 FMUL.X FP0,FP3 ...S(Q3+SQ4)
289 FMUL.X FP0,FP2 ...S(P2+SP3)
290
291 FADD.X TANQ2,FP3 ...Q2+S(Q3+SQ4)
292 FADD.X TANP1,FP2 ...P1+S(P2+SP3)
293
294 FMUL.X FP0,FP3 ...S(Q2+S(Q3+SQ4))
295 FMUL.X FP0,FP2 ...S(P1+S(P2+SP3))
296
297 FADD.X TANQ1,FP3 ...Q1+S(Q2+S(Q3+SQ4))
298 FMUL.X FP1,FP2 ...RS(P1+S(P2+SP3))
299
300 FMUL.X FP3,FP0 ...S(Q1+S(Q2+S(Q3+SQ4)))
301
302
303 FADD.X FP2,FP1 ...R+RS(P1+S(P2+SP3))
304 FADD.S #:3F800000,FP0 ...1+S(Q1+...)
305
306
307 FMOVE.X FP1,-(sp)
308 EORI.L #$80000000,(sp)
309
310 FMOVE.L d1,fpcr ;restore users exceptions
311 FDIV.X (sp)+,FP0 ;last inst - possible exception set
312
313 bra t_frcinx
314
315 TANBORS:
316 *--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
317 *--IF |X| < 2**(-40), RETURN X OR 1.
318 CMPI.L #$3FFF8000,D0
319 BGT.B REDUCEX
320
321 TANSM:
322
323 FMOVE.X FP0,-(sp)
324 FMOVE.L d1,fpcr ;restore users exceptions
325 FMOVE.X (sp)+,FP0 ;last inst - posibble exception set
326
327 bra t_frcinx
328
329
330 REDUCEX:
331 *--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
332 *--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
333 *--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
334
335 FMOVEM.X FP2-FP5,-(A7) ...save FP2 through FP5
336 MOVE.L D2,-(A7)
337 FMOVE.S #:00000000,FP1
338
339 *--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
340 *--there is a danger of unwanted overflow in first LOOP iteration. In this
341 *--case, reduce argument by one remainder step to make subsequent reduction
342 *--safe.
343 cmpi.l #$7ffeffff,d0 ;is argument dangerously large?
344 bne.b LOOP
345 move.l #$7ffe0000,FP_SCR2(a6) ;yes
346 * ;create 2**16383*PI/2
347 move.l #$c90fdaa2,FP_SCR2+4(a6)
348 clr.l FP_SCR2+8(a6)
349 ftst.x fp0 ;test sign of argument
350 move.l #$7fdc0000,FP_SCR3(a6) ;create low half of 2**16383*
351 * ;PI/2 at FP_SCR3
352 move.l #$85a308d3,FP_SCR3+4(a6)
353 clr.l FP_SCR3+8(a6)
354 fblt.w red_neg
355 or.w #$8000,FP_SCR2(a6) ;positive arg
356 or.w #$8000,FP_SCR3(a6)
357 red_neg:
358 fadd.x FP_SCR2(a6),fp0 ;high part of reduction is exact
359 fmove.x fp0,fp1 ;save high result in fp1
360 fadd.x FP_SCR3(a6),fp0 ;low part of reduction
361 fsub.x fp0,fp1 ;determine low component of result
362 fadd.x FP_SCR3(a6),fp1 ;fp0/fp1 are reduced argument.
363
364 *--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
365 *--integer quotient will be stored in N
366 *--Intermeditate remainder is 66-bit long; (R,r) in (FP0,FP1)
367
368 LOOP:
369 FMOVE.X FP0,INARG(a6) ...+-2**K * F, 1 <= F < 2
370 MOVE.W INARG(a6),D0
371 MOVE.L D0,A1 ...save a copy of D0
372 ANDI.L #$00007FFF,D0
373 SUBI.L #$00003FFF,D0 ...D0 IS K
374 CMPI.L #28,D0
375 BLE.B LASTLOOP
376 CONTLOOP:
377 SUBI.L #27,D0 ...D0 IS L := K-27
378 MOVE.L #0,ENDFLAG(a6)
379 BRA.B WORK
380 LASTLOOP:
381 CLR.L D0 ...D0 IS L := 0
382 MOVE.L #1,ENDFLAG(a6)
383
384 WORK:
385 *--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
386 *--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
387
388 *--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
389 *--2**L * (PIby2_1), 2**L * (PIby2_2)
390
391 MOVE.L #$00003FFE,D2 ...BIASED EXPO OF 2/PI
392 SUB.L D0,D2 ...BIASED EXPO OF 2**(-L)*(2/PI)
393
394 MOVE.L #$A2F9836E,FP_SCR1+4(a6)
395 MOVE.L #$4E44152A,FP_SCR1+8(a6)
396 MOVE.W D2,FP_SCR1(a6) ...FP_SCR1 is 2**(-L)*(2/PI)
397
398 FMOVE.X FP0,FP2
399 FMUL.X FP_SCR1(a6),FP2
400 *--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
401 *--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
402 *--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
403 *--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
404 *--US THE DESIRED VALUE IN FLOATING POINT.
405
406 *--HIDE SIX CYCLES OF INSTRUCTION
407 MOVE.L A1,D2
408 SWAP D2
409 ANDI.L #$80000000,D2
410 ORI.L #$5F000000,D2 ...D2 IS SIGN(INARG)*2**63 IN SGL
411 MOVE.L D2,TWOTO63(a6)
412
413 MOVE.L D0,D2
414 ADDI.L #$00003FFF,D2 ...BIASED EXPO OF 2**L * (PI/2)
415
416 *--FP2 IS READY
417 FADD.S TWOTO63(a6),FP2 ...THE FRACTIONAL PART OF FP1 IS ROUNDED
418
419 *--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2
420 MOVE.W D2,FP_SCR2(a6)
421 CLR.W FP_SCR2+2(a6)
422 MOVE.L #$C90FDAA2,FP_SCR2+4(a6)
423 CLR.L FP_SCR2+8(a6) ...FP_SCR2 is 2**(L) * Piby2_1
424
425 *--FP2 IS READY
426 FSUB.S TWOTO63(a6),FP2 ...FP2 is N
427
428 ADDI.L #$00003FDD,D0
429 MOVE.W D0,FP_SCR3(a6)
430 CLR.W FP_SCR3+2(a6)
431 MOVE.L #$85A308D3,FP_SCR3+4(a6)
432 CLR.L FP_SCR3+8(a6) ...FP_SCR3 is 2**(L) * Piby2_2
433
434 MOVE.L ENDFLAG(a6),D0
435
436 *--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
437 *--P2 = 2**(L) * Piby2_2
438 FMOVE.X FP2,FP4
439 FMul.X FP_SCR2(a6),FP4 ...W = N*P1
440 FMove.X FP2,FP5
441 FMul.X FP_SCR3(a6),FP5 ...w = N*P2
442 FMove.X FP4,FP3
443 *--we want P+p = W+w but |p| <= half ulp of P
444 *--Then, we need to compute A := R-P and a := r-p
445 FAdd.X FP5,FP3 ...FP3 is P
446 FSub.X FP3,FP4 ...W-P
447
448 FSub.X FP3,FP0 ...FP0 is A := R - P
449 FAdd.X FP5,FP4 ...FP4 is p = (W-P)+w
450
451 FMove.X FP0,FP3 ...FP3 A
452 FSub.X FP4,FP1 ...FP1 is a := r - p
453
454 *--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
455 *--|r| <= half ulp of R.
456 FAdd.X FP1,FP0 ...FP0 is R := A+a
457 *--No need to calculate r if this is the last loop
458 CMPI.L #0,D0
459 BGT.W RESTORE
460
461 *--Need to calculate r
462 FSub.X FP0,FP3 ...A-R
463 FAdd.X FP3,FP1 ...FP1 is r := (A-R)+a
464 BRA.W LOOP
465
466 RESTORE:
467 FMOVE.L FP2,N(a6)
468 MOVE.L (A7)+,D2
469 FMOVEM.X (A7)+,FP2-FP5
470
471
472 MOVE.L N(a6),D0
473 ROR.L #1,D0
474
475
476 BRA.W TANCONT
477
478 end
479