skipjack.c revision 1.1 1 1.1 tls /* $NetBSD: skipjack.c,v 1.1 2003/11/16 12:07:50 tls Exp $ */
2 1.1 tls /* $OpenBSD: skipjack.c,v 1.3 2001/05/05 00:31:34 angelos Exp $ */
3 1.1 tls
4 1.1 tls /*
5 1.1 tls * Further optimized test implementation of SKIPJACK algorithm
6 1.1 tls * Mark Tillotson <markt (at) chaos.org.uk>, 25 June 98
7 1.1 tls * Optimizations suit RISC (lots of registers) machine best.
8 1.1 tls *
9 1.1 tls * based on unoptimized implementation of
10 1.1 tls * Panu Rissanen <bande (at) lut.fi> 960624
11 1.1 tls *
12 1.1 tls * SKIPJACK and KEA Algorithm Specifications
13 1.1 tls * Version 2.0
14 1.1 tls * 29 May 1998
15 1.1 tls */
16 1.1 tls
17 1.1 tls #include <sys/cdefs.h>
18 1.1 tls __KERNEL_RCSID(0, "$NetBSD: skipjack.c,v 1.1 2003/11/16 12:07:50 tls Exp $");
19 1.1 tls
20 1.1 tls #include <sys/param.h>
21 1.1 tls #include <crypto/skipjack/skipjack.h>
22 1.1 tls #include <sys/malloc.h>
23 1.1 tls #include <opencrypto/cryptodev.h>
24 1.1 tls
25 1.1 tls static const u_int8_t ftable[0x100] =
26 1.1 tls {
27 1.1 tls 0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
28 1.1 tls 0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
29 1.1 tls 0xe7, 0x2d, 0x4d, 0x8a, 0xce, 0x4c, 0xca, 0x2e,
30 1.1 tls 0x52, 0x95, 0xd9, 0x1e, 0x4e, 0x38, 0x44, 0x28,
31 1.1 tls 0x0a, 0xdf, 0x02, 0xa0, 0x17, 0xf1, 0x60, 0x68,
32 1.1 tls 0x12, 0xb7, 0x7a, 0xc3, 0xe9, 0xfa, 0x3d, 0x53,
33 1.1 tls 0x96, 0x84, 0x6b, 0xba, 0xf2, 0x63, 0x9a, 0x19,
34 1.1 tls 0x7c, 0xae, 0xe5, 0xf5, 0xf7, 0x16, 0x6a, 0xa2,
35 1.1 tls 0x39, 0xb6, 0x7b, 0x0f, 0xc1, 0x93, 0x81, 0x1b,
36 1.1 tls 0xee, 0xb4, 0x1a, 0xea, 0xd0, 0x91, 0x2f, 0xb8,
37 1.1 tls 0x55, 0xb9, 0xda, 0x85, 0x3f, 0x41, 0xbf, 0xe0,
38 1.1 tls 0x5a, 0x58, 0x80, 0x5f, 0x66, 0x0b, 0xd8, 0x90,
39 1.1 tls 0x35, 0xd5, 0xc0, 0xa7, 0x33, 0x06, 0x65, 0x69,
40 1.1 tls 0x45, 0x00, 0x94, 0x56, 0x6d, 0x98, 0x9b, 0x76,
41 1.1 tls 0x97, 0xfc, 0xb2, 0xc2, 0xb0, 0xfe, 0xdb, 0x20,
42 1.1 tls 0xe1, 0xeb, 0xd6, 0xe4, 0xdd, 0x47, 0x4a, 0x1d,
43 1.1 tls 0x42, 0xed, 0x9e, 0x6e, 0x49, 0x3c, 0xcd, 0x43,
44 1.1 tls 0x27, 0xd2, 0x07, 0xd4, 0xde, 0xc7, 0x67, 0x18,
45 1.1 tls 0x89, 0xcb, 0x30, 0x1f, 0x8d, 0xc6, 0x8f, 0xaa,
46 1.1 tls 0xc8, 0x74, 0xdc, 0xc9, 0x5d, 0x5c, 0x31, 0xa4,
47 1.1 tls 0x70, 0x88, 0x61, 0x2c, 0x9f, 0x0d, 0x2b, 0x87,
48 1.1 tls 0x50, 0x82, 0x54, 0x64, 0x26, 0x7d, 0x03, 0x40,
49 1.1 tls 0x34, 0x4b, 0x1c, 0x73, 0xd1, 0xc4, 0xfd, 0x3b,
50 1.1 tls 0xcc, 0xfb, 0x7f, 0xab, 0xe6, 0x3e, 0x5b, 0xa5,
51 1.1 tls 0xad, 0x04, 0x23, 0x9c, 0x14, 0x51, 0x22, 0xf0,
52 1.1 tls 0x29, 0x79, 0x71, 0x7e, 0xff, 0x8c, 0x0e, 0xe2,
53 1.1 tls 0x0c, 0xef, 0xbc, 0x72, 0x75, 0x6f, 0x37, 0xa1,
54 1.1 tls 0xec, 0xd3, 0x8e, 0x62, 0x8b, 0x86, 0x10, 0xe8,
55 1.1 tls 0x08, 0x77, 0x11, 0xbe, 0x92, 0x4f, 0x24, 0xc5,
56 1.1 tls 0x32, 0x36, 0x9d, 0xcf, 0xf3, 0xa6, 0xbb, 0xac,
57 1.1 tls 0x5e, 0x6c, 0xa9, 0x13, 0x57, 0x25, 0xb5, 0xe3,
58 1.1 tls 0xbd, 0xa8, 0x3a, 0x01, 0x05, 0x59, 0x2a, 0x46
59 1.1 tls };
60 1.1 tls
61 1.1 tls /*
62 1.1 tls * For each key byte generate a table to represent the function
63 1.1 tls * ftable [in ^ keybyte]
64 1.1 tls *
65 1.1 tls * These tables used to save an XOR in each stage of the G-function
66 1.1 tls * the tables are hopefully pointed to by register allocated variables
67 1.1 tls * k0, k1..k9
68 1.1 tls */
69 1.1 tls void
70 1.1 tls subkey_table_gen (const u_int8_t *key, u_int8_t **key_tables)
71 1.1 tls {
72 1.1 tls int i, k;
73 1.1 tls
74 1.1 tls for (k = 0; k < 10; k++) {
75 1.1 tls u_int8_t key_byte = key [k];
76 1.1 tls u_int8_t * table = key_tables[k];
77 1.1 tls for (i = 0; i < 0x100; i++)
78 1.1 tls table [i] = ftable [i ^ key_byte];
79 1.1 tls }
80 1.1 tls }
81 1.1 tls
82 1.1 tls
83 1.1 tls #define g(k0, k1, k2, k3, ih, il, oh, ol) \
84 1.1 tls { \
85 1.1 tls oh = k##k0 [il] ^ ih; \
86 1.1 tls ol = k##k1 [oh] ^ il; \
87 1.1 tls oh = k##k2 [ol] ^ oh; \
88 1.1 tls ol = k##k3 [oh] ^ ol; \
89 1.1 tls }
90 1.1 tls
91 1.1 tls #define g0(ih, il, oh, ol) g(0, 1, 2, 3, ih, il, oh, ol)
92 1.1 tls #define g4(ih, il, oh, ol) g(4, 5, 6, 7, ih, il, oh, ol)
93 1.1 tls #define g8(ih, il, oh, ol) g(8, 9, 0, 1, ih, il, oh, ol)
94 1.1 tls #define g2(ih, il, oh, ol) g(2, 3, 4, 5, ih, il, oh, ol)
95 1.1 tls #define g6(ih, il, oh, ol) g(6, 7, 8, 9, ih, il, oh, ol)
96 1.1 tls
97 1.1 tls
98 1.1 tls #define g_inv(k0, k1, k2, k3, ih, il, oh, ol) \
99 1.1 tls { \
100 1.1 tls ol = k##k3 [ih] ^ il; \
101 1.1 tls oh = k##k2 [ol] ^ ih; \
102 1.1 tls ol = k##k1 [oh] ^ ol; \
103 1.1 tls oh = k##k0 [ol] ^ oh; \
104 1.1 tls }
105 1.1 tls
106 1.1 tls
107 1.1 tls #define g0_inv(ih, il, oh, ol) g_inv(0, 1, 2, 3, ih, il, oh, ol)
108 1.1 tls #define g4_inv(ih, il, oh, ol) g_inv(4, 5, 6, 7, ih, il, oh, ol)
109 1.1 tls #define g8_inv(ih, il, oh, ol) g_inv(8, 9, 0, 1, ih, il, oh, ol)
110 1.1 tls #define g2_inv(ih, il, oh, ol) g_inv(2, 3, 4, 5, ih, il, oh, ol)
111 1.1 tls #define g6_inv(ih, il, oh, ol) g_inv(6, 7, 8, 9, ih, il, oh, ol)
112 1.1 tls
113 1.1 tls /* optimized version of Skipjack algorithm
114 1.1 tls *
115 1.1 tls * the appropriate g-function is inlined for each round
116 1.1 tls *
117 1.1 tls * the data movement is minimized by rotating the names of the
118 1.1 tls * variables w1..w4, not their contents (saves 3 moves per round)
119 1.1 tls *
120 1.1 tls * the loops are completely unrolled (needed to staticize choice of g)
121 1.1 tls *
122 1.1 tls * compiles to about 470 instructions on a Sparc (gcc -O)
123 1.1 tls * which is about 58 instructions per byte, 14 per round.
124 1.1 tls * gcc seems to leave in some unnecessary and with 0xFF operations
125 1.1 tls * but only in the latter part of the functions. Perhaps it
126 1.1 tls * runs out of resources to properly optimize long inlined function?
127 1.1 tls * in theory should get about 11 instructions per round, not 14
128 1.1 tls */
129 1.1 tls
130 1.1 tls void
131 1.1 tls skipjack_forwards(u_int8_t *plain, u_int8_t *cipher, u_int8_t **key_tables)
132 1.1 tls {
133 1.1 tls u_int8_t wh1 = plain[0]; u_int8_t wl1 = plain[1];
134 1.1 tls u_int8_t wh2 = plain[2]; u_int8_t wl2 = plain[3];
135 1.1 tls u_int8_t wh3 = plain[4]; u_int8_t wl3 = plain[5];
136 1.1 tls u_int8_t wh4 = plain[6]; u_int8_t wl4 = plain[7];
137 1.1 tls
138 1.1 tls u_int8_t * k0 = key_tables [0];
139 1.1 tls u_int8_t * k1 = key_tables [1];
140 1.1 tls u_int8_t * k2 = key_tables [2];
141 1.1 tls u_int8_t * k3 = key_tables [3];
142 1.1 tls u_int8_t * k4 = key_tables [4];
143 1.1 tls u_int8_t * k5 = key_tables [5];
144 1.1 tls u_int8_t * k6 = key_tables [6];
145 1.1 tls u_int8_t * k7 = key_tables [7];
146 1.1 tls u_int8_t * k8 = key_tables [8];
147 1.1 tls u_int8_t * k9 = key_tables [9];
148 1.1 tls
149 1.1 tls /* first 8 rounds */
150 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 1; wh4 ^= wh1;
151 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 2; wh3 ^= wh4;
152 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 3; wh2 ^= wh3;
153 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 4; wh1 ^= wh2;
154 1.1 tls g6 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 5; wh4 ^= wh1;
155 1.1 tls g0 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 6; wh3 ^= wh4;
156 1.1 tls g4 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 7; wh2 ^= wh3;
157 1.1 tls g8 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 8; wh1 ^= wh2;
158 1.1 tls
159 1.1 tls /* second 8 rounds */
160 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 9 ; g2 (wh1,wl1, wh1,wl1);
161 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 10; g6 (wh4,wl4, wh4,wl4);
162 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 11; g0 (wh3,wl3, wh3,wl3);
163 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 12; g4 (wh2,wl2, wh2,wl2);
164 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 13; g8 (wh1,wl1, wh1,wl1);
165 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 14; g2 (wh4,wl4, wh4,wl4);
166 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 15; g6 (wh3,wl3, wh3,wl3);
167 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 16; g0 (wh2,wl2, wh2,wl2);
168 1.1 tls
169 1.1 tls /* third 8 rounds */
170 1.1 tls g4 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 17; wh4 ^= wh1;
171 1.1 tls g8 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 18; wh3 ^= wh4;
172 1.1 tls g2 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 19; wh2 ^= wh3;
173 1.1 tls g6 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 20; wh1 ^= wh2;
174 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 21; wh4 ^= wh1;
175 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 22; wh3 ^= wh4;
176 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 23; wh2 ^= wh3;
177 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 24; wh1 ^= wh2;
178 1.1 tls
179 1.1 tls /* last 8 rounds */
180 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 25; g6 (wh1,wl1, wh1,wl1);
181 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 26; g0 (wh4,wl4, wh4,wl4);
182 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 27; g4 (wh3,wl3, wh3,wl3);
183 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 28; g8 (wh2,wl2, wh2,wl2);
184 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 29; g2 (wh1,wl1, wh1,wl1);
185 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 30; g6 (wh4,wl4, wh4,wl4);
186 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 31; g0 (wh3,wl3, wh3,wl3);
187 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 32; g4 (wh2,wl2, wh2,wl2);
188 1.1 tls
189 1.1 tls /* pack into byte vector */
190 1.1 tls cipher [0] = wh1; cipher [1] = wl1;
191 1.1 tls cipher [2] = wh2; cipher [3] = wl2;
192 1.1 tls cipher [4] = wh3; cipher [5] = wl3;
193 1.1 tls cipher [6] = wh4; cipher [7] = wl4;
194 1.1 tls }
195 1.1 tls
196 1.1 tls
197 1.1 tls void
198 1.1 tls skipjack_backwards (u_int8_t *cipher, u_int8_t *plain, u_int8_t **key_tables)
199 1.1 tls {
200 1.1 tls /* setup 4 16-bit portions */
201 1.1 tls u_int8_t wh1 = cipher[0]; u_int8_t wl1 = cipher[1];
202 1.1 tls u_int8_t wh2 = cipher[2]; u_int8_t wl2 = cipher[3];
203 1.1 tls u_int8_t wh3 = cipher[4]; u_int8_t wl3 = cipher[5];
204 1.1 tls u_int8_t wh4 = cipher[6]; u_int8_t wl4 = cipher[7];
205 1.1 tls
206 1.1 tls u_int8_t * k0 = key_tables [0];
207 1.1 tls u_int8_t * k1 = key_tables [1];
208 1.1 tls u_int8_t * k2 = key_tables [2];
209 1.1 tls u_int8_t * k3 = key_tables [3];
210 1.1 tls u_int8_t * k4 = key_tables [4];
211 1.1 tls u_int8_t * k5 = key_tables [5];
212 1.1 tls u_int8_t * k6 = key_tables [6];
213 1.1 tls u_int8_t * k7 = key_tables [7];
214 1.1 tls u_int8_t * k8 = key_tables [8];
215 1.1 tls u_int8_t * k9 = key_tables [9];
216 1.1 tls
217 1.1 tls /* first 8 rounds */
218 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 32; wh3 ^= wh2;
219 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 31; wh4 ^= wh3;
220 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 30; wh1 ^= wh4;
221 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 29; wh2 ^= wh1;
222 1.1 tls g8_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 28; wh3 ^= wh2;
223 1.1 tls g4_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 27; wh4 ^= wh3;
224 1.1 tls g0_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 26; wh1 ^= wh4;
225 1.1 tls g6_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 25; wh2 ^= wh1;
226 1.1 tls
227 1.1 tls /* second 8 rounds */
228 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 24; g2_inv (wh2,wl2, wh2,wl2);
229 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 23; g8_inv (wh3,wl3, wh3,wl3);
230 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 22; g4_inv (wh4,wl4, wh4,wl4);
231 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 21; g0_inv (wh1,wl1, wh1,wl1);
232 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 20; g6_inv (wh2,wl2, wh2,wl2);
233 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 19; g2_inv (wh3,wl3, wh3,wl3);
234 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 18; g8_inv (wh4,wl4, wh4,wl4);
235 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 17; g4_inv (wh1,wl1, wh1,wl1);
236 1.1 tls
237 1.1 tls /* third 8 rounds */
238 1.1 tls g0_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 16; wh3 ^= wh2;
239 1.1 tls g6_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 15; wh4 ^= wh3;
240 1.1 tls g2_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 14; wh1 ^= wh4;
241 1.1 tls g8_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 13; wh2 ^= wh1;
242 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 12; wh3 ^= wh2;
243 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 11; wh4 ^= wh3;
244 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 10; wh1 ^= wh4;
245 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 9; wh2 ^= wh1;
246 1.1 tls
247 1.1 tls /* last 8 rounds */
248 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 8; g8_inv (wh2,wl2, wh2,wl2);
249 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 7; g4_inv (wh3,wl3, wh3,wl3);
250 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 6; g0_inv (wh4,wl4, wh4,wl4);
251 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 5; g6_inv (wh1,wl1, wh1,wl1);
252 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 4; g2_inv (wh2,wl2, wh2,wl2);
253 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 3; g8_inv (wh3,wl3, wh3,wl3);
254 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 2; g4_inv (wh4,wl4, wh4,wl4);
255 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 1; g0_inv (wh1,wl1, wh1,wl1);
256 1.1 tls
257 1.1 tls /* pack into byte vector */
258 1.1 tls plain [0] = wh1; plain [1] = wl1;
259 1.1 tls plain [2] = wh2; plain [3] = wl2;
260 1.1 tls plain [4] = wh3; plain [5] = wl3;
261 1.1 tls plain [6] = wh4; plain [7] = wl4;
262 1.1 tls }
263