skipjack.c revision 1.1.10.1 1 1.1.10.1 kent /* $NetBSD: skipjack.c,v 1.1.10.1 2005/04/29 11:28:44 kent Exp $ */
2 1.1 tls /* $OpenBSD: skipjack.c,v 1.3 2001/05/05 00:31:34 angelos Exp $ */
3 1.1 tls
4 1.1.10.1 kent /*
5 1.1.10.1 kent * Further optimized test implementation of SKIPJACK algorithm
6 1.1 tls * Mark Tillotson <markt (at) chaos.org.uk>, 25 June 98
7 1.1 tls * Optimizations suit RISC (lots of registers) machine best.
8 1.1 tls *
9 1.1 tls * based on unoptimized implementation of
10 1.1 tls * Panu Rissanen <bande (at) lut.fi> 960624
11 1.1 tls *
12 1.1.10.1 kent * SKIPJACK and KEA Algorithm Specifications
13 1.1.10.1 kent * Version 2.0
14 1.1 tls * 29 May 1998
15 1.1 tls */
16 1.1 tls
17 1.1 tls #include <sys/cdefs.h>
18 1.1.10.1 kent __KERNEL_RCSID(0, "$NetBSD: skipjack.c,v 1.1.10.1 2005/04/29 11:28:44 kent Exp $");
19 1.1 tls
20 1.1 tls #include <sys/param.h>
21 1.1 tls #include <crypto/skipjack/skipjack.h>
22 1.1 tls #include <sys/malloc.h>
23 1.1 tls #include <opencrypto/cryptodev.h>
24 1.1 tls
25 1.1 tls static const u_int8_t ftable[0x100] =
26 1.1.10.1 kent {
27 1.1.10.1 kent 0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
28 1.1.10.1 kent 0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
29 1.1.10.1 kent 0xe7, 0x2d, 0x4d, 0x8a, 0xce, 0x4c, 0xca, 0x2e,
30 1.1.10.1 kent 0x52, 0x95, 0xd9, 0x1e, 0x4e, 0x38, 0x44, 0x28,
31 1.1.10.1 kent 0x0a, 0xdf, 0x02, 0xa0, 0x17, 0xf1, 0x60, 0x68,
32 1.1.10.1 kent 0x12, 0xb7, 0x7a, 0xc3, 0xe9, 0xfa, 0x3d, 0x53,
33 1.1.10.1 kent 0x96, 0x84, 0x6b, 0xba, 0xf2, 0x63, 0x9a, 0x19,
34 1.1.10.1 kent 0x7c, 0xae, 0xe5, 0xf5, 0xf7, 0x16, 0x6a, 0xa2,
35 1.1.10.1 kent 0x39, 0xb6, 0x7b, 0x0f, 0xc1, 0x93, 0x81, 0x1b,
36 1.1.10.1 kent 0xee, 0xb4, 0x1a, 0xea, 0xd0, 0x91, 0x2f, 0xb8,
37 1.1.10.1 kent 0x55, 0xb9, 0xda, 0x85, 0x3f, 0x41, 0xbf, 0xe0,
38 1.1.10.1 kent 0x5a, 0x58, 0x80, 0x5f, 0x66, 0x0b, 0xd8, 0x90,
39 1.1.10.1 kent 0x35, 0xd5, 0xc0, 0xa7, 0x33, 0x06, 0x65, 0x69,
40 1.1.10.1 kent 0x45, 0x00, 0x94, 0x56, 0x6d, 0x98, 0x9b, 0x76,
41 1.1.10.1 kent 0x97, 0xfc, 0xb2, 0xc2, 0xb0, 0xfe, 0xdb, 0x20,
42 1.1.10.1 kent 0xe1, 0xeb, 0xd6, 0xe4, 0xdd, 0x47, 0x4a, 0x1d,
43 1.1.10.1 kent 0x42, 0xed, 0x9e, 0x6e, 0x49, 0x3c, 0xcd, 0x43,
44 1.1.10.1 kent 0x27, 0xd2, 0x07, 0xd4, 0xde, 0xc7, 0x67, 0x18,
45 1.1.10.1 kent 0x89, 0xcb, 0x30, 0x1f, 0x8d, 0xc6, 0x8f, 0xaa,
46 1.1.10.1 kent 0xc8, 0x74, 0xdc, 0xc9, 0x5d, 0x5c, 0x31, 0xa4,
47 1.1.10.1 kent 0x70, 0x88, 0x61, 0x2c, 0x9f, 0x0d, 0x2b, 0x87,
48 1.1.10.1 kent 0x50, 0x82, 0x54, 0x64, 0x26, 0x7d, 0x03, 0x40,
49 1.1.10.1 kent 0x34, 0x4b, 0x1c, 0x73, 0xd1, 0xc4, 0xfd, 0x3b,
50 1.1.10.1 kent 0xcc, 0xfb, 0x7f, 0xab, 0xe6, 0x3e, 0x5b, 0xa5,
51 1.1.10.1 kent 0xad, 0x04, 0x23, 0x9c, 0x14, 0x51, 0x22, 0xf0,
52 1.1.10.1 kent 0x29, 0x79, 0x71, 0x7e, 0xff, 0x8c, 0x0e, 0xe2,
53 1.1.10.1 kent 0x0c, 0xef, 0xbc, 0x72, 0x75, 0x6f, 0x37, 0xa1,
54 1.1.10.1 kent 0xec, 0xd3, 0x8e, 0x62, 0x8b, 0x86, 0x10, 0xe8,
55 1.1.10.1 kent 0x08, 0x77, 0x11, 0xbe, 0x92, 0x4f, 0x24, 0xc5,
56 1.1.10.1 kent 0x32, 0x36, 0x9d, 0xcf, 0xf3, 0xa6, 0xbb, 0xac,
57 1.1.10.1 kent 0x5e, 0x6c, 0xa9, 0x13, 0x57, 0x25, 0xb5, 0xe3,
58 1.1 tls 0xbd, 0xa8, 0x3a, 0x01, 0x05, 0x59, 0x2a, 0x46
59 1.1 tls };
60 1.1 tls
61 1.1 tls /*
62 1.1.10.1 kent * For each key byte generate a table to represent the function
63 1.1 tls * ftable [in ^ keybyte]
64 1.1 tls *
65 1.1 tls * These tables used to save an XOR in each stage of the G-function
66 1.1 tls * the tables are hopefully pointed to by register allocated variables
67 1.1 tls * k0, k1..k9
68 1.1 tls */
69 1.1 tls void
70 1.1 tls subkey_table_gen (const u_int8_t *key, u_int8_t **key_tables)
71 1.1 tls {
72 1.1 tls int i, k;
73 1.1 tls
74 1.1 tls for (k = 0; k < 10; k++) {
75 1.1 tls u_int8_t key_byte = key [k];
76 1.1 tls u_int8_t * table = key_tables[k];
77 1.1 tls for (i = 0; i < 0x100; i++)
78 1.1 tls table [i] = ftable [i ^ key_byte];
79 1.1 tls }
80 1.1 tls }
81 1.1 tls
82 1.1 tls
83 1.1 tls #define g(k0, k1, k2, k3, ih, il, oh, ol) \
84 1.1 tls { \
85 1.1 tls oh = k##k0 [il] ^ ih; \
86 1.1 tls ol = k##k1 [oh] ^ il; \
87 1.1 tls oh = k##k2 [ol] ^ oh; \
88 1.1 tls ol = k##k3 [oh] ^ ol; \
89 1.1 tls }
90 1.1 tls
91 1.1 tls #define g0(ih, il, oh, ol) g(0, 1, 2, 3, ih, il, oh, ol)
92 1.1 tls #define g4(ih, il, oh, ol) g(4, 5, 6, 7, ih, il, oh, ol)
93 1.1 tls #define g8(ih, il, oh, ol) g(8, 9, 0, 1, ih, il, oh, ol)
94 1.1 tls #define g2(ih, il, oh, ol) g(2, 3, 4, 5, ih, il, oh, ol)
95 1.1 tls #define g6(ih, il, oh, ol) g(6, 7, 8, 9, ih, il, oh, ol)
96 1.1 tls
97 1.1.10.1 kent
98 1.1 tls #define g_inv(k0, k1, k2, k3, ih, il, oh, ol) \
99 1.1 tls { \
100 1.1 tls ol = k##k3 [ih] ^ il; \
101 1.1 tls oh = k##k2 [ol] ^ ih; \
102 1.1 tls ol = k##k1 [oh] ^ ol; \
103 1.1 tls oh = k##k0 [ol] ^ oh; \
104 1.1 tls }
105 1.1 tls
106 1.1 tls
107 1.1 tls #define g0_inv(ih, il, oh, ol) g_inv(0, 1, 2, 3, ih, il, oh, ol)
108 1.1 tls #define g4_inv(ih, il, oh, ol) g_inv(4, 5, 6, 7, ih, il, oh, ol)
109 1.1 tls #define g8_inv(ih, il, oh, ol) g_inv(8, 9, 0, 1, ih, il, oh, ol)
110 1.1 tls #define g2_inv(ih, il, oh, ol) g_inv(2, 3, 4, 5, ih, il, oh, ol)
111 1.1 tls #define g6_inv(ih, il, oh, ol) g_inv(6, 7, 8, 9, ih, il, oh, ol)
112 1.1 tls
113 1.1 tls /* optimized version of Skipjack algorithm
114 1.1 tls *
115 1.1 tls * the appropriate g-function is inlined for each round
116 1.1 tls *
117 1.1.10.1 kent * the data movement is minimized by rotating the names of the
118 1.1 tls * variables w1..w4, not their contents (saves 3 moves per round)
119 1.1 tls *
120 1.1 tls * the loops are completely unrolled (needed to staticize choice of g)
121 1.1 tls *
122 1.1 tls * compiles to about 470 instructions on a Sparc (gcc -O)
123 1.1 tls * which is about 58 instructions per byte, 14 per round.
124 1.1 tls * gcc seems to leave in some unnecessary and with 0xFF operations
125 1.1 tls * but only in the latter part of the functions. Perhaps it
126 1.1 tls * runs out of resources to properly optimize long inlined function?
127 1.1 tls * in theory should get about 11 instructions per round, not 14
128 1.1 tls */
129 1.1 tls
130 1.1 tls void
131 1.1 tls skipjack_forwards(u_int8_t *plain, u_int8_t *cipher, u_int8_t **key_tables)
132 1.1 tls {
133 1.1 tls u_int8_t wh1 = plain[0]; u_int8_t wl1 = plain[1];
134 1.1 tls u_int8_t wh2 = plain[2]; u_int8_t wl2 = plain[3];
135 1.1 tls u_int8_t wh3 = plain[4]; u_int8_t wl3 = plain[5];
136 1.1 tls u_int8_t wh4 = plain[6]; u_int8_t wl4 = plain[7];
137 1.1 tls
138 1.1 tls u_int8_t * k0 = key_tables [0];
139 1.1 tls u_int8_t * k1 = key_tables [1];
140 1.1 tls u_int8_t * k2 = key_tables [2];
141 1.1 tls u_int8_t * k3 = key_tables [3];
142 1.1 tls u_int8_t * k4 = key_tables [4];
143 1.1 tls u_int8_t * k5 = key_tables [5];
144 1.1 tls u_int8_t * k6 = key_tables [6];
145 1.1 tls u_int8_t * k7 = key_tables [7];
146 1.1 tls u_int8_t * k8 = key_tables [8];
147 1.1 tls u_int8_t * k9 = key_tables [9];
148 1.1 tls
149 1.1 tls /* first 8 rounds */
150 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 1; wh4 ^= wh1;
151 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 2; wh3 ^= wh4;
152 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 3; wh2 ^= wh3;
153 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 4; wh1 ^= wh2;
154 1.1 tls g6 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 5; wh4 ^= wh1;
155 1.1 tls g0 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 6; wh3 ^= wh4;
156 1.1 tls g4 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 7; wh2 ^= wh3;
157 1.1 tls g8 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 8; wh1 ^= wh2;
158 1.1 tls
159 1.1 tls /* second 8 rounds */
160 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 9 ; g2 (wh1,wl1, wh1,wl1);
161 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 10; g6 (wh4,wl4, wh4,wl4);
162 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 11; g0 (wh3,wl3, wh3,wl3);
163 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 12; g4 (wh2,wl2, wh2,wl2);
164 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 13; g8 (wh1,wl1, wh1,wl1);
165 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 14; g2 (wh4,wl4, wh4,wl4);
166 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 15; g6 (wh3,wl3, wh3,wl3);
167 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 16; g0 (wh2,wl2, wh2,wl2);
168 1.1 tls
169 1.1 tls /* third 8 rounds */
170 1.1 tls g4 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 17; wh4 ^= wh1;
171 1.1 tls g8 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 18; wh3 ^= wh4;
172 1.1 tls g2 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 19; wh2 ^= wh3;
173 1.1 tls g6 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 20; wh1 ^= wh2;
174 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 21; wh4 ^= wh1;
175 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 22; wh3 ^= wh4;
176 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 23; wh2 ^= wh3;
177 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 24; wh1 ^= wh2;
178 1.1 tls
179 1.1 tls /* last 8 rounds */
180 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 25; g6 (wh1,wl1, wh1,wl1);
181 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 26; g0 (wh4,wl4, wh4,wl4);
182 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 27; g4 (wh3,wl3, wh3,wl3);
183 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 28; g8 (wh2,wl2, wh2,wl2);
184 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 29; g2 (wh1,wl1, wh1,wl1);
185 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 30; g6 (wh4,wl4, wh4,wl4);
186 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 31; g0 (wh3,wl3, wh3,wl3);
187 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 32; g4 (wh2,wl2, wh2,wl2);
188 1.1 tls
189 1.1 tls /* pack into byte vector */
190 1.1 tls cipher [0] = wh1; cipher [1] = wl1;
191 1.1 tls cipher [2] = wh2; cipher [3] = wl2;
192 1.1 tls cipher [4] = wh3; cipher [5] = wl3;
193 1.1 tls cipher [6] = wh4; cipher [7] = wl4;
194 1.1 tls }
195 1.1 tls
196 1.1 tls
197 1.1 tls void
198 1.1 tls skipjack_backwards (u_int8_t *cipher, u_int8_t *plain, u_int8_t **key_tables)
199 1.1 tls {
200 1.1 tls /* setup 4 16-bit portions */
201 1.1 tls u_int8_t wh1 = cipher[0]; u_int8_t wl1 = cipher[1];
202 1.1 tls u_int8_t wh2 = cipher[2]; u_int8_t wl2 = cipher[3];
203 1.1 tls u_int8_t wh3 = cipher[4]; u_int8_t wl3 = cipher[5];
204 1.1 tls u_int8_t wh4 = cipher[6]; u_int8_t wl4 = cipher[7];
205 1.1 tls
206 1.1 tls u_int8_t * k0 = key_tables [0];
207 1.1 tls u_int8_t * k1 = key_tables [1];
208 1.1 tls u_int8_t * k2 = key_tables [2];
209 1.1 tls u_int8_t * k3 = key_tables [3];
210 1.1 tls u_int8_t * k4 = key_tables [4];
211 1.1 tls u_int8_t * k5 = key_tables [5];
212 1.1 tls u_int8_t * k6 = key_tables [6];
213 1.1 tls u_int8_t * k7 = key_tables [7];
214 1.1 tls u_int8_t * k8 = key_tables [8];
215 1.1 tls u_int8_t * k9 = key_tables [9];
216 1.1 tls
217 1.1 tls /* first 8 rounds */
218 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 32; wh3 ^= wh2;
219 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 31; wh4 ^= wh3;
220 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 30; wh1 ^= wh4;
221 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 29; wh2 ^= wh1;
222 1.1 tls g8_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 28; wh3 ^= wh2;
223 1.1 tls g4_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 27; wh4 ^= wh3;
224 1.1 tls g0_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 26; wh1 ^= wh4;
225 1.1 tls g6_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 25; wh2 ^= wh1;
226 1.1 tls
227 1.1 tls /* second 8 rounds */
228 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 24; g2_inv (wh2,wl2, wh2,wl2);
229 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 23; g8_inv (wh3,wl3, wh3,wl3);
230 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 22; g4_inv (wh4,wl4, wh4,wl4);
231 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 21; g0_inv (wh1,wl1, wh1,wl1);
232 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 20; g6_inv (wh2,wl2, wh2,wl2);
233 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 19; g2_inv (wh3,wl3, wh3,wl3);
234 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 18; g8_inv (wh4,wl4, wh4,wl4);
235 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 17; g4_inv (wh1,wl1, wh1,wl1);
236 1.1 tls
237 1.1 tls /* third 8 rounds */
238 1.1 tls g0_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 16; wh3 ^= wh2;
239 1.1 tls g6_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 15; wh4 ^= wh3;
240 1.1 tls g2_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 14; wh1 ^= wh4;
241 1.1 tls g8_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 13; wh2 ^= wh1;
242 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 12; wh3 ^= wh2;
243 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 11; wh4 ^= wh3;
244 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 10; wh1 ^= wh4;
245 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 9; wh2 ^= wh1;
246 1.1 tls
247 1.1 tls /* last 8 rounds */
248 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 8; g8_inv (wh2,wl2, wh2,wl2);
249 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 7; g4_inv (wh3,wl3, wh3,wl3);
250 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 6; g0_inv (wh4,wl4, wh4,wl4);
251 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 5; g6_inv (wh1,wl1, wh1,wl1);
252 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 4; g2_inv (wh2,wl2, wh2,wl2);
253 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 3; g8_inv (wh3,wl3, wh3,wl3);
254 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 2; g4_inv (wh4,wl4, wh4,wl4);
255 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 1; g0_inv (wh1,wl1, wh1,wl1);
256 1.1 tls
257 1.1 tls /* pack into byte vector */
258 1.1 tls plain [0] = wh1; plain [1] = wl1;
259 1.1 tls plain [2] = wh2; plain [3] = wl2;
260 1.1 tls plain [4] = wh3; plain [5] = wl3;
261 1.1 tls plain [6] = wh4; plain [7] = wl4;
262 1.1 tls }
263