skipjack.c revision 1.4 1 1.4 pgoyette /* $NetBSD: skipjack.c,v 1.4 2014/01/01 15:18:57 pgoyette Exp $ */
2 1.1 tls /* $OpenBSD: skipjack.c,v 1.3 2001/05/05 00:31:34 angelos Exp $ */
3 1.1 tls
4 1.2 perry /*
5 1.2 perry * Further optimized test implementation of SKIPJACK algorithm
6 1.1 tls * Mark Tillotson <markt (at) chaos.org.uk>, 25 June 98
7 1.1 tls * Optimizations suit RISC (lots of registers) machine best.
8 1.1 tls *
9 1.1 tls * based on unoptimized implementation of
10 1.1 tls * Panu Rissanen <bande (at) lut.fi> 960624
11 1.1 tls *
12 1.2 perry * SKIPJACK and KEA Algorithm Specifications
13 1.2 perry * Version 2.0
14 1.1 tls * 29 May 1998
15 1.1 tls */
16 1.1 tls
17 1.1 tls #include <sys/cdefs.h>
18 1.4 pgoyette __KERNEL_RCSID(0, "$NetBSD: skipjack.c,v 1.4 2014/01/01 15:18:57 pgoyette Exp $");
19 1.1 tls
20 1.4 pgoyette #include <sys/errno.h>
21 1.4 pgoyette #include <sys/malloc.h>
22 1.4 pgoyette #include <sys/module.h>
23 1.1 tls #include <sys/param.h>
24 1.4 pgoyette
25 1.1 tls #include <crypto/skipjack/skipjack.h>
26 1.1 tls #include <opencrypto/cryptodev.h>
27 1.1 tls
28 1.1 tls static const u_int8_t ftable[0x100] =
29 1.2 perry {
30 1.2 perry 0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
31 1.2 perry 0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
32 1.2 perry 0xe7, 0x2d, 0x4d, 0x8a, 0xce, 0x4c, 0xca, 0x2e,
33 1.2 perry 0x52, 0x95, 0xd9, 0x1e, 0x4e, 0x38, 0x44, 0x28,
34 1.2 perry 0x0a, 0xdf, 0x02, 0xa0, 0x17, 0xf1, 0x60, 0x68,
35 1.2 perry 0x12, 0xb7, 0x7a, 0xc3, 0xe9, 0xfa, 0x3d, 0x53,
36 1.2 perry 0x96, 0x84, 0x6b, 0xba, 0xf2, 0x63, 0x9a, 0x19,
37 1.2 perry 0x7c, 0xae, 0xe5, 0xf5, 0xf7, 0x16, 0x6a, 0xa2,
38 1.2 perry 0x39, 0xb6, 0x7b, 0x0f, 0xc1, 0x93, 0x81, 0x1b,
39 1.2 perry 0xee, 0xb4, 0x1a, 0xea, 0xd0, 0x91, 0x2f, 0xb8,
40 1.2 perry 0x55, 0xb9, 0xda, 0x85, 0x3f, 0x41, 0xbf, 0xe0,
41 1.2 perry 0x5a, 0x58, 0x80, 0x5f, 0x66, 0x0b, 0xd8, 0x90,
42 1.2 perry 0x35, 0xd5, 0xc0, 0xa7, 0x33, 0x06, 0x65, 0x69,
43 1.2 perry 0x45, 0x00, 0x94, 0x56, 0x6d, 0x98, 0x9b, 0x76,
44 1.2 perry 0x97, 0xfc, 0xb2, 0xc2, 0xb0, 0xfe, 0xdb, 0x20,
45 1.2 perry 0xe1, 0xeb, 0xd6, 0xe4, 0xdd, 0x47, 0x4a, 0x1d,
46 1.2 perry 0x42, 0xed, 0x9e, 0x6e, 0x49, 0x3c, 0xcd, 0x43,
47 1.2 perry 0x27, 0xd2, 0x07, 0xd4, 0xde, 0xc7, 0x67, 0x18,
48 1.2 perry 0x89, 0xcb, 0x30, 0x1f, 0x8d, 0xc6, 0x8f, 0xaa,
49 1.2 perry 0xc8, 0x74, 0xdc, 0xc9, 0x5d, 0x5c, 0x31, 0xa4,
50 1.2 perry 0x70, 0x88, 0x61, 0x2c, 0x9f, 0x0d, 0x2b, 0x87,
51 1.2 perry 0x50, 0x82, 0x54, 0x64, 0x26, 0x7d, 0x03, 0x40,
52 1.2 perry 0x34, 0x4b, 0x1c, 0x73, 0xd1, 0xc4, 0xfd, 0x3b,
53 1.2 perry 0xcc, 0xfb, 0x7f, 0xab, 0xe6, 0x3e, 0x5b, 0xa5,
54 1.2 perry 0xad, 0x04, 0x23, 0x9c, 0x14, 0x51, 0x22, 0xf0,
55 1.2 perry 0x29, 0x79, 0x71, 0x7e, 0xff, 0x8c, 0x0e, 0xe2,
56 1.2 perry 0x0c, 0xef, 0xbc, 0x72, 0x75, 0x6f, 0x37, 0xa1,
57 1.2 perry 0xec, 0xd3, 0x8e, 0x62, 0x8b, 0x86, 0x10, 0xe8,
58 1.2 perry 0x08, 0x77, 0x11, 0xbe, 0x92, 0x4f, 0x24, 0xc5,
59 1.2 perry 0x32, 0x36, 0x9d, 0xcf, 0xf3, 0xa6, 0xbb, 0xac,
60 1.2 perry 0x5e, 0x6c, 0xa9, 0x13, 0x57, 0x25, 0xb5, 0xe3,
61 1.1 tls 0xbd, 0xa8, 0x3a, 0x01, 0x05, 0x59, 0x2a, 0x46
62 1.1 tls };
63 1.1 tls
64 1.1 tls /*
65 1.2 perry * For each key byte generate a table to represent the function
66 1.1 tls * ftable [in ^ keybyte]
67 1.1 tls *
68 1.1 tls * These tables used to save an XOR in each stage of the G-function
69 1.1 tls * the tables are hopefully pointed to by register allocated variables
70 1.1 tls * k0, k1..k9
71 1.1 tls */
72 1.1 tls void
73 1.1 tls subkey_table_gen (const u_int8_t *key, u_int8_t **key_tables)
74 1.1 tls {
75 1.1 tls int i, k;
76 1.1 tls
77 1.1 tls for (k = 0; k < 10; k++) {
78 1.1 tls u_int8_t key_byte = key [k];
79 1.1 tls u_int8_t * table = key_tables[k];
80 1.1 tls for (i = 0; i < 0x100; i++)
81 1.1 tls table [i] = ftable [i ^ key_byte];
82 1.1 tls }
83 1.1 tls }
84 1.1 tls
85 1.1 tls
86 1.1 tls #define g(k0, k1, k2, k3, ih, il, oh, ol) \
87 1.1 tls { \
88 1.1 tls oh = k##k0 [il] ^ ih; \
89 1.1 tls ol = k##k1 [oh] ^ il; \
90 1.1 tls oh = k##k2 [ol] ^ oh; \
91 1.1 tls ol = k##k3 [oh] ^ ol; \
92 1.1 tls }
93 1.1 tls
94 1.1 tls #define g0(ih, il, oh, ol) g(0, 1, 2, 3, ih, il, oh, ol)
95 1.1 tls #define g4(ih, il, oh, ol) g(4, 5, 6, 7, ih, il, oh, ol)
96 1.1 tls #define g8(ih, il, oh, ol) g(8, 9, 0, 1, ih, il, oh, ol)
97 1.1 tls #define g2(ih, il, oh, ol) g(2, 3, 4, 5, ih, il, oh, ol)
98 1.1 tls #define g6(ih, il, oh, ol) g(6, 7, 8, 9, ih, il, oh, ol)
99 1.1 tls
100 1.2 perry
101 1.1 tls #define g_inv(k0, k1, k2, k3, ih, il, oh, ol) \
102 1.1 tls { \
103 1.1 tls ol = k##k3 [ih] ^ il; \
104 1.1 tls oh = k##k2 [ol] ^ ih; \
105 1.1 tls ol = k##k1 [oh] ^ ol; \
106 1.1 tls oh = k##k0 [ol] ^ oh; \
107 1.1 tls }
108 1.1 tls
109 1.1 tls
110 1.1 tls #define g0_inv(ih, il, oh, ol) g_inv(0, 1, 2, 3, ih, il, oh, ol)
111 1.1 tls #define g4_inv(ih, il, oh, ol) g_inv(4, 5, 6, 7, ih, il, oh, ol)
112 1.1 tls #define g8_inv(ih, il, oh, ol) g_inv(8, 9, 0, 1, ih, il, oh, ol)
113 1.1 tls #define g2_inv(ih, il, oh, ol) g_inv(2, 3, 4, 5, ih, il, oh, ol)
114 1.1 tls #define g6_inv(ih, il, oh, ol) g_inv(6, 7, 8, 9, ih, il, oh, ol)
115 1.1 tls
116 1.1 tls /* optimized version of Skipjack algorithm
117 1.1 tls *
118 1.1 tls * the appropriate g-function is inlined for each round
119 1.1 tls *
120 1.2 perry * the data movement is minimized by rotating the names of the
121 1.1 tls * variables w1..w4, not their contents (saves 3 moves per round)
122 1.1 tls *
123 1.1 tls * the loops are completely unrolled (needed to staticize choice of g)
124 1.1 tls *
125 1.1 tls * compiles to about 470 instructions on a Sparc (gcc -O)
126 1.1 tls * which is about 58 instructions per byte, 14 per round.
127 1.1 tls * gcc seems to leave in some unnecessary and with 0xFF operations
128 1.1 tls * but only in the latter part of the functions. Perhaps it
129 1.1 tls * runs out of resources to properly optimize long inlined function?
130 1.1 tls * in theory should get about 11 instructions per round, not 14
131 1.1 tls */
132 1.1 tls
133 1.1 tls void
134 1.1 tls skipjack_forwards(u_int8_t *plain, u_int8_t *cipher, u_int8_t **key_tables)
135 1.1 tls {
136 1.1 tls u_int8_t wh1 = plain[0]; u_int8_t wl1 = plain[1];
137 1.1 tls u_int8_t wh2 = plain[2]; u_int8_t wl2 = plain[3];
138 1.1 tls u_int8_t wh3 = plain[4]; u_int8_t wl3 = plain[5];
139 1.1 tls u_int8_t wh4 = plain[6]; u_int8_t wl4 = plain[7];
140 1.1 tls
141 1.1 tls u_int8_t * k0 = key_tables [0];
142 1.1 tls u_int8_t * k1 = key_tables [1];
143 1.1 tls u_int8_t * k2 = key_tables [2];
144 1.1 tls u_int8_t * k3 = key_tables [3];
145 1.1 tls u_int8_t * k4 = key_tables [4];
146 1.1 tls u_int8_t * k5 = key_tables [5];
147 1.1 tls u_int8_t * k6 = key_tables [6];
148 1.1 tls u_int8_t * k7 = key_tables [7];
149 1.1 tls u_int8_t * k8 = key_tables [8];
150 1.1 tls u_int8_t * k9 = key_tables [9];
151 1.1 tls
152 1.1 tls /* first 8 rounds */
153 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 1; wh4 ^= wh1;
154 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 2; wh3 ^= wh4;
155 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 3; wh2 ^= wh3;
156 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 4; wh1 ^= wh2;
157 1.1 tls g6 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 5; wh4 ^= wh1;
158 1.1 tls g0 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 6; wh3 ^= wh4;
159 1.1 tls g4 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 7; wh2 ^= wh3;
160 1.1 tls g8 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 8; wh1 ^= wh2;
161 1.1 tls
162 1.1 tls /* second 8 rounds */
163 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 9 ; g2 (wh1,wl1, wh1,wl1);
164 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 10; g6 (wh4,wl4, wh4,wl4);
165 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 11; g0 (wh3,wl3, wh3,wl3);
166 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 12; g4 (wh2,wl2, wh2,wl2);
167 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 13; g8 (wh1,wl1, wh1,wl1);
168 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 14; g2 (wh4,wl4, wh4,wl4);
169 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 15; g6 (wh3,wl3, wh3,wl3);
170 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 16; g0 (wh2,wl2, wh2,wl2);
171 1.1 tls
172 1.1 tls /* third 8 rounds */
173 1.1 tls g4 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 17; wh4 ^= wh1;
174 1.1 tls g8 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 18; wh3 ^= wh4;
175 1.1 tls g2 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 19; wh2 ^= wh3;
176 1.1 tls g6 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 20; wh1 ^= wh2;
177 1.1 tls g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 21; wh4 ^= wh1;
178 1.1 tls g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 22; wh3 ^= wh4;
179 1.1 tls g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 23; wh2 ^= wh3;
180 1.1 tls g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 24; wh1 ^= wh2;
181 1.1 tls
182 1.1 tls /* last 8 rounds */
183 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 25; g6 (wh1,wl1, wh1,wl1);
184 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 26; g0 (wh4,wl4, wh4,wl4);
185 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 27; g4 (wh3,wl3, wh3,wl3);
186 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 28; g8 (wh2,wl2, wh2,wl2);
187 1.1 tls wh2 ^= wh1; wl2 ^= wl1 ^ 29; g2 (wh1,wl1, wh1,wl1);
188 1.1 tls wh1 ^= wh4; wl1 ^= wl4 ^ 30; g6 (wh4,wl4, wh4,wl4);
189 1.1 tls wh4 ^= wh3; wl4 ^= wl3 ^ 31; g0 (wh3,wl3, wh3,wl3);
190 1.1 tls wh3 ^= wh2; wl3 ^= wl2 ^ 32; g4 (wh2,wl2, wh2,wl2);
191 1.1 tls
192 1.1 tls /* pack into byte vector */
193 1.1 tls cipher [0] = wh1; cipher [1] = wl1;
194 1.1 tls cipher [2] = wh2; cipher [3] = wl2;
195 1.1 tls cipher [4] = wh3; cipher [5] = wl3;
196 1.1 tls cipher [6] = wh4; cipher [7] = wl4;
197 1.1 tls }
198 1.1 tls
199 1.1 tls
200 1.1 tls void
201 1.1 tls skipjack_backwards (u_int8_t *cipher, u_int8_t *plain, u_int8_t **key_tables)
202 1.1 tls {
203 1.1 tls /* setup 4 16-bit portions */
204 1.1 tls u_int8_t wh1 = cipher[0]; u_int8_t wl1 = cipher[1];
205 1.1 tls u_int8_t wh2 = cipher[2]; u_int8_t wl2 = cipher[3];
206 1.1 tls u_int8_t wh3 = cipher[4]; u_int8_t wl3 = cipher[5];
207 1.1 tls u_int8_t wh4 = cipher[6]; u_int8_t wl4 = cipher[7];
208 1.1 tls
209 1.1 tls u_int8_t * k0 = key_tables [0];
210 1.1 tls u_int8_t * k1 = key_tables [1];
211 1.1 tls u_int8_t * k2 = key_tables [2];
212 1.1 tls u_int8_t * k3 = key_tables [3];
213 1.1 tls u_int8_t * k4 = key_tables [4];
214 1.1 tls u_int8_t * k5 = key_tables [5];
215 1.1 tls u_int8_t * k6 = key_tables [6];
216 1.1 tls u_int8_t * k7 = key_tables [7];
217 1.1 tls u_int8_t * k8 = key_tables [8];
218 1.1 tls u_int8_t * k9 = key_tables [9];
219 1.1 tls
220 1.1 tls /* first 8 rounds */
221 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 32; wh3 ^= wh2;
222 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 31; wh4 ^= wh3;
223 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 30; wh1 ^= wh4;
224 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 29; wh2 ^= wh1;
225 1.1 tls g8_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 28; wh3 ^= wh2;
226 1.1 tls g4_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 27; wh4 ^= wh3;
227 1.1 tls g0_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 26; wh1 ^= wh4;
228 1.1 tls g6_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 25; wh2 ^= wh1;
229 1.1 tls
230 1.1 tls /* second 8 rounds */
231 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 24; g2_inv (wh2,wl2, wh2,wl2);
232 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 23; g8_inv (wh3,wl3, wh3,wl3);
233 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 22; g4_inv (wh4,wl4, wh4,wl4);
234 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 21; g0_inv (wh1,wl1, wh1,wl1);
235 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 20; g6_inv (wh2,wl2, wh2,wl2);
236 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 19; g2_inv (wh3,wl3, wh3,wl3);
237 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 18; g8_inv (wh4,wl4, wh4,wl4);
238 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 17; g4_inv (wh1,wl1, wh1,wl1);
239 1.1 tls
240 1.1 tls /* third 8 rounds */
241 1.1 tls g0_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 16; wh3 ^= wh2;
242 1.1 tls g6_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 15; wh4 ^= wh3;
243 1.1 tls g2_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 14; wh1 ^= wh4;
244 1.1 tls g8_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 13; wh2 ^= wh1;
245 1.1 tls g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 12; wh3 ^= wh2;
246 1.1 tls g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 11; wh4 ^= wh3;
247 1.1 tls g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 10; wh1 ^= wh4;
248 1.1 tls g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 9; wh2 ^= wh1;
249 1.1 tls
250 1.1 tls /* last 8 rounds */
251 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 8; g8_inv (wh2,wl2, wh2,wl2);
252 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 7; g4_inv (wh3,wl3, wh3,wl3);
253 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 6; g0_inv (wh4,wl4, wh4,wl4);
254 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 5; g6_inv (wh1,wl1, wh1,wl1);
255 1.1 tls wh1 ^= wh2; wl1 ^= wl2 ^ 4; g2_inv (wh2,wl2, wh2,wl2);
256 1.1 tls wh2 ^= wh3; wl2 ^= wl3 ^ 3; g8_inv (wh3,wl3, wh3,wl3);
257 1.1 tls wh3 ^= wh4; wl3 ^= wl4 ^ 2; g4_inv (wh4,wl4, wh4,wl4);
258 1.1 tls wh4 ^= wh1; wl4 ^= wl1 ^ 1; g0_inv (wh1,wl1, wh1,wl1);
259 1.1 tls
260 1.1 tls /* pack into byte vector */
261 1.1 tls plain [0] = wh1; plain [1] = wl1;
262 1.1 tls plain [2] = wh2; plain [3] = wl2;
263 1.1 tls plain [4] = wh3; plain [5] = wl3;
264 1.1 tls plain [6] = wh4; plain [7] = wl4;
265 1.1 tls }
266 1.4 pgoyette
267 1.4 pgoyette MODULE(MODULE_CLASS_MISC, skipjack, NULL);
268 1.4 pgoyette
269 1.4 pgoyette static int
270 1.4 pgoyette skipjack_modcmd(modcmd_t cmd, void *opaque)
271 1.4 pgoyette {
272 1.4 pgoyette
273 1.4 pgoyette switch (cmd) {
274 1.4 pgoyette case MODULE_CMD_INIT:
275 1.4 pgoyette return 0;
276 1.4 pgoyette case MODULE_CMD_FINI:
277 1.4 pgoyette return 0;
278 1.4 pgoyette default:
279 1.4 pgoyette return ENOTTY;
280 1.4 pgoyette }
281 1.4 pgoyette }
282