Home | History | Annotate | Line # | Download | only in skipjack
skipjack.c revision 1.1
      1  1.1  tls /*	$NetBSD: skipjack.c,v 1.1 2003/11/16 12:07:50 tls Exp $ */
      2  1.1  tls /*	$OpenBSD: skipjack.c,v 1.3 2001/05/05 00:31:34 angelos Exp $	*/
      3  1.1  tls 
      4  1.1  tls /*
      5  1.1  tls  * Further optimized test implementation of SKIPJACK algorithm
      6  1.1  tls  * Mark Tillotson <markt (at) chaos.org.uk>, 25 June 98
      7  1.1  tls  * Optimizations suit RISC (lots of registers) machine best.
      8  1.1  tls  *
      9  1.1  tls  * based on unoptimized implementation of
     10  1.1  tls  * Panu Rissanen <bande (at) lut.fi> 960624
     11  1.1  tls  *
     12  1.1  tls  * SKIPJACK and KEA Algorithm Specifications
     13  1.1  tls  * Version 2.0
     14  1.1  tls  * 29 May 1998
     15  1.1  tls */
     16  1.1  tls 
     17  1.1  tls #include <sys/cdefs.h>
     18  1.1  tls __KERNEL_RCSID(0, "$NetBSD: skipjack.c,v 1.1 2003/11/16 12:07:50 tls Exp $");
     19  1.1  tls 
     20  1.1  tls #include <sys/param.h>
     21  1.1  tls #include <crypto/skipjack/skipjack.h>
     22  1.1  tls #include <sys/malloc.h>
     23  1.1  tls #include <opencrypto/cryptodev.h>
     24  1.1  tls 
     25  1.1  tls static const u_int8_t ftable[0x100] =
     26  1.1  tls {
     27  1.1  tls 	0xa3, 0xd7, 0x09, 0x83, 0xf8, 0x48, 0xf6, 0xf4,
     28  1.1  tls 	0xb3, 0x21, 0x15, 0x78, 0x99, 0xb1, 0xaf, 0xf9,
     29  1.1  tls 	0xe7, 0x2d, 0x4d, 0x8a, 0xce, 0x4c, 0xca, 0x2e,
     30  1.1  tls 	0x52, 0x95, 0xd9, 0x1e, 0x4e, 0x38, 0x44, 0x28,
     31  1.1  tls 	0x0a, 0xdf, 0x02, 0xa0, 0x17, 0xf1, 0x60, 0x68,
     32  1.1  tls 	0x12, 0xb7, 0x7a, 0xc3, 0xe9, 0xfa, 0x3d, 0x53,
     33  1.1  tls 	0x96, 0x84, 0x6b, 0xba, 0xf2, 0x63, 0x9a, 0x19,
     34  1.1  tls 	0x7c, 0xae, 0xe5, 0xf5, 0xf7, 0x16, 0x6a, 0xa2,
     35  1.1  tls 	0x39, 0xb6, 0x7b, 0x0f, 0xc1, 0x93, 0x81, 0x1b,
     36  1.1  tls 	0xee, 0xb4, 0x1a, 0xea, 0xd0, 0x91, 0x2f, 0xb8,
     37  1.1  tls 	0x55, 0xb9, 0xda, 0x85, 0x3f, 0x41, 0xbf, 0xe0,
     38  1.1  tls 	0x5a, 0x58, 0x80, 0x5f, 0x66, 0x0b, 0xd8, 0x90,
     39  1.1  tls 	0x35, 0xd5, 0xc0, 0xa7, 0x33, 0x06, 0x65, 0x69,
     40  1.1  tls 	0x45, 0x00, 0x94, 0x56, 0x6d, 0x98, 0x9b, 0x76,
     41  1.1  tls 	0x97, 0xfc, 0xb2, 0xc2, 0xb0, 0xfe, 0xdb, 0x20,
     42  1.1  tls 	0xe1, 0xeb, 0xd6, 0xe4, 0xdd, 0x47, 0x4a, 0x1d,
     43  1.1  tls 	0x42, 0xed, 0x9e, 0x6e, 0x49, 0x3c, 0xcd, 0x43,
     44  1.1  tls 	0x27, 0xd2, 0x07, 0xd4, 0xde, 0xc7, 0x67, 0x18,
     45  1.1  tls 	0x89, 0xcb, 0x30, 0x1f, 0x8d, 0xc6, 0x8f, 0xaa,
     46  1.1  tls 	0xc8, 0x74, 0xdc, 0xc9, 0x5d, 0x5c, 0x31, 0xa4,
     47  1.1  tls 	0x70, 0x88, 0x61, 0x2c, 0x9f, 0x0d, 0x2b, 0x87,
     48  1.1  tls 	0x50, 0x82, 0x54, 0x64, 0x26, 0x7d, 0x03, 0x40,
     49  1.1  tls 	0x34, 0x4b, 0x1c, 0x73, 0xd1, 0xc4, 0xfd, 0x3b,
     50  1.1  tls 	0xcc, 0xfb, 0x7f, 0xab, 0xe6, 0x3e, 0x5b, 0xa5,
     51  1.1  tls 	0xad, 0x04, 0x23, 0x9c, 0x14, 0x51, 0x22, 0xf0,
     52  1.1  tls 	0x29, 0x79, 0x71, 0x7e, 0xff, 0x8c, 0x0e, 0xe2,
     53  1.1  tls 	0x0c, 0xef, 0xbc, 0x72, 0x75, 0x6f, 0x37, 0xa1,
     54  1.1  tls 	0xec, 0xd3, 0x8e, 0x62, 0x8b, 0x86, 0x10, 0xe8,
     55  1.1  tls 	0x08, 0x77, 0x11, 0xbe, 0x92, 0x4f, 0x24, 0xc5,
     56  1.1  tls 	0x32, 0x36, 0x9d, 0xcf, 0xf3, 0xa6, 0xbb, 0xac,
     57  1.1  tls 	0x5e, 0x6c, 0xa9, 0x13, 0x57, 0x25, 0xb5, 0xe3,
     58  1.1  tls 	0xbd, 0xa8, 0x3a, 0x01, 0x05, 0x59, 0x2a, 0x46
     59  1.1  tls };
     60  1.1  tls 
     61  1.1  tls /*
     62  1.1  tls  * For each key byte generate a table to represent the function
     63  1.1  tls  *    ftable [in ^ keybyte]
     64  1.1  tls  *
     65  1.1  tls  * These tables used to save an XOR in each stage of the G-function
     66  1.1  tls  * the tables are hopefully pointed to by register allocated variables
     67  1.1  tls  * k0, k1..k9
     68  1.1  tls  */
     69  1.1  tls void
     70  1.1  tls subkey_table_gen (const u_int8_t *key, u_int8_t **key_tables)
     71  1.1  tls {
     72  1.1  tls 	int i, k;
     73  1.1  tls 
     74  1.1  tls 	for (k = 0; k < 10; k++) {
     75  1.1  tls 		u_int8_t   key_byte = key [k];
     76  1.1  tls 		u_int8_t * table = key_tables[k];
     77  1.1  tls 		for (i = 0; i < 0x100; i++)
     78  1.1  tls 			table [i] = ftable [i ^ key_byte];
     79  1.1  tls 	}
     80  1.1  tls }
     81  1.1  tls 
     82  1.1  tls 
     83  1.1  tls #define g(k0, k1, k2, k3, ih, il, oh, ol) \
     84  1.1  tls { \
     85  1.1  tls 	oh = k##k0 [il] ^ ih; \
     86  1.1  tls 	ol = k##k1 [oh] ^ il; \
     87  1.1  tls 	oh = k##k2 [ol] ^ oh; \
     88  1.1  tls 	ol = k##k3 [oh] ^ ol; \
     89  1.1  tls }
     90  1.1  tls 
     91  1.1  tls #define g0(ih, il, oh, ol) g(0, 1, 2, 3, ih, il, oh, ol)
     92  1.1  tls #define g4(ih, il, oh, ol) g(4, 5, 6, 7, ih, il, oh, ol)
     93  1.1  tls #define g8(ih, il, oh, ol) g(8, 9, 0, 1, ih, il, oh, ol)
     94  1.1  tls #define g2(ih, il, oh, ol) g(2, 3, 4, 5, ih, il, oh, ol)
     95  1.1  tls #define g6(ih, il, oh, ol) g(6, 7, 8, 9, ih, il, oh, ol)
     96  1.1  tls 
     97  1.1  tls 
     98  1.1  tls #define g_inv(k0, k1, k2, k3, ih, il, oh, ol) \
     99  1.1  tls { \
    100  1.1  tls 	ol = k##k3 [ih] ^ il; \
    101  1.1  tls 	oh = k##k2 [ol] ^ ih; \
    102  1.1  tls 	ol = k##k1 [oh] ^ ol; \
    103  1.1  tls 	oh = k##k0 [ol] ^ oh; \
    104  1.1  tls }
    105  1.1  tls 
    106  1.1  tls 
    107  1.1  tls #define g0_inv(ih, il, oh, ol) g_inv(0, 1, 2, 3, ih, il, oh, ol)
    108  1.1  tls #define g4_inv(ih, il, oh, ol) g_inv(4, 5, 6, 7, ih, il, oh, ol)
    109  1.1  tls #define g8_inv(ih, il, oh, ol) g_inv(8, 9, 0, 1, ih, il, oh, ol)
    110  1.1  tls #define g2_inv(ih, il, oh, ol) g_inv(2, 3, 4, 5, ih, il, oh, ol)
    111  1.1  tls #define g6_inv(ih, il, oh, ol) g_inv(6, 7, 8, 9, ih, il, oh, ol)
    112  1.1  tls 
    113  1.1  tls /* optimized version of Skipjack algorithm
    114  1.1  tls  *
    115  1.1  tls  * the appropriate g-function is inlined for each round
    116  1.1  tls  *
    117  1.1  tls  * the data movement is minimized by rotating the names of the
    118  1.1  tls  * variables w1..w4, not their contents (saves 3 moves per round)
    119  1.1  tls  *
    120  1.1  tls  * the loops are completely unrolled (needed to staticize choice of g)
    121  1.1  tls  *
    122  1.1  tls  * compiles to about 470 instructions on a Sparc (gcc -O)
    123  1.1  tls  * which is about 58 instructions per byte, 14 per round.
    124  1.1  tls  * gcc seems to leave in some unnecessary and with 0xFF operations
    125  1.1  tls  * but only in the latter part of the functions.  Perhaps it
    126  1.1  tls  * runs out of resources to properly optimize long inlined function?
    127  1.1  tls  * in theory should get about 11 instructions per round, not 14
    128  1.1  tls  */
    129  1.1  tls 
    130  1.1  tls void
    131  1.1  tls skipjack_forwards(u_int8_t *plain, u_int8_t *cipher, u_int8_t **key_tables)
    132  1.1  tls {
    133  1.1  tls 	u_int8_t wh1 = plain[0];  u_int8_t wl1 = plain[1];
    134  1.1  tls 	u_int8_t wh2 = plain[2];  u_int8_t wl2 = plain[3];
    135  1.1  tls 	u_int8_t wh3 = plain[4];  u_int8_t wl3 = plain[5];
    136  1.1  tls 	u_int8_t wh4 = plain[6];  u_int8_t wl4 = plain[7];
    137  1.1  tls 
    138  1.1  tls 	u_int8_t * k0 = key_tables [0];
    139  1.1  tls 	u_int8_t * k1 = key_tables [1];
    140  1.1  tls 	u_int8_t * k2 = key_tables [2];
    141  1.1  tls 	u_int8_t * k3 = key_tables [3];
    142  1.1  tls 	u_int8_t * k4 = key_tables [4];
    143  1.1  tls 	u_int8_t * k5 = key_tables [5];
    144  1.1  tls 	u_int8_t * k6 = key_tables [6];
    145  1.1  tls 	u_int8_t * k7 = key_tables [7];
    146  1.1  tls 	u_int8_t * k8 = key_tables [8];
    147  1.1  tls 	u_int8_t * k9 = key_tables [9];
    148  1.1  tls 
    149  1.1  tls 	/* first 8 rounds */
    150  1.1  tls 	g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 1; wh4 ^= wh1;
    151  1.1  tls 	g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 2; wh3 ^= wh4;
    152  1.1  tls 	g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 3; wh2 ^= wh3;
    153  1.1  tls 	g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 4; wh1 ^= wh2;
    154  1.1  tls 	g6 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 5; wh4 ^= wh1;
    155  1.1  tls 	g0 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 6; wh3 ^= wh4;
    156  1.1  tls 	g4 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 7; wh2 ^= wh3;
    157  1.1  tls 	g8 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 8; wh1 ^= wh2;
    158  1.1  tls 
    159  1.1  tls 	/* second 8 rounds */
    160  1.1  tls 	wh2 ^= wh1; wl2 ^= wl1 ^ 9 ; g2 (wh1,wl1, wh1,wl1);
    161  1.1  tls 	wh1 ^= wh4; wl1 ^= wl4 ^ 10; g6 (wh4,wl4, wh4,wl4);
    162  1.1  tls 	wh4 ^= wh3; wl4 ^= wl3 ^ 11; g0 (wh3,wl3, wh3,wl3);
    163  1.1  tls 	wh3 ^= wh2; wl3 ^= wl2 ^ 12; g4 (wh2,wl2, wh2,wl2);
    164  1.1  tls 	wh2 ^= wh1; wl2 ^= wl1 ^ 13; g8 (wh1,wl1, wh1,wl1);
    165  1.1  tls 	wh1 ^= wh4; wl1 ^= wl4 ^ 14; g2 (wh4,wl4, wh4,wl4);
    166  1.1  tls 	wh4 ^= wh3; wl4 ^= wl3 ^ 15; g6 (wh3,wl3, wh3,wl3);
    167  1.1  tls 	wh3 ^= wh2; wl3 ^= wl2 ^ 16; g0 (wh2,wl2, wh2,wl2);
    168  1.1  tls 
    169  1.1  tls 	/* third 8 rounds */
    170  1.1  tls 	g4 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 17; wh4 ^= wh1;
    171  1.1  tls 	g8 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 18; wh3 ^= wh4;
    172  1.1  tls 	g2 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 19; wh2 ^= wh3;
    173  1.1  tls 	g6 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 20; wh1 ^= wh2;
    174  1.1  tls 	g0 (wh1,wl1, wh1,wl1); wl4 ^= wl1 ^ 21; wh4 ^= wh1;
    175  1.1  tls 	g4 (wh4,wl4, wh4,wl4); wl3 ^= wl4 ^ 22; wh3 ^= wh4;
    176  1.1  tls 	g8 (wh3,wl3, wh3,wl3); wl2 ^= wl3 ^ 23; wh2 ^= wh3;
    177  1.1  tls 	g2 (wh2,wl2, wh2,wl2); wl1 ^= wl2 ^ 24; wh1 ^= wh2;
    178  1.1  tls 
    179  1.1  tls 	/* last 8 rounds */
    180  1.1  tls 	wh2 ^= wh1; wl2 ^= wl1 ^ 25; g6 (wh1,wl1, wh1,wl1);
    181  1.1  tls 	wh1 ^= wh4; wl1 ^= wl4 ^ 26; g0 (wh4,wl4, wh4,wl4);
    182  1.1  tls 	wh4 ^= wh3; wl4 ^= wl3 ^ 27; g4 (wh3,wl3, wh3,wl3);
    183  1.1  tls 	wh3 ^= wh2; wl3 ^= wl2 ^ 28; g8 (wh2,wl2, wh2,wl2);
    184  1.1  tls 	wh2 ^= wh1; wl2 ^= wl1 ^ 29; g2 (wh1,wl1, wh1,wl1);
    185  1.1  tls 	wh1 ^= wh4; wl1 ^= wl4 ^ 30; g6 (wh4,wl4, wh4,wl4);
    186  1.1  tls 	wh4 ^= wh3; wl4 ^= wl3 ^ 31; g0 (wh3,wl3, wh3,wl3);
    187  1.1  tls 	wh3 ^= wh2; wl3 ^= wl2 ^ 32; g4 (wh2,wl2, wh2,wl2);
    188  1.1  tls 
    189  1.1  tls 	/* pack into byte vector */
    190  1.1  tls 	cipher [0] = wh1;  cipher [1] = wl1;
    191  1.1  tls 	cipher [2] = wh2;  cipher [3] = wl2;
    192  1.1  tls 	cipher [4] = wh3;  cipher [5] = wl3;
    193  1.1  tls 	cipher [6] = wh4;  cipher [7] = wl4;
    194  1.1  tls }
    195  1.1  tls 
    196  1.1  tls 
    197  1.1  tls void
    198  1.1  tls skipjack_backwards (u_int8_t *cipher, u_int8_t *plain, u_int8_t **key_tables)
    199  1.1  tls {
    200  1.1  tls 	/* setup 4 16-bit portions */
    201  1.1  tls 	u_int8_t wh1 = cipher[0];  u_int8_t wl1 = cipher[1];
    202  1.1  tls 	u_int8_t wh2 = cipher[2];  u_int8_t wl2 = cipher[3];
    203  1.1  tls 	u_int8_t wh3 = cipher[4];  u_int8_t wl3 = cipher[5];
    204  1.1  tls 	u_int8_t wh4 = cipher[6];  u_int8_t wl4 = cipher[7];
    205  1.1  tls 
    206  1.1  tls 	u_int8_t * k0 = key_tables [0];
    207  1.1  tls 	u_int8_t * k1 = key_tables [1];
    208  1.1  tls 	u_int8_t * k2 = key_tables [2];
    209  1.1  tls 	u_int8_t * k3 = key_tables [3];
    210  1.1  tls 	u_int8_t * k4 = key_tables [4];
    211  1.1  tls 	u_int8_t * k5 = key_tables [5];
    212  1.1  tls 	u_int8_t * k6 = key_tables [6];
    213  1.1  tls 	u_int8_t * k7 = key_tables [7];
    214  1.1  tls 	u_int8_t * k8 = key_tables [8];
    215  1.1  tls 	u_int8_t * k9 = key_tables [9];
    216  1.1  tls 
    217  1.1  tls 	/* first 8 rounds */
    218  1.1  tls 	g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 32; wh3 ^= wh2;
    219  1.1  tls 	g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 31; wh4 ^= wh3;
    220  1.1  tls 	g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 30; wh1 ^= wh4;
    221  1.1  tls 	g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 29; wh2 ^= wh1;
    222  1.1  tls 	g8_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 28; wh3 ^= wh2;
    223  1.1  tls 	g4_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 27; wh4 ^= wh3;
    224  1.1  tls 	g0_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 26; wh1 ^= wh4;
    225  1.1  tls 	g6_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 25; wh2 ^= wh1;
    226  1.1  tls 
    227  1.1  tls 	/* second 8 rounds */
    228  1.1  tls 	wh1 ^= wh2; wl1 ^= wl2 ^ 24; g2_inv (wh2,wl2, wh2,wl2);
    229  1.1  tls 	wh2 ^= wh3; wl2 ^= wl3 ^ 23; g8_inv (wh3,wl3, wh3,wl3);
    230  1.1  tls 	wh3 ^= wh4; wl3 ^= wl4 ^ 22; g4_inv (wh4,wl4, wh4,wl4);
    231  1.1  tls 	wh4 ^= wh1; wl4 ^= wl1 ^ 21; g0_inv (wh1,wl1, wh1,wl1);
    232  1.1  tls 	wh1 ^= wh2; wl1 ^= wl2 ^ 20; g6_inv (wh2,wl2, wh2,wl2);
    233  1.1  tls 	wh2 ^= wh3; wl2 ^= wl3 ^ 19; g2_inv (wh3,wl3, wh3,wl3);
    234  1.1  tls 	wh3 ^= wh4; wl3 ^= wl4 ^ 18; g8_inv (wh4,wl4, wh4,wl4);
    235  1.1  tls 	wh4 ^= wh1; wl4 ^= wl1 ^ 17; g4_inv (wh1,wl1, wh1,wl1);
    236  1.1  tls 
    237  1.1  tls 	/* third 8 rounds */
    238  1.1  tls 	g0_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 16; wh3 ^= wh2;
    239  1.1  tls 	g6_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 15; wh4 ^= wh3;
    240  1.1  tls 	g2_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 14; wh1 ^= wh4;
    241  1.1  tls 	g8_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 13; wh2 ^= wh1;
    242  1.1  tls 	g4_inv (wh2,wl2, wh2,wl2); wl3 ^= wl2 ^ 12; wh3 ^= wh2;
    243  1.1  tls 	g0_inv (wh3,wl3, wh3,wl3); wl4 ^= wl3 ^ 11; wh4 ^= wh3;
    244  1.1  tls 	g6_inv (wh4,wl4, wh4,wl4); wl1 ^= wl4 ^ 10; wh1 ^= wh4;
    245  1.1  tls 	g2_inv (wh1,wl1, wh1,wl1); wl2 ^= wl1 ^ 9;  wh2 ^= wh1;
    246  1.1  tls 
    247  1.1  tls 	/* last 8 rounds */
    248  1.1  tls 	wh1 ^= wh2; wl1 ^= wl2 ^ 8; g8_inv (wh2,wl2, wh2,wl2);
    249  1.1  tls 	wh2 ^= wh3; wl2 ^= wl3 ^ 7; g4_inv (wh3,wl3, wh3,wl3);
    250  1.1  tls 	wh3 ^= wh4; wl3 ^= wl4 ^ 6; g0_inv (wh4,wl4, wh4,wl4);
    251  1.1  tls 	wh4 ^= wh1; wl4 ^= wl1 ^ 5; g6_inv (wh1,wl1, wh1,wl1);
    252  1.1  tls 	wh1 ^= wh2; wl1 ^= wl2 ^ 4; g2_inv (wh2,wl2, wh2,wl2);
    253  1.1  tls 	wh2 ^= wh3; wl2 ^= wl3 ^ 3; g8_inv (wh3,wl3, wh3,wl3);
    254  1.1  tls 	wh3 ^= wh4; wl3 ^= wl4 ^ 2; g4_inv (wh4,wl4, wh4,wl4);
    255  1.1  tls 	wh4 ^= wh1; wl4 ^= wl1 ^ 1; g0_inv (wh1,wl1, wh1,wl1);
    256  1.1  tls 
    257  1.1  tls 	/* pack into byte vector */
    258  1.1  tls 	plain [0] = wh1;  plain [1] = wl1;
    259  1.1  tls 	plain [2] = wh2;  plain [3] = wl2;
    260  1.1  tls 	plain [4] = wh3;  plain [5] = wl3;
    261  1.1  tls 	plain [6] = wh4;  plain [7] = wl4;
    262  1.1  tls }
    263