Changeset 18 for trunk/src/sh_tiger1_64.c
 Timestamp:
 Jan 28, 2006, 9:07:52 PM (16 years ago)
 File:

 1 edited
Legend:
 Unmodified
 Added
 Removed

trunk/src/sh_tiger1_64.c
r1 r18 5 5 #include "config_xor.h" 6 6 7 #if defined(HAVE_LONG_64)  defined(HAVE_LONG_LONG_64) 8 9 /*@incondefs macroparens macroassign macroparams macrostmt @*/ 10 /*@fixedformalarray +charindex type paramuse predboolint exportlocal@*/ 7 #if defined(TIGER_64_BIT) 8 9 /* #if defined(HAVE_LONG_64)  defined(HAVE_LONG_LONG_64) */ 10 11 #undef USE_MEMSET 12 11 13 /* Big endian: */ 12 14 #ifdef WORDS_BIGENDIAN 13 15 #define BIG_ENDIAN 14 16 #endif 15 16 17 17 18 /* Tiger: A Fast New Hash Function … … 38 39 #if defined(HAVE_LONG_64) 39 40 typedef unsigned long int word64; 40 #el se41 #elif defined(HAVE_LONG_LONG_64) 41 42 typedef unsigned long long int word64; 43 #else 44 #error No 64 bit type found ! 42 45 #endif 43 46 … … 49 52 typedef unsigned short sh_word32; 50 53 #else 51 #error No 32 b ytetype found !54 #error No 32 bit type found ! 52 55 #endif 53 56 54 57 typedef unsigned char sh_byte; 55 58 56 /* Big endian: 57 #if !(defined(__alpha)defined(__i386__)defined(__vax__)) 58 #define BIG_ENDIAN 59 #endif 60 */ 61 62 /* The following macro denotes that an optimization */ 63 /* for Alpha is required. It is used only for */ 64 /* optimization of time. Otherwise it does nothing. */ 65 #ifdef __alpha 66 #define OPTIMIZE_FOR_ALPHA 67 #endif 68 69 /* NOTE that this code is NOT FULLY OPTIMIZED for any */ 70 /* machine. Assembly code might be much faster on some */ 71 /* machines, especially if the code is compiled with */ 72 /* gcc. */ 73 74 /* The number of passes of the hash function. */ 75 /* Three passes are recommended. */ 76 /* Use four passes when you need extra security. */ 77 /* Must be at least three. */ 59 #if defined(TIGER_OPT_ASM) 60 #define TIGER_ASM64_2 1 61 #else 62 #define TIGER_C 1 63 #endif 64 65 /* The number of passes of the hash function. */ 66 /* Three passes are recommended. */ 67 /* Use four passes when you need extra security. */ 68 /* Must be at least three. */ 78 69 #define PASSES 3 79 70 80 71 extern word64 tiger_table[4*256]; 72 73 /* Volatile can help if compiler is smart enough to use memory operand */ 74 static /*volatile*/ const word64 XOR_CONST1=0xA5A5A5A5A5A5A5A5LL; 75 static /*volatile*/ const word64 XOR_CONST2=0x0123456789ABCDEFLL; 81 76 82 77 #define t1 (tiger_table) … … 85 80 #define t4 (tiger_table+256*3) 86 81 82 #define pass_start 83 #define pass_end 84 85 86 87 87 #define save_abc \ 88 aa = a; \ 89 bb = b; \ 90 cc = c; 91 92 #ifdef OPTIMIZE_FOR_ALPHA 93 /* This is the official definition of round */ 94 #define round(a,b,c,x,mul) \ 95 c ^= x; \ 96 a = t1[((c)>>(0*8))&0xFF] ^ t2[((c)>>(2*8))&0xFF] ^ \ 97 t3[((c)>>(4*8))&0xFF] ^ t4[((c)>>(6*8))&0xFF] ; \ 98 b += t4[((c)>>(1*8))&0xFF] ^ t3[((c)>>(3*8))&0xFF] ^ \ 99 t2[((c)>>(5*8))&0xFF] ^ t1[((c)>>(7*8))&0xFF] ; \ 100 b *= mul; 101 #else 102 /* This code works faster when compiled on 32bit machines */ 103 /* (but works slower on Alpha) */ 104 #define round(a,b,c,x,mul) \ 105 c ^= x; \ 106 a = t1[(sh_byte)(c)] ^ \ 107 t2[(sh_byte)(((sh_word32)(c))>>(2*8))] ^ \ 108 t3[(sh_byte)((c)>>(4*8))] ^ \ 109 t4[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(2*8))] ; \ 110 b += t4[(sh_byte)(((sh_word32)(c))>>(1*8))] ^ \ 111 t3[(sh_byte)(((sh_word32)(c))>>(3*8))] ^ \ 112 t2[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(1*8))] ^ \ 113 t1[(sh_byte)(((sh_word32)((c)>>(4*8)))>>(3*8))]; \ 114 b *= mul; 115 #endif 116 117 #define pass(a,b,c,mul) \ 118 round(a,b,c,x0,mul) \ 119 round(b,c,a,x1,mul) \ 120 round(c,a,b,x2,mul) \ 121 round(a,b,c,x3,mul) \ 122 round(b,c,a,x4,mul) \ 123 round(c,a,b,x5,mul) \ 124 round(a,b,c,x6,mul) \ 125 round(b,c,a,x7,mul) 88 aa = a; \ 89 bb = b; \ 90 cc = c; 91 92 #ifdef TIGER_C 93 94 #define BN(x,n) (((x)>>((n)*8))&0xFF) 95 96 97 /* Depending on outer code one of these two can be better*/ 98 #define roundX(a,b,c,x) \ 99 c ^= x; \ 100 a = t1[BN(c,0)] ^ t2[BN(c,2)] ^ \ 101 t3[BN(c,4)] ^ t4[BN(c,6)] ; \ 102 b += t4[BN(c,1)] ^ t3[BN(c,3)] ^ \ 103 t2[BN(c,5)] ^ t1[BN(c,7)] ; 104 105 #define round5(a,b,c,x) roundX(a,b,c,x) b = b+b*4; 106 #define round7(a,b,c,x) roundX(a,b,c,x) b = b*8b; 107 #define round9(a,b,c,x) roundX(a,b,c,x) b = b+b*8; 108 109 #endif 110 111 112 #ifdef TIGER_OPT_ASM 113 114 #define MASK0 0xFFL 115 #define MASK8 0xFF00L 116 #define MASK16 0xFF0000L 117 #define MASK32 0xFF00000000LL 118 #define MASK40 0xFF0000000000LL 119 #define MASK48 0xFF000000000000LL 120 121 #define roundstart __asm__ ( 122 123 /* a will be moved into different reg each round 124 * using register substitution feature of GCC asm 125 * b will be moved in 2nd pass rounds only 126 */ 127 128 129 #define roundend(a,b,c,x) \ 130 : "+r" (a), "+r" (b), "+r" (c) \ 131 : "0" (a), "1" (b), "2" (c), "m" (x), "r" (&tiger_table),\ 132 "i" (MASK0), "i" (MASK8), "i" (MASK16), "r" (MASK32), "r" (MASK40), "r" (MASK48) \ 133 : "3", "%rax","%rbx","%rcx","%rdx","%rsi", "%edi", "%r8" ); 134 135 136 /* c ^= x; 137 a = t1[BN(c,0)] ^ t2[BN(c,2)] ^ 138 t3[BN(c,4)] ^ t4[BN(c,6)] ; 139 b += t4[BN(c,1)] ^ t3[BN(c,3)] ^ 140 t2[BN(c,5)] ^ t1[BN(c,7)] ; */ 141 142 #define roundX(a,b,c,x) \ 143 " movl %10, %%ebx \n"\ 144 " movq %11, %%rcx \n"\ 145 " movq %13, %%rdx \n"\ 146 " movq %6, %%r8 \n"\ 147 " xorq %%r8, %2 \n" \ 148 " andq %2, %%rbx \n"\ 149 " andq %2, %%rcx \n"\ 150 " andq %2, %%rdx \n"\ 151 " shrl $(163), %%ebx \n"\ 152 " shrq $(323), %%rcx \n"\ 153 " shrq $(483), %%rdx \n"\ 154 " movzbl %2b, %%eax \n"\ 155 " movzwl %2w, %%edi \n"\ 156 " movq (%7,%%rax,8), %%rsi \n"\ 157 " shrl $(8), %%edi \n" \ 158 " movq %2, %%rax \n" \ 159 " xorq (2048*1)(%7,%%rbx), %%rsi \n"\ 160 " movq %2, %%rbx \n"\ 161 " shrl $24, %%eax \n"\ 162 " andq %12, %%rbx \n"\ 163 " xorq (2048*2)(%7,%%rcx), %%rsi \n"\ 164 " shrq $(403), %%rbx \n"\ 165 " movq %2, %%rcx \n"\ 166 " xorq (2048*3)(%7,%%rdx), %%rsi \n"\ 167 " movq (2048*3)(%7,%%rdi,8), %%rdx \n"\ 168 " shrq $56, %%rcx \n"\ 169 " xorq (2048*2)(%7,%%rax,8), %%rdx \n"\ 170 " xorq (2048*1)(%7,%%rbx), %%rdx \n" \ 171 " subq %%rsi, %0 \n"\ 172 " xorq (%7,%%rcx,8), %%rdx \n"\ 173 " addq %%rdx, %1 \n" 174 175 #define round5(a,b,c,x) \ 176 roundstart \ 177 roundX(a,b,c,x) \ 178 /* b*=5; */ \ 179 "leaq (%1,%1,4), %1\n" \ 180 roundend(a,b,c,x) 181 182 183 #define round7(a,b,c,x) \ 184 roundstart \ 185 roundX(a,b,c,x) \ 186 roundend(a,b,c,x) \ 187 /* b*=7; */ \ 188 __asm__ ( \ 189 "leaq (%1,%1,8), %0\n" \ 190 "addq %1, %1 \n" \ 191 "subq %1, %0 " \ 192 :"=&r" (b): "r"(b): "1" ); 193 194 #define round9(a,b,c,x) \ 195 roundstart \ 196 roundX(a,b,c,x) \ 197 "leaq (%1,%1,8), %1\n" \ 198 roundend(a,b,c,x) 199 200 #endif 201 202 203 204 205 /* ============== Common macros ================== */ 126 206 127 207 #define key_schedule \ 128 x0 = x7 ^ 0xA5A5A5A5A5A5A5A5LL; \ 129 x1 ^= x0; \ 130 x2 += x1; \ 131 x3 = x2 ^ ((~x1)<<19); \ 132 x4 ^= x3; \ 133 x5 += x4; \ 134 x6 = x5 ^ ((~x4)>>23); \ 135 x7 ^= x6; \ 136 x0 += x7; \ 137 x1 = x0 ^ ((~x7)<<19); \ 138 x2 ^= x1; \ 139 x3 += x2; \ 140 x4 = x3 ^ ((~x2)>>23); \ 141 x5 ^= x4; \ 142 x6 += x5; \ 143 x7 = x6 ^ 0x0123456789ABCDEFLL; 208 x0 = x7 ^ XOR_CONST1; \ 209 x1 ^= x0; \ 210 x2 += x1;\ 211 x3 = x2 ^ ((~x1)<<19);\ 212 x4 ^= x3;\ 213 x5 += x4;\ 214 x6 = x5 ^ ((~x4)>>23); \ 215 x7 ^= x6; \ 216 x0 += x7; \ 217 x1 = x0 ^ ((~x7)<<19); \ 218 x2 ^= x1; \ 219 x3 += x2; \ 220 x4 = x3 ^ ((~x2)>>23); \ 221 x5 ^= x4; \ 222 x6 += x5; \ 223 x7 = x6 ^ XOR_CONST2; 224 225 #define pass5n(a,b,c) \ 226 round5(a,b,c,x0) \ 227 x0 = x7 ^ XOR_CONST1; \ 228 round5(b,c,a,x1) \ 229 x1 ^= x0; \ 230 round5(c,a,b,x2) \ 231 x2 += x1; \ 232 round5(a,b,c,x3) \ 233 x3 = x2 ^ ((~x1)<<19); \ 234 round5(b,c,a,x4) \ 235 x4 ^= x3; \ 236 round5(c,a,b,x5) \ 237 x5 += x4; \ 238 round5(a,b,c,x6) \ 239 x6 = x5 ^ ((~x4)>>23); \ 240 round5(b,c,a,x7) \ 241 x7 ^= x6; \ 242 x0 += x7; \ 243 x1 = x0 ^ ((~x7)<<19); \ 244 x2 ^= x1; \ 245 x3 += x2; \ 246 x4 = x3 ^ ((~x2)>>23); \ 247 x5 ^= x4; \ 248 x6 += x5; \ 249 x7 = x6 ^ XOR_CONST2; 250 251 #define pass7n(a,b,c) \ 252 round7(a,b,c,x0) \ 253 x0 = x7 ^ XOR_CONST1; \ 254 round7(b,c,a,x1) \ 255 x1 ^= x0; \ 256 round7(c,a,b,x2) \ 257 x2 += x1; \ 258 round7(a,b,c,x3) \ 259 x3 = x2 ^ ((~x1)<<19); \ 260 round7(b,c,a,x4) \ 261 x4 ^= x3; \ 262 round7(c,a,b,x5) \ 263 x5 += x4; \ 264 round7(a,b,c,x6) \ 265 x6 = x5 ^ ((~x4)>>23); \ 266 round7(b,c,a,x7) \ 267 x7 ^= x6; \ 268 x0 += x7; \ 269 x1 = x0 ^ ((~x7)<<19); \ 270 x2 ^= x1; \ 271 x3 += x2; \ 272 x4 = x3 ^ ((~x2)>>23); \ 273 x5 ^= x4; \ 274 x6 += x5; \ 275 x7 = x6 ^ XOR_CONST2; 276 277 #define pass5(a,b,c) \ 278 pass_start \ 279 round5(a,b,c,x0) \ 280 round5(b,c,a,x1) \ 281 round5(c,a,b,x2) \ 282 round5(a,b,c,x3) \ 283 round5(b,c,a,x4) \ 284 round5(c,a,b,x5) \ 285 round5(a,b,c,x6) \ 286 round5(b,c,a,x7) \ 287 pass_end 288 289 #define pass7(a,b,c) \ 290 pass_start \ 291 round7(a,b,c,x0) \ 292 round7(b,c,a,x1) \ 293 round7(c,a,b,x2) \ 294 round7(a,b,c,x3) \ 295 round7(b,c,a,x4) \ 296 round7(c,a,b,x5) \ 297 round7(a,b,c,x6) \ 298 round7(b,c,a,x7) \ 299 pass_end 300 301 302 #define pass9(a,b,c) \ 303 pass_start \ 304 round9(a,b,c,x0) \ 305 round9(b,c,a,x1) \ 306 round9(c,a,b,x2) \ 307 round9(a,b,c,x3) \ 308 round9(b,c,a,x4) \ 309 round9(c,a,b,x5) \ 310 round9(a,b,c,x6) \ 311 round9(b,c,a,x7) \ 312 pass_end 144 313 145 314 #define feedforward \ 146 a ^= aa; \ 147 b = bb; \ 148 c += cc; 149 150 #ifdef OPTIMIZE_FOR_ALPHA 151 /* The loop is unrolled: works better on Alpha */ 315 a ^= aa; \ 316 b = bb; \ 317 c += cc; 318 319 320 /* This version works ok with C variant and also with new asm version 321 * that just wastes a register r8 322 * reason? who knows, write forwarding is faster than keeping value 323 * in register? :) 324 */ 152 325 #define compress \ 153 save_abc \ 154 pass(a,b,c,5) \ 155 key_schedule \ 156 pass(c,a,b,7) \ 157 key_schedule \ 158 pass(b,c,a,9) \ 159 for(pass_no=3; pass_no<PASSES; pass_no++) { \ 160 key_schedule \ 161 pass(a,b,c,9) \ 162 tmpa=a; a=c; c=b; b=tmpa;} \ 163 feedforward 164 #else 165 /* loop: works better on PC and Sun (smaller cache?) */ 166 #define compress \ 167 save_abc \ 168 for(pass_no=0; pass_no<PASSES; pass_no++) { \ 169 if(pass_no != 0) {key_schedule} \ 170 pass(a,b,c,(pass_no==0?5:pass_no==1?7:9)); \ 171 tmpa=a; a=c; c=b; b=tmpa;} \ 172 feedforward 173 #endif 326 save_abc \ 327 pass5n(a,b,c) \ 328 pass7n(c,a,b) \ 329 pass9(b,c,a) \ 330 for(pass_no=3; pass_no<PASSES; pass_no++) { \ 331 key_schedule \ 332 pass9(a,b,c) \ 333 tmpa=a; a=c; c=b; b=tmpa; \ 334 } \ 335 feedforward 336 337 #define compress_old \ 338 save_abc \ 339 pass5(a,b,c) \ 340 key_schedule \ 341 pass7(c,a,b) \ 342 key_schedule \ 343 pass9(b,c,a) \ 344 for(pass_no=3; pass_no<PASSES; pass_no++) { \ 345 key_schedule \ 346 pass9(a,b,c) \ 347 tmpa=a; a=c; c=b; b=tmpa; \ 348 } \ 349 feedforward 174 350 175 351 #define tiger_compress_macro(str, state) \ 176 352 { \ 177 register word64 a, b, c, tmpa; \ 353 register word64 a, b, c; \ 354 register word64 tmpa; \ 178 355 word64 aa, bb, cc; \ 179 registerword64 x0, x1, x2, x3, x4, x5, x6, x7; \356 word64 x0, x1, x2, x3, x4, x5, x6, x7; \ 180 357 int pass_no; \ 181 358 \ … … 194 371 } 195 372 196 /* The compress function is a function. Requires smaller cache? */197 373 void tiger_compress(word64 *str, word64 state[3]) 198 374 { 199 #ifndef S_SPLINT_S200 375 tiger_compress_macro(((word64*)str), ((word64*)state)); 201 #endif202 376 } 203 204 #ifdef OPTIMIZE_FOR_ALPHA205 /* The compress function is inlined: works better on Alpha. */206 /* Still leaves the function above in the code, in case some other */207 /* module calls it directly. */208 #define tiger_compress(str, state) \209 tiger_compress_macro(((word64*)str), ((word64*)state))210 #endif211 377 212 378 void tiger_t(word64 *str, word64 length, word64 res[3]) … … 220 386 221 387 /* 222 res[0]=0x0123456789ABCDEFLL;223 res[1]=0xFEDCBA9876543210LL;224 res[2]=0xF096A5B4C3B2E187LL;225 */388 * res[0]=0x0123456789ABCDEFLL; 389 * res[1]=0xFEDCBA9876543210LL; 390 * res[2]=0xF096A5B4C3B2E187LL; 391 */ 226 392 227 393 for(i=length; i>=64; i=64) … … 229 395 #ifdef BIG_ENDIAN 230 396 for(j=0; j<64; j++) 231 397 temp[j^7] = ((sh_byte*)str)[j]; 232 398 tiger_compress(((word64*)temp), res); 233 399 #else … … 236 402 str += 8; 237 403 } 238 239 404 } 240 405 … … 246 411 247 412 /* 248 res[0]=0x0123456789ABCDEFLL;249 res[1]=0xFEDCBA9876543210LL;250 res[2]=0xF096A5B4C3B2E187LL;251 */413 * res[0]=0x0123456789ABCDEFLL; 414 * res[1]=0xFEDCBA9876543210LL; 415 * res[2]=0xF096A5B4C3B2E187LL; 416 */ 252 417 253 418 for(i=length; i>=64; i=64) … … 255 420 #ifdef BIG_ENDIAN 256 421 for(j=0; j<64; j++) 257 422 temp[j^7] = ((sh_byte*)str)[j]; 258 423 tiger_compress(((word64*)temp), res); 259 424 #else … … 272 437 temp[j^7] = 0; 273 438 #else 439 440 #ifndef USE_MEMSET 274 441 for(j=0; j<i; j++) 275 442 temp[j] = ((sh_byte*)str)[j]; 276 443 #else 444 memcpy( temp, str, j=i ); 445 #endif 277 446 temp[j++] = 0x01; 278 447 for(; j&7; j++) 279 temp[j] = 0; 280 #endif 448 temp[j] = 0; 449 450 #endif 451 281 452 if(j>56) 282 453 { 454 #ifndef USE_MEMSET 283 455 for(; j<64; j++) 284 456 temp[j] = 0; 457 #else 458 memset( temp+j, 0, 64j); 459 #endif 285 460 tiger_compress(((word64*)temp), res); 286 461 j=0; 287 462 } 288 463 464 #ifndef USE_MEMSET 289 465 for(; j<56; j++) 290 466 temp[j] = 0; 467 #else 468 memset( temp+j, 0, 56j); 469 #endif 470 291 471 ((word64*)(&(temp[56])))[0] = ((word64)length)<<3; 292 472 tiger_compress(((word64*)temp), res); 293 473 } 294 474 295 #else 296 297 void dummy_1_64 (int a) 298 { 299 (void) a; 300 return; 301 } 302 303 #endif 304 305 306 307 308 309 310 311 312 475 #endif
Note: See TracChangeset
for help on using the changeset viewer.