09-07-2017, 11:24 PM 
		
	
	
		Hello!
I am trying to implement SHA3-256 with multiple iterations, based on m05000_a0.cl. The kernel code is quite difficult to understand. Perhaps someone is interested in this and would like to help. I tried something like the code below. Any ideas? I am guessing at the details. The scheme itself is quite simple: the first iteration computes the SHA3-256 of the password, and each following iteration computes the SHA3-256 of the previous hash. I am struggling with the details and do not fully understand the implementation.
	
	
	
	
	
I am trying to implement SHA3-256 with multiple iterations, based on m05000_a0.cl. The kernel code is quite difficult to understand. Perhaps someone is interested in this and would like to help. I tried something like the code below. Any ideas? I am guessing at the details. The scheme itself is quite simple: the first iteration computes the SHA3-256 of the password, and each following iteration computes the SHA3-256 of the previous hash. I am struggling with the details and do not fully understand the implementation.

Code:
__
kernel void m05000_m04 (...)
{
 /**
  * modifier
  */
 const u32 lid = get_local_id (0);
 /**
  * base
  */
 const u32 gid = get_global_id (0);
 if (gid >= gid_max) return;
 u32 pw_buf0[4];
 u32 pw_buf1[4];
 pw_buf0[0] = pws[gid].i[0];
 pw_buf0[1] = pws[gid].i[1];
 pw_buf0[2] = pws[gid].i[2];
 pw_buf0[3] = pws[gid].i[3];
 pw_buf1[0] = pws[gid].i[4];
 pw_buf1[1] = pws[gid].i[5];
 pw_buf1[2] = pws[gid].i[6];
 pw_buf1[3] = pws[gid].i[7];
 const u32 pw_len = pws[gid].pw_len;
 /**
  * constants
  */
 const u8 keccakf_rotc[24] =
 {
    1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
   27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44
 };
 const u8 keccakf_piln[24] =
 {
   10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
   15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1
 };
 /**
  * 0x80 keccak, very special
  */
 const u32 mdlen = salt_bufs[salt_pos].keccak_mdlen;
/* ###################### NEW */
 /*
  * All the SHA-3 functions share code structure.  They
  * differ only in the size of the chunks they split the message into:
  * for digest size d, they are split into chunks of 200 - mdlen bytes.
  * So for several rounds mdlen = 32 (this is 256bit). For add80w we get 16
  */
 const u32 rsiz = 200 - (2 * mdlen);
 const u32 add80w = (rsiz - 1) / 8;
 const u32 add80w_256bit_input = 16; /* ###################### NEW */
 /**
  * loop
  */
 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
 {
   u32x w0[4] = { 0 };
   u32x w1[4] = { 0 };
   u32x w2[4] = { 0 };
   u32x w3[4] = { 0 };
   
   u64x st_roundResult[4] = {0};
   
   const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);
   append_0x01_2x4_VV (w0, w1, out_len);
   /**
    * Keccak
    */
   u64x st[25];
   int round;
   int hashIter; /* ###################### NEW */
   for (hashIter = 0; hashIter < HASHITER_NUM; hashIter++)
   {
       if(hashIter == 0)
       {
           st[ 0] = hl32_to_64 (w0[1], w0[0]);
           st[ 1] = hl32_to_64 (w0[3], w0[2]);
           st[ 2] = hl32_to_64 (w1[1], w1[0]);
           st[ 3] = hl32_to_64 (w1[3], w1[2]);
           st[ 4] = 0;
           st[ 5] = 0;
           st[ 6] = 0;
           st[ 7] = 0;
           st[ 8] = 0;
           st[ 9] = 0;
           st[10] = 0;
           st[11] = 0;
           st[12] = 0;
           st[13] = 0;
           st[14] = 0;
           st[15] = 0;
           st[16] = 0;
           st[17] = 0;
           st[18] = 0;
           st[19] = 0;
           st[20] = 0;
           st[21] = 0;
           st[22] = 0;
           st[23] = 0;
           st[24] = 0;
           st[add80w] |= 0x8000000000000000;
       }
       /* ###################### NEW, take the result from last iteration*/
       else{
           st[ 0] = st_roundResult[0];
           st[ 1] = st_roundResult[1];
           st[ 2] = st_roundResult[2];
           st[ 3] = st_roundResult[3];
           st[ 4] = 0;
           st[ 5] = 0;
           st[ 6] = 0;
           st[ 7] = 0;
           st[ 8] = 0;
           st[ 9] = 0;
           st[10] = 0;
           st[11] = 0;
           st[12] = 0;
           st[13] = 0;
           st[14] = 0;
           st[15] = 0;
           st[16] = 0;
           st[17] = 0;
           st[18] = 0;
           st[19] = 0;
           st[20] = 0;
           st[21] = 0;
           st[22] = 0;
           st[23] = 0;
           st[24] = 0;
           st[add80w_256bit_input] |= 0x8000000000000000;                
       }
       for (round = 0; round < KECCAK_ROUNDS; round++)
       {
         // Theta
         u64x bc0 = Theta1 (0);
         u64x bc1 = Theta1 (1);
         u64x bc2 = Theta1 (2);
         u64x bc3 = Theta1 (3);
         u64x bc4 = Theta1 (4);
         u64x t;
         t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
         t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
         t = bc1 ^ rotl64 (bc3, 1); Theta2 (2);
         t = bc2 ^ rotl64 (bc4, 1); Theta2 (3);
         t = bc3 ^ rotl64 (bc0, 1); Theta2 (4);
         // Rho Pi
         t = st[1];
         Rho_Pi (0);
         Rho_Pi (1);
         Rho_Pi (2);
         Rho_Pi (3);
         Rho_Pi (4);
         Rho_Pi (5);
         Rho_Pi (6);
         Rho_Pi (7);
         Rho_Pi (8);
         Rho_Pi (9);
         Rho_Pi (10);
         Rho_Pi (11);
         Rho_Pi (12);
         Rho_Pi (13);
         Rho_Pi (14);
         Rho_Pi (15);
         Rho_Pi (16);
         Rho_Pi (17);
         Rho_Pi (18);
         Rho_Pi (19);
         Rho_Pi (20);
         Rho_Pi (21);
         Rho_Pi (22);
         Rho_Pi (23);
         //  Chi
         Chi (0);
         Chi (5);
         Chi (10);
         Chi (15);
         Chi (20);
         //  Iota
         st[0] ^= keccakf_rndc[round];
       }
       /* ###################### NEW, remember for next round */
       st_roundResult[0] = st[ 0];
       st_roundResult[1] = st[ 1];
       st_roundResult[2] = st[ 2];
       st_roundResult[3] = st[ 3];
   }
   const u32x r0 = l32_from_64 (st[1]);
   const u32x r1 = h32_from_64 (st[1]);
   const u32x r2 = l32_from_64 (st[2]);
   const u32x r3 = h32_from_64 (st[2]);
   COMPARE_M_SIMD (r0, r1, r2, r3);
 }
} 
 

 

