Want to implement SHA3-256 with Iterations
#1
Hello!

I am trying to implement SHA3-256 with multiple iterations on the basis of m05000_a0.cl. The kernel code is quite difficult to understand. Perhaps someone is interested in this and would like to help. I tried something like the code below. Any ideas? I am guessing at the details. The scheme itself is quite simple: the first iteration computes the SHA3-256 of the password, and each following iteration computes the SHA3-256 of the previous hash. I am struggling with the details and do not fully understand the implementation.

Code:
__
kernel void m05000_m04 (...)
{
 /**
  * modifier
  */

 const u32 lid = get_local_id (0);

 /**
  * base
  */

 const u32 gid = get_global_id (0);

 if (gid >= gid_max) return;

 u32 pw_buf0[4];
 u32 pw_buf1[4];

 pw_buf0[0] = pws[gid].i[0];
 pw_buf0[1] = pws[gid].i[1];
 pw_buf0[2] = pws[gid].i[2];
 pw_buf0[3] = pws[gid].i[3];
 pw_buf1[0] = pws[gid].i[4];
 pw_buf1[1] = pws[gid].i[5];
 pw_buf1[2] = pws[gid].i[6];
 pw_buf1[3] = pws[gid].i[7];

 const u32 pw_len = pws[gid].pw_len;

 /**
  * constants
  */

 const u8 keccakf_rotc[24] =
 {
    1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
   27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44
 };

 const u8 keccakf_piln[24] =
 {
   10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
   15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1
 };

 /**
  * 0x80 keccak, very special
  */

 const u32 mdlen = salt_bufs[salt_pos].keccak_mdlen;
/* ###################### NEW */
 /*
  * All the SHA-3 functions share code structure.  They
  * differ only in the size of the chunks they split the message into:
  * for digest size d, they are split into chunks of 200 - mdlen bytes.
  * So for several rounds mdlen = 32 (this is 256bit). For add80w we get 16
  */
 const u32 rsiz = 200 - (2 * mdlen);

 const u32 add80w = (rsiz - 1) / 8;
 const u32 add80w_256bit_input = 16; /* ###################### NEW */

 /**
  * loop
  */

 for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
 {
   u32x w0[4] = { 0 };
   u32x w1[4] = { 0 };
   u32x w2[4] = { 0 };
   u32x w3[4] = { 0 };
   
   u64x st_roundResult[4] = {0};
   
   const u32x out_len = apply_rules_vect (pw_buf0, pw_buf1, pw_len, rules_buf, il_pos, w0, w1);

   append_0x01_2x4_VV (w0, w1, out_len);

   /**
    * Keccak
    */

   u64x st[25];

   int round;
   int hashIter; /* ###################### NEW */
   for (hashIter = 0; hashIter < HASHITER_NUM; hashIter++)
   {
       if(hashIter == 0)
       {
           st[ 0] = hl32_to_64 (w0[1], w0[0]);
           st[ 1] = hl32_to_64 (w0[3], w0[2]);
           st[ 2] = hl32_to_64 (w1[1], w1[0]);
           st[ 3] = hl32_to_64 (w1[3], w1[2]);
           st[ 4] = 0;
           st[ 5] = 0;
           st[ 6] = 0;
           st[ 7] = 0;
           st[ 8] = 0;
           st[ 9] = 0;
           st[10] = 0;
           st[11] = 0;
           st[12] = 0;
           st[13] = 0;
           st[14] = 0;
           st[15] = 0;
           st[16] = 0;
           st[17] = 0;
           st[18] = 0;
           st[19] = 0;
           st[20] = 0;
           st[21] = 0;
           st[22] = 0;
           st[23] = 0;
           st[24] = 0;

           st[add80w] |= 0x8000000000000000;
       }
       /* ###################### NEW, take the result from last iteration*/
       else{
           st[ 0] = st_roundResult[0];
           st[ 1] = st_roundResult[1];
           st[ 2] = st_roundResult[2];
           st[ 3] = st_roundResult[3];
           st[ 4] = 0;
           st[ 5] = 0;
           st[ 6] = 0;
           st[ 7] = 0;
           st[ 8] = 0;
           st[ 9] = 0;
           st[10] = 0;
           st[11] = 0;
           st[12] = 0;
           st[13] = 0;
           st[14] = 0;
           st[15] = 0;
           st[16] = 0;
           st[17] = 0;
           st[18] = 0;
           st[19] = 0;
           st[20] = 0;
           st[21] = 0;
           st[22] = 0;
           st[23] = 0;
           st[24] = 0;

           st[add80w_256bit_input] |= 0x8000000000000000;                
       }

       for (round = 0; round < KECCAK_ROUNDS; round++)
       {
         // Theta

         u64x bc0 = Theta1 (0);
         u64x bc1 = Theta1 (1);
         u64x bc2 = Theta1 (2);
         u64x bc3 = Theta1 (3);
         u64x bc4 = Theta1 (4);

         u64x t;

         t = bc4 ^ rotl64 (bc1, 1); Theta2 (0);
         t = bc0 ^ rotl64 (bc2, 1); Theta2 (1);
         t = bc1 ^ rotl64 (bc3, 1); Theta2 (2);
         t = bc2 ^ rotl64 (bc4, 1); Theta2 (3);
         t = bc3 ^ rotl64 (bc0, 1); Theta2 (4);

         // Rho Pi

         t = st[1];

         Rho_Pi (0);
         Rho_Pi (1);
         Rho_Pi (2);
         Rho_Pi (3);
         Rho_Pi (4);
         Rho_Pi (5);
         Rho_Pi (6);
         Rho_Pi (7);
         Rho_Pi (8);
         Rho_Pi (9);
         Rho_Pi (10);
         Rho_Pi (11);
         Rho_Pi (12);
         Rho_Pi (13);
         Rho_Pi (14);
         Rho_Pi (15);
         Rho_Pi (16);
         Rho_Pi (17);
         Rho_Pi (18);
         Rho_Pi (19);
         Rho_Pi (20);
         Rho_Pi (21);
         Rho_Pi (22);
         Rho_Pi (23);

         //  Chi

         Chi (0);
         Chi (5);
         Chi (10);
         Chi (15);
         Chi (20);

         //  Iota

         st[0] ^= keccakf_rndc[round];
       }
       /* ###################### NEW, remember for next round */
       st_roundResult[0] = st[ 0];
       st_roundResult[1] = st[ 1];
       st_roundResult[2] = st[ 2];
       st_roundResult[3] = st[ 3];
   }
   const u32x r0 = l32_from_64 (st[1]);
   const u32x r1 = h32_from_64 (st[1]);
   const u32x r2 = l32_from_64 (st[2]);
   const u32x r3 = h32_from_64 (st[2]);

   COMPARE_M_SIMD (r0, r1, r2, r3);
 }
}
#2
Where is this used?
#3
It is a custom application based on https://www.npmjs.com/package/sha3

I realized that in my quick example the padding has to be considered. Currently the padding is not applied in the additional iterations. If someone is interested, it would be great to get some support.
#4
OK, I see there is not much traffic here. I'll try it myself and will check back from time to time. If someone wants to participate, you are welcome.