I was able to write a functioning mode 0 pure kernel in OpenCL. The module is exactly the same as the one for SHA1 (00100). It still manages 300+ MH/s on my 5600 XT.
Code:
/**
* Author......: See docs/credits.txt
* License.....: MIT
*/
//#define NEW_SIMD_CODE
#ifdef KERNEL_STATIC
#include "inc_vendor.h"
#include "inc_types.h"
#include "inc_platform.cl"
#include "inc_common.cl"
#include "inc_rp.h"
#include "inc_rp.cl"
#include "inc_scalar.cl"
#include "inc_hash_sha1.cl"
#endif
/* Size in bytes of the per-candidate message buffer (`pad`) that the kernels
 * below fill and hash as four 64-byte SHA-1 blocks. */
#define PAD_LEN 256
/**
 * Multi-hash kernel for the rule-based attack.
 *
 * For every rule applied to this work-item's base password, a PAD_LEN-byte
 * message is built and hashed with SHA-1, and the resulting digest words are
 * handed to COMPARE_M_SCALAR.
 *
 * Message layout: bytes 0 .. PAD_LEN-2 cycle through 'A'..'Z', the final byte
 * is 0; the candidate bytes are then written at offset 0 and again at
 * offset 39 (later writes win where the two copies overlap).
 */
KERNEL_FQ void m10240_mxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    // The filler must be rebuilt for every candidate: the password bytes are
    // written into it below, and a shorter candidate following a longer one
    // would otherwise inherit the longer one's leftover bytes.
    u8 pad[PAD_LEN];

    for (u32 i = 0; i < PAD_LEN - 1; i++)
    {
      pad[i] = (u8) ('A' + (i % 26));
    }

    pad[PAD_LEN - 1] = 0;

    // Byte view of the candidate. NOTE(review): assumes tmp.i is the word
    // array that apply_rules() edits in place -- confirm against pw_t.
    const u8 *bytes = (const u8 *) tmp.i;

    // Offset of the second copy of the candidate inside the message.
    const u32 second_copy_off = 39;

    // Never write past the end of pad, whatever length apply_rules() returns.
    const u32 pw_len = (tmp.pw_len < PAD_LEN) ? tmp.pw_len : PAD_LEN;

    for (u32 i = 0; i < pw_len; i++)
    {
      pad[i] = bytes[i];

      if ((i + second_copy_off) < PAD_LEN)
      {
        pad[i + second_copy_off] = bytes[i];
      }
    }

    sha1_ctx_t ctx;

    sha1_init (&ctx);

    // Feed the message one 64-byte block at a time; each word is packed with
    // the first byte in the most significant position.
    for (u32 blk = 0; blk < PAD_LEN / 64; blk++)
    {
      u32 buf[16];

      for (u32 w = 0; w < 16; w++)
      {
        const u32 off = (blk * 64) + (w * 4);

        // Cast before shifting so bytes >= 0x80 are not shifted into the
        // sign bit of a promoted int.
        buf[w] = ((u32) pad[off + 0] << 24)
               | ((u32) pad[off + 1] << 16)
               | ((u32) pad[off + 2] <<  8)
               | ((u32) pad[off + 3] <<  0);
      }

      sha1_update (&ctx, buf, 64);
    }

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_M_SCALAR (r0, r1, r2, r3);
  }
}
/**
 * Single-hash kernel for the rule-based attack.
 *
 * For every rule applied to this work-item's base password, a PAD_LEN-byte
 * message is built and hashed with SHA-1, and the resulting digest words are
 * handed to COMPARE_S_SCALAR together with the `search` digest loaded from
 * digests_buf[DIGESTS_OFFSET].
 *
 * Message layout: bytes 0 .. PAD_LEN-2 cycle through 'A'..'Z', the final byte
 * is 0; the candidate bytes are then written at offset 0 and again at
 * offset 39 (later writes win where the two copies overlap).
 */
KERNEL_FQ void m10240_sxx (KERN_ATTR_RULES ())
{
  /**
   * modifier
   */

  const u64 lid = get_local_id (0);
  const u64 gid = get_global_id (0);

  if (gid >= gid_max) return;

  /**
   * digest
   */

  const u32 search[4] =
  {
    digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R0],
    digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R1],
    digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R2],
    digests_buf[DIGESTS_OFFSET].digest_buf[DGST_R3]
  };

  /**
   * base
   */

  COPY_PW (pws[gid]);

  /**
   * loop
   */

  for (u32 il_pos = 0; il_pos < il_cnt; il_pos++)
  {
    pw_t tmp = PASTE_PW;

    tmp.pw_len = apply_rules (rules_buf[il_pos].cmds, tmp.i, tmp.pw_len);

    // The filler must be rebuilt for every candidate: the password bytes are
    // written into it below, and a shorter candidate following a longer one
    // would otherwise inherit the longer one's leftover bytes.
    u8 pad[PAD_LEN];

    for (u32 i = 0; i < PAD_LEN - 1; i++)
    {
      pad[i] = (u8) ('A' + (i % 26));
    }

    pad[PAD_LEN - 1] = 0;

    // Byte view of the candidate. NOTE(review): assumes tmp.i is the word
    // array that apply_rules() edits in place -- confirm against pw_t.
    const u8 *bytes = (const u8 *) tmp.i;

    // Offset of the second copy of the candidate inside the message.
    const u32 second_copy_off = 39;

    // Never write past the end of pad, whatever length apply_rules() returns.
    const u32 pw_len = (tmp.pw_len < PAD_LEN) ? tmp.pw_len : PAD_LEN;

    for (u32 i = 0; i < pw_len; i++)
    {
      pad[i] = bytes[i];

      if ((i + second_copy_off) < PAD_LEN)
      {
        pad[i + second_copy_off] = bytes[i];
      }
    }

    sha1_ctx_t ctx;

    sha1_init (&ctx);

    // Feed the message one 64-byte block at a time; each word is packed with
    // the first byte in the most significant position.
    for (u32 blk = 0; blk < PAD_LEN / 64; blk++)
    {
      u32 buf[16];

      for (u32 w = 0; w < 16; w++)
      {
        const u32 off = (blk * 64) + (w * 4);

        // Cast before shifting so bytes >= 0x80 are not shifted into the
        // sign bit of a promoted int.
        buf[w] = ((u32) pad[off + 0] << 24)
               | ((u32) pad[off + 1] << 16)
               | ((u32) pad[off + 2] <<  8)
               | ((u32) pad[off + 3] <<  0);
      }

      sha1_update (&ctx, buf, 64);
    }

    sha1_final (&ctx);

    const u32 r0 = ctx.h[DGST_R0];
    const u32 r1 = ctx.h[DGST_R1];
    const u32 r2 = ctx.h[DGST_R2];
    const u32 r3 = ctx.h[DGST_R3];

    COMPARE_S_SCALAR (r0, r1, r2, r3);
  }
}