Hi, I have a CCTV camera that I got myself locked out of. I managed to extract the stored hash and the password-check algorithm thanks to the power of Ghidra and ChatGPT.
It is standard MD5, followed by a pair sum and base62 conversion to an 8-character hash.
I have an OpenCL kernel and host code that run at 360 MH/s on a P400, and at only 5300 MH/s on an RTX 3070.
I'm thinking: since hashcat hits ~40 GH/s on a 3070, can I add the extra steps for my use case to an existing module, and how much will that slow it down?
Or should I just get a vast.ai instance with a 4090 and pay for the 2 weeks it would take at 6 GH/s?
Here is the relevant OpenCL kernel section:
P.S. I cannot replace the salt with a known one, as I'm getting a CRC error on boot and I can't find where it's stored (possibly in protected CPU memory).
And yes, I have realised (now) that I can gain some speed by calculating the salt indices in host code and comparing integers in the kernel, cutting out the char conversion.
It is standard MD5, followed by a pair sum and base62 conversion to an 8-character hash.
I have an OpenCL kernel and host code that run at 360 MH/s on a P400, and at only 5300 MH/s on an RTX 3070.
I'm thinking: since hashcat hits ~40 GH/s on a 3070, can I add the extra steps for my use case to an existing module, and how much will that slow it down?
Or should I just get a vast.ai instance with a 4090 and pay for the 2 weeks it would take at 6 GH/s?
Here is the relevant OpenCL kernel section:
Code:
That fixed it, but my hashrate dropped from 5130 to 5100 MH/s. What are sane values of local and -D STEPS_PER_THREAD for an RTX 3070?
They were 64 and 1 for the P400 (giving 360 MH/s).
My kernel snippet for reference:
[Code]
/*
 * Brute-force worker: each work-item tests exactly one candidate.
 *
 * The candidate number is base_idx + get_global_id(0); work-items whose
 * candidate number is >= total exit immediately.  For the remaining ones the
 * kernel
 *   1. builds the candidate plaintext with index_to_plain(),
 *   2. hashes it with md5_singleblock() into a 16-byte digest,
 *   3. folds the digest into 8 characters: each adjacent byte pair is summed,
 *      the sum is truncated to its low 8 bits and then translated through
 *      mod62_lut[] and map62[],
 *   4. compares those 8 characters with target8[0..7].
 *
 * Parameters:
 *   base_idx      candidate number handled by work-item 0 of this launch
 *   total         exclusive upper bound on candidate numbers
 *   charset       alphabet handed to index_to_plain()
 *   charset_len   number of entries in charset
 *   L             length argument forwarded to index_to_plain() and
 *                 md5_singleblock(); plain[] holds 16 bytes
 *                 (NOTE(review): presumably the host guarantees L <= 16 - confirm)
 *   target8       the 8 characters to match
 *   found_count   incremented atomically once per matching candidate
 *   found_index   found_index[0] receives the candidate number of the match
 *                 that performed the first increment of found_count
 *   found_digest  receives the 16 digest bytes of that same match
 *
 * index_to_plain, md5_singleblock, map62 and mod62_lut are defined elsewhere
 * in the program.
 */
__kernel void crack_kernel(
    const ulong base_idx,
    const ulong total,
    const uint charset_len,
    const int L,
    __constant const uchar *charset,
    __constant const uchar *target8,
    __global uint *found_count,
    __global ulong *found_index,
    __global uchar *found_digest
) {
    const ulong candidate = base_idx + (ulong)get_global_id(0);
    if (candidate >= total) {
        return;
    }

    uchar plain[16];
    index_to_plain(candidate, charset, charset_len, plain, L);

    uchar digest[16];
    md5_singleblock(plain, L, digest);

    /* Fold the digest: one output character per adjacent byte pair.
     * Fixed trip count of 8; the pragma asks the compiler to unroll it so
     * digest[] is indexed with constants (ignored where unsupported). */
    uchar encoded[8];
    #pragma unroll
    for (int k = 0; k < 8; k++) {
        const uint pair_sum = (uint)digest[2 * k] + (uint)digest[2 * k + 1];
        encoded[k] = map62[mod62_lut[pair_sum & 0xFFu]];
    }

    /* Early-exit comparison: count the matching prefix and stop at the first
     * differing character, so target8 is never read past the mismatch. */
    int matched = 0;
    while (matched < 8 && encoded[matched] == target8[matched]) {
        matched++;
    }
    if (matched != 8) {
        return;
    }

    /* Every hit bumps the counter; only the work-item that saw the counter
     * at zero records its candidate number and digest. */
    if (atomic_inc(found_count) == 0u) {
        found_index[0] = candidate;
        for (int b = 0; b < 16; b++) {
            found_digest[b] = digest[b];
        }
    }
}
P.S. I cannot replace the salt with a known one, as I'm getting a CRC error on boot and I can't find where it's stored (possibly in protected CPU memory).
And yes, I have realised (now) that I can gain some speed by calculating the salt indices in host code and comparing integers in the kernel, cutting out the char conversion.