akira-bruteforce/akira-bruteforce.cu
2025-03-13 12:31:49 +07:00

3282 lines
100 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fstream>
#include <unistd.h>
//stat
#include <sys/types.h>
#include <sys/stat.h>
//mmap
#include <sys/mman.h>
#include <fcntl.h>
#include "json.hpp"
#include "test-ts.h"
#include "akira-bruteforce.h"
#include "chacha8.h"
//by design, max matches is 127
#define MAX_MATCHES 32
using json = nlohmann::json;
int gpuIndex = 0;
#define SHA256_DIGEST_SIZE 32
// --- SHA-256 Device Implementation ---
// Rotate right.
__device__ __forceinline__ uint32_t rotr(uint32_t x, uint32_t n)
{
//return (x >> n) | (x << (32 - n));
return __funnelshift_r(x, x, n );
}
#define SHR(x, n) ((x) >> (n))
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define Sigma0(x) (rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22))
#define Sigma1(x) (rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25))
#define sigma0(x) (rotr(x, 7) ^ rotr(x, 18) ^ SHR(x, 3))
#define sigma1(x) (rotr(x, 17) ^ rotr(x, 19) ^ SHR(x, 10))
// Process one 64-byte block.
__device__ __forceinline__ void sha256_transform(const uint8_t *data, uint32_t state[8], const uint32_t *k)
{
uint32_t w[64];
#pragma unroll
for (int i = 0; i < 16; i++)
{
w[i] = ((uint32_t)data[i * 4] << 24) |
((uint32_t)data[i * 4 + 1] << 16) |
((uint32_t)data[i * 4 + 2] << 8) |
((uint32_t)data[i * 4 + 3]);
}
for (int i = 16; i < 64; i++)
{
w[i] = sigma1(w[i - 2]) + w[i - 7] + sigma0(w[i - 15]) + w[i - 16];
}
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
for (int i = 0; i < 64; i++)
{
uint32_t T1 = h + Sigma1(e) + Ch(e, f, g) + k[i] + w[i];
uint32_t T2 = Sigma0(a) + Maj(a, b, c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
// For a fixed 19-byte input, the padded message is exactly 64 bytes.
// Padding: message (19 bytes) || 0x80 || (zeros up to byte 56) || [0x0000000000000098]
// (19*8 = 152, or 0x98)
__device__ void sha256_hash_19(const uint8_t *msg, uint8_t *digest, const uint32_t *k)
{
uint32_t state[8] = {
0x6a09e667,
0xbb67ae85,
0x3c6ef372,
0xa54ff53a,
0x510e527f,
0x9b05688c,
0x1f83d9ab,
0x5be0cd19};
uint8_t block[64];
#pragma unroll
for (int i = 0; i < 19; i++)
block[i] = msg[i];
block[19] = 0x80;
#pragma unroll
for (int i = 20; i < 56; i++)
block[i] = 0;
// Append message length in bits: 152 = 0x0000000000000098 (big-endian)
block[56] = 0;
block[57] = 0;
block[58] = 0;
block[59] = 0;
block[60] = 0;
block[61] = 0;
block[62] = 0;
block[63] = 152;
sha256_transform(block, state, k);
#pragma unroll
for (int i = 0; i < 8; i++)
{
digest[i * 4] = (state[i] >> 24) & 0xff;
digest[i * 4 + 1] = (state[i] >> 16) & 0xff;
digest[i * 4 + 2] = (state[i] >> 8) & 0xff;
digest[i * 4 + 3] = state[i] & 0xff;
}
}
// For a fixed 68-byte input, the padded message consists of 2 blocks.
// Block 1: first 64 bytes of the input.
// Block 2: remaining 4 bytes || 0x80 || zeros until byte 56 || [bit length = 544 bits]
__device__ void sha256_hash_68(const uint8_t * msg, uint8_t *digest, const uint32_t * k)
{
uint32_t state[8] = {
0x6a09e667,
0xbb67ae85,
0x3c6ef372,
0xa54ff53a,
0x510e527f,
0x9b05688c,
0x1f83d9ab,
0x5be0cd19};
uint8_t block2[64];
// Block 1: copy first 64 bytes.
//uint8_t block[64];
#pragma unroll
for (int i = 0; i < 64; i++)
block2[i] = msg[i];
sha256_transform(block2, state, k);
// Block 2:
// First 4 bytes: remainder of message.
#pragma unroll
for (int i = 0; i < 4; i++)
block2[i] = msg[64 + i];
block2[4] = 0x80;
#pragma unroll
for (int i = 5; i < 56; i++)
block2[i] = 0;
// Append message length in bits: 68*8 = 544 = 0x0000000000000220 (big-endian)
block2[56] = 0;
block2[57] = 0;
block2[58] = 0;
block2[59] = 0;
block2[60] = 0;
block2[61] = 0;
block2[62] = 0x02;
block2[63] = 0x20;
sha256_transform(block2, state, k);
#pragma unroll
for (int i = 0; i < 8; i++)
{
digest[i * 4] = (state[i] >> 24) & 0xff;
digest[i * 4 + 1] = (state[i] >> 16) & 0xff;
digest[i * 4 + 2] = (state[i] >> 8) & 0xff;
digest[i * 4 + 3] = state[i] & 0xff;
}
}
// Minimal AES-256 context and helper functions.
typedef struct
{
uint32_t rk[60]; // 60 words for 14 rounds + initial key
} aes256_ctx;
__device__ __forceinline__ uint32_t SubWord(uint32_t word, const uint8_t *sbox)
{
return ((uint32_t)sbox[(word >> 24)] << 24) |
((uint32_t)sbox[(word >> 16) & 0xff] << 16) |
((uint32_t)sbox[(word >> 8) & 0xff] << 8) |
((uint32_t)sbox[word & 0xff]);
}
__device__ inline uint32_t RotWord(uint32_t word)
{
//return (word << 8) | (word >> 24);
return __funnelshift_l(word, word, 8);
}
// Key expansion for AES-256.
__device__ void aes256_set_encrypt_key(const uint8_t userKey[32], aes256_ctx *ctx, const uint8_t *sbox, const uint32_t *Rcon)
{
int i = 0;
// Copy the 256-bit key into the first 8 words.
for (i = 0; i < 8; i++)
{
ctx->rk[i] = ((uint32_t)userKey[4 * i] << 24) |
((uint32_t)userKey[4 * i + 1] << 16) |
((uint32_t)userKey[4 * i + 2] << 8) |
((uint32_t)userKey[4 * i + 3]);
}
int rcon_i = 0;
for (i = 8; i < 60; i++)
{
uint32_t temp = ctx->rk[i - 1];
if ((i % 8) == 0)
{
temp = SubWord(RotWord(temp), sbox) ^ Rcon[rcon_i++];
}
else if ((i % 8) == 4)
{
temp = SubWord(temp, sbox);
}
ctx->rk[i] = ctx->rk[i - 8] ^ temp;
}
}
// AES block encryption for a single 16-byte block.
__device__ void aes256_encrypt(const aes256_ctx *ctx, const uint8_t in[16], uint8_t out[16], const uint8_t *sbox, const uint8_t *xtime)
{
uint8_t state[16];
// Copy input into state.
for (int i = 0; i < 16; i++)
{
state[i] = in[i];
}
// Initial AddRoundKey.
for (int i = 0; i < 4; i++)
{
uint32_t rk = ctx->rk[i];
state[4 * i + 0] ^= (uint8_t)(rk >> 24);
state[4 * i + 1] ^= (uint8_t)(rk >> 16);
state[4 * i + 2] ^= (uint8_t)(rk >> 8);
state[4 * i + 3] ^= (uint8_t)(rk);
}
// Main rounds.
for (int round = 1; round < 14; round++)
{
// SubBytes.
for (int i = 0; i < 16; i++)
state[i] = sbox[state[i]];
// ShiftRows.
uint8_t tmp[16];
tmp[0] = state[0];
tmp[1] = state[5];
tmp[2] = state[10];
tmp[3] = state[15];
tmp[4] = state[4];
tmp[5] = state[9];
tmp[6] = state[14];
tmp[7] = state[3];
tmp[8] = state[8];
tmp[9] = state[13];
tmp[10] = state[2];
tmp[11] = state[7];
tmp[12] = state[12];
tmp[13] = state[1];
tmp[14] = state[6];
tmp[15] = state[11];
// MixColumns.
for (int i = 0; i < 4; i++)
{
int col = 4 * i;
uint8_t a0 = tmp[col + 0], a1 = tmp[col + 1],
a2 = tmp[col + 2], a3 = tmp[col + 3];
uint8_t r0 = xtime[a0] ^ (a1 ^ xtime[a1]) ^ a2 ^ a3;
uint8_t r1 = a0 ^ xtime[a1] ^ (a2 ^ xtime[a2]) ^ a3;
uint8_t r2 = a0 ^ a1 ^ xtime[a2] ^ (a3 ^ xtime[a3]);
uint8_t r3 = (a0 ^ xtime[a0]) ^ a1 ^ a2 ^ xtime[a3];
tmp[col + 0] = r0;
tmp[col + 1] = r1;
tmp[col + 2] = r2;
tmp[col + 3] = r3;
}
// Copy back to state.
for (int i = 0; i < 16; i++)
state[i] = tmp[i];
// AddRoundKey.
for (int i = 0; i < 4; i++)
{
uint32_t rk = ctx->rk[round * 4 + i];
state[4 * i + 0] ^= (uint8_t)(rk >> 24);
state[4 * i + 1] ^= (uint8_t)(rk >> 16);
state[4 * i + 2] ^= (uint8_t)(rk >> 8);
state[4 * i + 3] ^= (uint8_t)(rk);
}
}
// Final round (no MixColumns).
// SubBytes.
for (int i = 0; i < 16; i++)
state[i] = sbox[state[i]];
// ShiftRows.
uint8_t tmp[16];
tmp[0] = state[0];
tmp[1] = state[5];
tmp[2] = state[10];
tmp[3] = state[15];
tmp[4] = state[4];
tmp[5] = state[9];
tmp[6] = state[14];
tmp[7] = state[3];
tmp[8] = state[8];
tmp[9] = state[13];
tmp[10] = state[2];
tmp[11] = state[7];
tmp[12] = state[12];
tmp[13] = state[1];
tmp[14] = state[6];
tmp[15] = state[11];
// Final AddRoundKey.
for (int i = 0; i < 4; i++)
{
uint32_t rk = ctx->rk[14 * 4 + i];
tmp[4 * i + 0] ^= (uint8_t)(rk >> 24);
tmp[4 * i + 1] ^= (uint8_t)(rk >> 16);
tmp[4 * i + 2] ^= (uint8_t)(rk >> 8);
tmp[4 * i + 3] ^= (uint8_t)(rk);
}
// Write result.
for (int i = 0; i < 16; i++)
out[i] = tmp[i];
}
// kcipher2 state structure.
typedef struct
{
unsigned int A[5];
unsigned int B[11];
unsigned int L1, R1, L2, R2;
} kcipher2_state;
//---------------------------------------------------------------------------
// Device functions for kcipher2
//---------------------------------------------------------------------------
__device__ __forceinline__ unsigned int nlf(unsigned int a, unsigned int b, unsigned int c, unsigned int d)
{
return (a + b) ^ c ^ d;
}
__device__ __forceinline__ unsigned char gf_multiply_by_2(unsigned char t)
{
// return gf2_table[t];
unsigned int lq = t << 1;
if (lq & 0x100)
lq ^= 0x011B;
return ((unsigned char)lq) ^ 0xFF;
}
__device__ __forceinline__ unsigned char gf_multiply_by_3(unsigned char t)
{
// return gf3_table[t];
unsigned int lq = (t << 1) ^ t;
if (lq & 0x100)
lq ^= 0x011B;
return ((unsigned char)lq) ^ 0xFF;
}
__device__ unsigned int sub_k2(unsigned int in)
{
unsigned char w0 = in & 0xFF;
unsigned char w1 = (in >> 8) & 0xFF;
unsigned char w2 = (in >> 16) & 0xFF;
unsigned char w3 = (in >> 24) & 0xFF;
unsigned char t0 = d_s_box[w0];
unsigned char t1 = d_s_box[w1];
unsigned char t2 = d_s_box[w2];
unsigned char t3 = d_s_box[w3];
unsigned char q0 = gf_multiply_by_2(t0) ^ gf_multiply_by_3(t1) ^ t2 ^ t3;
unsigned char q1 = t0 ^ gf_multiply_by_2(t1) ^ gf_multiply_by_3(t2) ^ t3;
unsigned char q2 = t0 ^ t1 ^ gf_multiply_by_2(t2) ^ gf_multiply_by_3(t3);
unsigned char q3 = gf_multiply_by_3(t0) ^ t1 ^ t2 ^ gf_multiply_by_2(t3);
return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0;
}
__device__ void setup_state_values(const unsigned int *key, const unsigned int *iv, kcipher2_state *state)
{
unsigned int IK[12];
IK[0] = key[0];
IK[1] = key[1];
IK[2] = key[2];
IK[3] = key[3];
IK[4] = IK[0] ^ sub_k2((IK[3] << 8) ^ (IK[3] >> 24)) ^ 0x01000000;
IK[5] = IK[1] ^ IK[4];
IK[6] = IK[2] ^ IK[5];
IK[7] = IK[3] ^ IK[6];
IK[8] = IK[4] ^ sub_k2((IK[7] << 8) ^ (IK[7] >> 24)) ^ 0x02000000;
IK[9] = IK[5] ^ IK[8];
IK[10] = IK[6] ^ IK[9];
IK[11] = IK[7] ^ IK[10];
state->A[0] = IK[4];
state->A[1] = IK[3];
state->A[2] = IK[2];
state->A[3] = IK[1];
state->A[4] = IK[0];
state->B[0] = IK[10];
state->B[1] = IK[11];
state->B[2] = iv[0];
state->B[3] = iv[1];
state->B[4] = IK[8];
state->B[5] = IK[9];
state->B[6] = iv[2];
state->B[7] = iv[3];
state->B[8] = IK[7];
state->B[9] = IK[5];
state->B[10] = IK[6];
state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000;
}
__device__ void next_INIT(kcipher2_state *state)
{
unsigned int temp2;
unsigned int nL1 = sub_k2(state->R2 + state->B[4]);
unsigned int nR1 = sub_k2(state->L2 + state->B[9]);
unsigned int nL2 = sub_k2(state->L1);
unsigned int nR2 = sub_k2(state->R1);
unsigned int nA[5];
nA[0] = state->A[1];
nA[1] = state->A[2];
nA[2] = state->A[3];
nA[3] = state->A[4];
unsigned int nB[11];
nB[0] = state->B[1];
nB[1] = state->B[2];
nB[2] = state->B[3];
nB[3] = state->B[4];
nB[4] = state->B[5];
nB[5] = state->B[6];
nB[6] = state->B[7];
nB[7] = state->B[8];
nB[8] = state->B[9];
nB[9] = state->B[10];
unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF];
nA[4] = temp1 ^ state->A[3];
nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]);
if (state->A[2] & 0x40000000)
temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
else
temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
// branchless version (not faster)
// unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000));
// unsigned int candidate1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
// unsigned int candidate2 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
// temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask);
if (state->A[2] & 0x80000000)
temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
else
temp2 = state->B[8];
// branchless version (not faster)
// unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000));
// unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
// unsigned int candidate4 = state->B[8];
// temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2);
nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2;
nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]);
state->A[0] = nA[0];
state->A[1] = nA[1];
state->A[2] = nA[2];
state->A[3] = nA[3];
state->A[4] = nA[4];
state->B[0] = nB[0];
state->B[1] = nB[1];
state->B[2] = nB[2];
state->B[3] = nB[3];
state->B[4] = nB[4];
state->B[5] = nB[5];
state->B[6] = nB[6];
state->B[7] = nB[7];
state->B[8] = nB[8];
state->B[9] = nB[9];
state->B[10] = nB[10];
state->L1 = nL1;
state->R1 = nR1;
state->L2 = nL2;
state->R2 = nR2;
}
__device__ unsigned int sub_k2_shared(unsigned int in, const unsigned char *gf2_table,
const unsigned char *gf3_table,
const unsigned char *d_s_box)
{
unsigned char w0 = in & 0xFF;
unsigned char w1 = (in >> 8) & 0xFF;
unsigned char w2 = (in >> 16) & 0xFF;
unsigned char w3 = (in >> 24) & 0xFF;
unsigned char t0 = d_s_box[w0];
unsigned char t1 = d_s_box[w1];
unsigned char t2 = d_s_box[w2];
unsigned char t3 = d_s_box[w3];
unsigned char q0 = gf2_table[t0] ^ gf3_table[t1] ^ t2 ^ t3;
unsigned char q1 = t0 ^ gf2_table[t1] ^ gf3_table[t2] ^ t3;
unsigned char q2 = t0 ^ t1 ^ gf2_table[t2] ^ gf3_table[t3];
unsigned char q3 = gf3_table[t0] ^ t1 ^ t2 ^ gf2_table[t3];
return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0;
}
__device__ void setup_state_values_shared(const unsigned int *key, const unsigned int *iv, kcipher2_state *state,
const unsigned char *gf2_table,
const unsigned char *gf3_table,
const unsigned char *d_s_box)
{
unsigned int IK[12];
IK[0] = key[0];
IK[1] = key[1];
IK[2] = key[2];
IK[3] = key[3];
IK[4] = IK[0] ^ sub_k2_shared((IK[3] << 8) ^ (IK[3] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x01000000;
IK[5] = IK[1] ^ IK[4];
IK[6] = IK[2] ^ IK[5];
IK[7] = IK[3] ^ IK[6];
IK[8] = IK[4] ^ sub_k2_shared((IK[7] << 8) ^ (IK[7] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x02000000;
IK[9] = IK[5] ^ IK[8];
IK[10] = IK[6] ^ IK[9];
IK[11] = IK[7] ^ IK[10];
state->A[0] = IK[4];
state->A[1] = IK[3];
state->A[2] = IK[2];
state->A[3] = IK[1];
state->A[4] = IK[0];
state->B[0] = IK[10];
state->B[1] = IK[11];
state->B[2] = iv[0];
state->B[3] = iv[1];
state->B[4] = IK[8];
state->B[5] = IK[9];
state->B[6] = iv[2];
state->B[7] = iv[3];
state->B[8] = IK[7];
state->B[9] = IK[5];
state->B[10] = IK[6];
state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000;
}
__device__ void next_INIT_shared(kcipher2_state *state,
const unsigned char *gf2_table,
const unsigned char *gf3_table,
const unsigned char *d_s_box,
const unsigned int *d_amul0,
const unsigned int *d_amul1,
const unsigned int *d_amul2,
const unsigned int *d_amul3)
{
unsigned int temp2;
unsigned int nL1 = sub_k2_shared(state->R2 + state->B[4], gf2_table, gf3_table, d_s_box);
unsigned int nR1 = sub_k2_shared(state->L2 + state->B[9], gf2_table, gf3_table, d_s_box);
unsigned int nL2 = sub_k2_shared(state->L1, gf2_table, gf3_table, d_s_box);
unsigned int nR2 = sub_k2_shared(state->R1, gf2_table, gf3_table, d_s_box);
unsigned int nA[5];
nA[0] = state->A[1];
nA[1] = state->A[2];
nA[2] = state->A[3];
nA[3] = state->A[4];
unsigned int nB[11];
nB[0] = state->B[1];
nB[1] = state->B[2];
nB[2] = state->B[3];
nB[3] = state->B[4];
nB[4] = state->B[5];
nB[5] = state->B[6];
nB[6] = state->B[7];
nB[7] = state->B[8];
nB[8] = state->B[9];
nB[9] = state->B[10];
unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF];
nA[4] = temp1 ^ state->A[3];
nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]);
if (state->A[2] & 0x40000000)
temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
else
temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
// branchless version (not faster)
// unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000));
// int b0 = state->B[0] << 8;
// int b0_24 = (state->B[0] >> 24) & 0xFF;
// unsigned int candidate1 = (b0) ^ d_amul1[b0_24];
// unsigned int candidate2 = (b0) ^ d_amul2[b0_24];
// temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask);
if (state->A[2] & 0x80000000)
temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
else
temp2 = state->B[8];
// branchless version (not faster)
// unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000));
// unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
// unsigned int candidate4 = state->B[8];
// temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2);
nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2;
nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]);
state->A[0] = nA[0];
state->A[1] = nA[1];
state->A[2] = nA[2];
state->A[3] = nA[3];
state->A[4] = nA[4];
state->B[0] = nB[0];
state->B[1] = nB[1];
state->B[2] = nB[2];
state->B[3] = nB[3];
state->B[4] = nB[4];
state->B[5] = nB[5];
state->B[6] = nB[6];
state->B[7] = nB[7];
state->B[8] = nB[8];
state->B[9] = nB[9];
state->B[10] = nB[10];
state->L1 = nL1;
state->R1 = nR1;
state->L2 = nL2;
state->R2 = nR2;
}
__device__ __forceinline__ unsigned long long bswap64(unsigned long long x)
{
uint32_t hi = x >> 32; // Upper 32 bits
uint32_t lo = x & 0xFFFFFFFF; // Lower 32 bits
// Swap bytes in each 32-bit part using __byte_perm
hi = __byte_perm(hi, 0, 0x0123);
lo = __byte_perm(lo, 0, 0x0123);
// Swap the high and low parts and combine them back
return ((uint64_t)lo << 32) | hi;
}
__device__ unsigned long long kcipher2_encrypt_1_zero_block(const unsigned int *key, const unsigned int *iv)
{
kcipher2_state state;
setup_state_values(key, iv, &state);
for (unsigned char i = 0; i < 24; i++)
next_INIT(&state);
unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]);
unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]);
return (((unsigned long long)zh) << 32) | zl;
}
__device__ unsigned long long kcipher2_encrypt_1_zero_block_shared(const unsigned int *key, const unsigned int *iv,
const unsigned char *gf2_table,
const unsigned char *gf3_table,
const unsigned char *d_s_box,
const unsigned int *d_amul0,
const unsigned int *d_amul1,
const unsigned int *d_amul2,
const unsigned int *d_amul3
)
{
kcipher2_state state;
setup_state_values_shared(key, iv, &state, gf2_table, gf3_table, d_s_box);
for (unsigned char i = 0; i < 24; i++)
next_INIT_shared(&state, gf2_table, gf3_table, d_s_box, d_amul0, d_amul1, d_amul2, d_amul3);
unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]);
unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]);
return (((unsigned long long)zh) << 32) | zl;
}
//---------------------------------------------------------------------------
// End of file
//---------------------------------------------------------------------------
// --- Multihash Kernel ---
//
// Each thread computes:
// digest0 = SHA256(19-byte input)
// v0 = digest0
// for i=1..1499:
// buffer = digest(i-1) || v0 || [i as 4-byte big-endian]
// digest(i) = SHA256(buffer)
// aes256_encrypt(&key_ctx, counter, counter);
// Encrypt the (now updated) counter to produce the final output.
// aes256_encrypt(&key_ctx, counter, digest);
// this kernel only output 16 byte random number
// but we output 32 byte block, so its like this <16 byte random> <16 byte empty> <16 byte random> <16 byte empty> ...
// This will make it faster
__global__ void multihash_kernel(const uint8_t *input, uint8_t *output, size_t num)
{
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num)
return;
__shared__ uint32_t sha256_k[64];
__shared__ unsigned char gf2_table_shared[256];
__shared__ uint8_t aes_sbox[256];
__shared__ uint32_t aes_Rcon[7];
if (threadIdx.x == 0) {
memcpy(sha256_k, k, sizeof(sha256_k));
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
}
__syncthreads(); // Make sure the data is loaded before use
// Each input message is 19 bytes.
const uint8_t *data = input + idx * 19;
uint8_t digest[SHA256_DIGEST_SIZE];
uint8_t v0[SHA256_DIGEST_SIZE];
// First round: hash the 19-byte input.
// sha256_hash(data, 19, digest);
sha256_hash_19(data, digest, sha256_k);
// Save initial digest as v0.
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
{
v0[i] = digest[i];
}
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
// Copy v0.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[32 + j] = v0[j];
}
// Write counter (big-endian).
// buffer[64] = (i >> 24) & 0xff;
// buffer[65] = (i >> 16) & 0xff;
// max is 1500
buffer[64] = 0;
buffer[65] = 0;
// 1,500 rounds total (first one is done above).
for (int i = 1; i < 1500; i++)
{
// Copy current digest
#pragma unroll
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[j] = digest[j];
}
buffer[66] = (i >> 8) & 0xff;
buffer[67] = i & 0xff;
// sha256_hash(buffer, 68, digest);
sha256_hash_68(buffer, digest, sha256_k);
}
#define AES_BLOCK_SIZE 16
// -------- AES post-processing on final digest --------
// Use final digest as the AES key.
aes256_ctx key_ctx;
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
uint8_t counter[16] = {0}; // clear counter
// Encrypt the counter block.
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
// Encrypt the (now updated) counter to produce the final output.
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
// Write final (AESencrypted) digest to output.
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
uint32_t * out_u32 = (uint32_t *)out;
uint32_t * digest_u32 = (uint32_t *)digest;
//byte swap32 bit every 4 byte
for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++)
{
out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123);
}
}
__global__ void multihash_kernel_in_memory(uint8_t *output, uint64_t t_start, size_t num)
{
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num)
return;
__shared__ uint32_t sha256_k[64];
__shared__ unsigned char gf2_table_shared[256];
__shared__ uint8_t aes_sbox[256];
__shared__ uint32_t aes_Rcon[7];
if (threadIdx.x == 0) {
memcpy(sha256_k, k, sizeof(sha256_k));
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
}
__syncthreads(); // Make sure the data is loaded before use
uint64_t t = t_start + idx;
// Each input message is 19 bytes.
uint8_t data[19];
for (int i =0; i < 19; i++) {
data[18-i]='0' + (t%10);
t /= 10;
}
uint8_t digest[SHA256_DIGEST_SIZE];
// uint8_t v0[SHA256_DIGEST_SIZE];
// First round: hash the 19-byte input.
// sha256_hash(data, 19, digest);
sha256_hash_19(data, digest, sha256_k);
// Save initial digest as v0.
// for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
// {
// v0[i] = digest[i];
// }
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
// Copy v0.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[32 + j] = digest[j];
}
// Write counter (big-endian).
// buffer[64] = (i >> 24) & 0xff;
// buffer[65] = (i >> 16) & 0xff;
// max is 1500
buffer[64] = 0;
buffer[65] = 0;
// 1,500 rounds total (first one is done above).
for (int i = 1; i < 1500; i++)
{
// Copy current digest.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[j] = digest[j];
}
buffer[66] = (i >> 8) & 0xff;
buffer[67] = i & 0xff;
// sha256_hash(buffer, 68, digest);
sha256_hash_68(buffer, digest, sha256_k);
}
#define AES_BLOCK_SIZE 16
// -------- AES post-processing on final digest --------
// Use final digest as the AES key.
aes256_ctx key_ctx;
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
uint8_t counter[16] = {0}; // clear counter
// Encrypt the counter block.
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
// Encrypt the (now updated) counter to produce the final output.
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
// Write final (AESencrypted) digest to output.
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
uint32_t * out_u32 = (uint32_t *)out;
uint32_t * digest_u32 = (uint32_t *)digest;
//byte swap32 bit every 4 byte
for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++)
{
out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123);
}
}
//this is for chacha8 key, so no need to swap, and we nseed the 32 byte block
__global__ void multihash_kernel_in_memory_no_swap(uint8_t *output, uint64_t t_start, size_t num)
{
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num)
return;
__shared__ uint32_t sha256_k[64];
__shared__ unsigned char gf2_table_shared[256];
__shared__ uint8_t aes_sbox[256];
__shared__ uint32_t aes_Rcon[7];
if (threadIdx.x == 0) {
memcpy(sha256_k, k, sizeof(sha256_k));
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
}
__syncthreads(); // Make sure the data is loaded before use
uint64_t t = t_start + idx;
// Each input message is 19 bytes.
uint8_t data[19];
for (int i =0; i < 19; i++) {
data[18-i]='0' + (t%10);
t /= 10;
}
uint8_t digest[SHA256_DIGEST_SIZE];
// uint8_t v0[SHA256_DIGEST_SIZE];
// First round: hash the 19-byte input.
// sha256_hash(data, 19, digest);
sha256_hash_19(data, digest, sha256_k);
// Save initial digest as v0.
// for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
// {
// v0[i] = digest[i];
// }
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
// Copy v0.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[32 + j] = digest[j];
}
// Write counter (big-endian).
// buffer[64] = (i >> 24) & 0xff;
// buffer[65] = (i >> 16) & 0xff;
// max is 1500
buffer[64] = 0;
buffer[65] = 0;
// 1,500 rounds total (first one is done above).
for (int i = 1; i < 1500; i++)
{
// Copy current digest.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[j] = digest[j];
}
buffer[66] = (i >> 8) & 0xff;
buffer[67] = i & 0xff;
// sha256_hash(buffer, 68, digest);
sha256_hash_68(buffer, digest, sha256_k);
}
#define AES_BLOCK_SIZE 16
// -------- AES post-processing on final digest --------
// Use final digest as the AES key.
aes256_ctx key_ctx;
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
uint8_t counter[16] = {0}; // clear counter
// Encrypt the counter block.
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
// Encrypt the (now updated) counter to produce the final output.
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
// Write final (AESencrypted) digest to output.
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
#pragma unroll
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
{
out[i] = digest[i];
}
}
//chacha8 key doesn't need swapping
__global__ void multihash_kernel_noswap(const uint8_t *input, uint8_t *output, size_t num)
{
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num)
return;
__shared__ uint32_t sha256_k[64];
__shared__ unsigned char gf2_table_shared[256];
__shared__ uint8_t aes_sbox[256];
__shared__ uint32_t aes_Rcon[7];
if (threadIdx.x == 0) {
memcpy(sha256_k, k, sizeof(sha256_k));
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
}
__syncthreads(); // Make sure the data is loaded before use
// Each input message is 19 bytes.
const uint8_t *data = input + idx * 19;
uint8_t digest[SHA256_DIGEST_SIZE];
uint8_t v0[SHA256_DIGEST_SIZE];
// First round: hash the 19-byte input.
// sha256_hash(data, 19, digest);
sha256_hash_19(data, digest, sha256_k);
// Save initial digest as v0.
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
{
v0[i] = digest[i];
}
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
// Copy v0.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[32 + j] = v0[j];
}
// Write counter (big-endian).
// buffer[64] = (i >> 24) & 0xff;
// buffer[65] = (i >> 16) & 0xff;
// max is 1500
buffer[64] = 0;
buffer[65] = 0;
// 1,500 rounds total (first one is done above).
for (int i = 1; i < 1500; i++)
{
// Copy current digest.
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
{
buffer[j] = digest[j];
}
buffer[66] = (i >> 8) & 0xff;
buffer[67] = i & 0xff;
// sha256_hash(buffer, 68, digest);
sha256_hash_68(buffer, digest, sha256_k);
}
#define AES_BLOCK_SIZE 16
// -------- AES post-processing on final digest --------
// Use final digest as the AES key.
aes256_ctx key_ctx;
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
uint8_t counter[16] = {0}; // clear counter
// Encrypt the counter block.
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
// Encrypt the (now updated) counter to produce the final output.
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
// Write final (AESencrypted) digest to output.
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
#pragma unroll
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
{
out[i] = digest[i];
}
}
uint64_t get_time_in_nanosecond()
{
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
return (uint64_t)time.tv_sec * 1000000000 + (uint64_t)time.tv_nsec;
}
#define TEST_ENCRYPT_BLOCK_SIZE 32
// Test kernel: each thread processes one key/IV pair.
// this is to test raw encryption speed
__global__ void test_kcipher2_kernel(const uint8_t *in, uint8_t *out, size_t N)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
// Each input block: first 16 bytes = Key, next 16 bytes = IV.
const int in_block_size = 32; // 16-byte key + 16-byte IV
const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // 128-byte ciphertext
// Pointers to the key/IV for this thread.
const uint8_t *block_in = in + idx * in_block_size;
// Load key (16 bytes) and IV (16 bytes) into local arrays.
unsigned int key_local[4];
unsigned int iv_local[4];
for (int i = 0; i < 4; i++)
{
int base_key = i * 4;
key_local[i] = ((unsigned int)block_in[base_key] << 24) |
((unsigned int)block_in[base_key + 1] << 16) |
((unsigned int)block_in[base_key + 2] << 8) |
((unsigned int)block_in[base_key + 3]);
}
for (int i = 0; i < 4; i++)
{
int base_iv = 16 + i * 4;
iv_local[i] = ((unsigned int)block_in[base_iv] << 24) |
((unsigned int)block_in[base_iv + 1] << 16) |
((unsigned int)block_in[base_iv + 2] << 8) |
((unsigned int)block_in[base_iv + 3]);
}
unsigned char ciphertext[8];
long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
// copy res to ciphertext
for (int i = 0; i < 8; i++)
{
ciphertext[i] = (res >> (56 - i * 8)) & 0xFF;
}
// Write the output ciphertext.
size_t out_offset = idx * out_block_size;
for (int i = 0; i < 8; i++)
{
out[out_offset + i] = ciphertext[i];
}
}
void fill_input(uint8_t *h_input, uint64_t start, size_t num)
{
char buffer[20];
snprintf(buffer, 20, "%019lu", start);
for (size_t i = 0; i < num; i++)
{
// copy buffer to input
memcpy(h_input + i * 19, buffer, 19);
// increment buffer, starting from last index, until it reaches '9'
for (int j = 18; j >= 0; j--)
{
if (buffer[j] == '9')
{
buffer[j] = '0';
}
else
{
buffer[j]++;
break;
}
}
}
}
int test_encryption_only(size_t num)
{
printf("Count %zu \n", num);
size_t input_size = num * TEST_ENCRYPT_BLOCK_SIZE;
// Query device properties
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpuIndex);
int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block
int blockSize = maxThreadsPerBlock; // Dynamically set blockSize
// int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
size_t output_size = num * TEST_ENCRYPT_BLOCK_SIZE;
printf("GPU needed memory: %.2f MB\n", (input_size + output_size) / 1024.0 / 1024.0);
uint8_t *h_input = (uint8_t *)malloc(input_size);
// test the blank/zero encryption
// memset(h_input, 0, input_size);
// fill with random
fill_input(h_input, TEST_TIMESTAMP, num * (32 / 19));
uint8_t *h_output_enc = (uint8_t *)malloc(output_size);
// Allocate device memory.
uint8_t *d_input, *d_output_enc;
printf("Allocating device memory...%.2f Mb\n", (input_size + output_size) / 1024.0 / 1024.0);
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output_enc, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
return 1;
}
uint64_t start = get_time_in_nanosecond();
// copy to device
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
test_kcipher2_kernel<<<gridSize, blockSize>>>(d_input, d_output_enc, num);
cudaDeviceSynchronize();
// debug encryption output
cudaMemcpy(h_output_enc, d_output_enc, output_size, cudaMemcpyDeviceToHost);
uint64_t end = get_time_in_nanosecond();
printf("Enc Time: %f ms\n", (end - start) / 1000000.0);
// print speed
printf("Enc Speed: %f enc per second\n", num / ((end - start) / 1000000000.0));
// print first plaintext
printf("Plaintext for first input: ");
for (size_t i = 0; i < 19; i++)
{
printf("%c", h_input[i]);
}
printf("\n");
// For demonstration, print the first output digest in hexadecimal.
printf("Encrypted for first input: ");
for (size_t i = 0; i < TEST_ENCRYPT_BLOCK_SIZE; i++)
{
printf("%02x", h_output_enc[i]);
}
printf("\n");
// print last
// print plaintext
printf("Plaintext for last input: ");
for (size_t i = 19 * num - 19; i < 19 * num; i++)
{
printf("%c", h_input[i]);
}
printf("\n");
printf("Encrypted for last input: offs <%zu>: ", num * TEST_ENCRYPT_BLOCK_SIZE - TEST_ENCRYPT_BLOCK_SIZE);
for (size_t i = TEST_ENCRYPT_BLOCK_SIZE * num - TEST_ENCRYPT_BLOCK_SIZE; i < TEST_ENCRYPT_BLOCK_SIZE * num; i++)
{
printf("%02x", h_output_enc[i]);
}
printf("\n");
// Cleanup.
cudaFree(d_input);
cudaFree(d_output_enc);
free(h_input);
free(h_output_enc);
return 0;
}
#define DIGEST_SIZE 32
#define KEY_IV_SIZE 16 // we use first 16 bytes for key/iv
#define KCIPHER_OUT_SIZE 8
// KCIPHER2, single kernel
// --- Host Code ---
//for testing only, it will save all random number generated from a given second
int save_random(uint64_t start_time, const char *filename)
{
printf("Saving all seeds for %lu to %s\n", start_time, filename);
uint64_t timer_start = get_time_in_nanosecond();
//do it for every 10 million
size_t num = 10*1000*1000;
size_t input_size = num * 19 * sizeof(uint8_t);
uint8_t *h_input = (uint8_t *)malloc(input_size);
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
uint8_t *h_output = (uint8_t *)malloc(output_size);
uint8_t *h_output_non_zeroes = (uint8_t *)malloc(num * 16);
// Allocate device memory.
uint8_t *d_input, *d_output;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
FILE *f = fopen(filename, "wb");
if (f == NULL)
{
fprintf(stderr, "Error: Cannot open file %s\n", filename);
return 1;
}
for (size_t i = 0; i < 100; i++) {
printf("Saving %zu %%\r", i);fflush(stdout);
fill_input(h_input, start_time + i * num, num);
//copy to CUDA
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaDeviceSynchronize();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
//copy every 16 bytes
for (size_t j = 0; j < num; j++)
{
memcpy(h_output_non_zeroes + j * 16, h_output + j * DIGEST_SIZE, 16);
}
fwrite(h_output_non_zeroes, 16, num, f);
}
fclose(f);
uint64_t timer_end = get_time_in_nanosecond();
printf("DONE saved to %s: total time %.2f second \n", filename, (timer_end - timer_start) / 1000000000.0);
return 0;
}
int test_generate_random_only_in_gpu(size_t num)
{
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
uint8_t *d_output;
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
uint64_t start = get_time_in_nanosecond();
multihash_kernel_in_memory<<<gridSize, blockSize>>>(d_output, t_start, num);
cudaDeviceSynchronize();
uint64_t end = get_time_in_nanosecond();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
// print first and last
printf("First random ");
for (int i = 0; i < DIGEST_SIZE; i++)
{
printf("%02x", h_output[i]);
}
printf("\n");
printf("Last random ");
for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++)
{
printf("%02x", h_output[i]);
}
printf("\n");
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
printf("Speed: %f hashes per second\n", num / ((end - start) / 1000000000.0));
return 0;
}
int test_generate_random_only(size_t num)
{
printf("Test generate random only: %lu\n", num);
size_t input_size = num * 19 * sizeof(uint8_t);
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
// Allocate host memory.
uint8_t *h_input = (uint8_t *)malloc(input_size);
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
uint64_t start_fill = get_time_in_nanosecond();
fill_input(h_input, t_start, num);
uint64_t end_fill = get_time_in_nanosecond();
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
// Allocate device memory.
uint8_t *d_input, *d_output;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
uint64_t start = get_time_in_nanosecond();
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting timestamp -> random calculation...\n");
fflush(stdout);
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
uint64_t end = get_time_in_nanosecond();
// print first and last
printf("First random ");
for (int i = 0; i < DIGEST_SIZE; i++)
{
printf("%02x", h_output[i]);
}
printf("\n");
printf("Last random ");
for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++)
{
printf("%02x", h_output[i]);
}
printf("\n");
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
// Cleanup.
cudaFree(d_input);
cudaFree(d_output);
free(h_input);
free(h_output);
return 0;
}
#define U8TO32_LITTLE(p) (*(const uint32_t *)(p))
typedef struct {
uint32_t a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p;
} BLOCK;
// Device helper: rotate left 32-bit value
__device__ __forceinline__ uint32_t _rotl(uint32_t x, int n) {
return (x << n) | (x >> (32 - n));
}
#define QROUND(a, b, c, d) \
d = _rotl(d ^ (a += b), 16); \
b = _rotl(b ^ (c += d), 12); \
d = _rotl(d ^ (a += b), 8); \
b = _rotl(b ^ (c += d), 7)
#define FROUND \
QROUND(x.d, x.h, x.l, x.p); \
QROUND(x.c, x.g, x.k, x.o); \
QROUND(x.b, x.f, x.j, x.n); \
QROUND(x.a, x.e, x.i, x.m); \
QROUND(x.a, x.f, x.k, x.p); \
QROUND(x.b, x.g, x.l, x.m); \
QROUND(x.c, x.h, x.i, x.n); \
QROUND(x.d, x.e, x.j, x.o)
#define FFINAL \
x.a += j.a; \
x.b += j.b; \
x.c += j.c; \
x.d += j.d; \
x.e += j.e; \
x.f += j.f; \
x.g += j.g; \
x.h += j.h; \
x.i += j.i; \
x.j += j.j; \
x.k += j.k; \
x.l += j.l; \
x.m += j.m; \
x.n += j.n; \
x.o += j.o; \
x.p += j.p
//in contains key/nonce
//offset is the match offset
//out_flag is the flag: ts << 32 | offset
__global__ void chacha8_encrypt_and_match(const uint8_t *in, size_t offset,
unsigned long long *out_flag,
const unsigned long long value,
int N)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
{
return;
}
size_t idx_offs = idx + offset;
if (idx_offs >= N)
{
return;
}
const int in_block_size = 32;
// Pointers to the key/IV for this thread.
const uint8_t *block_in = in + idx * in_block_size;
const uint8_t *block_offset_in = in + idx_offs * in_block_size;
const uint8_t *k = block_in;
const uint8_t *iv = block_offset_in;
const char constants[16] = {'e', 'x', 'p', 'a', 'n', 'd', ' ', '1', '6', '-', 'b', 'y', 't', 'e', ' ', 'k'};
uint32_t state[16];
state[4] = U8TO32_LITTLE(k + 0);
state[5] = U8TO32_LITTLE(k + 4);
state[6] = U8TO32_LITTLE(k + 8);
state[7] = U8TO32_LITTLE(k + 12);
state[8] = U8TO32_LITTLE(k + 0);
state[9] = U8TO32_LITTLE(k + 4);
state[10] = U8TO32_LITTLE(k + 8);
state[11] = U8TO32_LITTLE(k + 12);
state[0] = U8TO32_LITTLE(constants + 0);
state[1] = U8TO32_LITTLE(constants + 4);
state[2] = U8TO32_LITTLE(constants + 8);
state[3] = U8TO32_LITTLE(constants + 12);
state[12] = 0;
state[13] = 0;
state[14] = U8TO32_LITTLE(iv + 0);
state[15] = U8TO32_LITTLE(iv + 4);
BLOCK x;
BLOCK j;
memcpy(&j, state, sizeof(BLOCK)); //j is for final addition
j.m = 0;
j.n = 0;
memcpy(&x, &j, sizeof(BLOCK)); //FROUND will modify x
FROUND;
FROUND;
FROUND;
FROUND;
FFINAL;
uint64_t *result = (uint64_t *)&x;
if (*result == value) {
unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx;
*out_flag = encoded_offset_and_index;
}
}
__global__ void encrypt_and_search_offset(const uint8_t *in, size_t offset,
unsigned long long *out_flag,
const unsigned long long *masks,
const unsigned long long *values,
int numComb,
int N)
{
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
{
return;
}
size_t idx_offs = idx + offset;
if (idx_offs >= N)
{
return;
}
__shared__ unsigned char gf2_table_shared[256];
__shared__ unsigned char gf3_table_shared[256];
__shared__ unsigned char d_s_box_shared[256];
__shared__ unsigned int d_amul0_shared[256];
__shared__ unsigned int d_amul1_shared[256];
__shared__ unsigned int d_amul2_shared[256];
__shared__ unsigned int d_amul3_shared[256];
__shared__ unsigned long long shared_masks[MAX_MATCHES];
__shared__ unsigned long long shared_values[MAX_MATCHES];
if (threadIdx.x == 0) {
//copy gf2_table from const
memcpy(gf2_table_shared, gf2_table, 256);
memcpy(gf3_table_shared, gf3_table, 256);
memcpy(d_s_box_shared, d_s_box, 256);
memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int));
memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int));
memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int));
memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int));
for (int i = 0; i < numComb; i++)
{
shared_masks[i] = masks[i];
shared_values[i] = values[i];
}
}
__syncthreads(); // Make sure the data is loaded before use
const int in_block_size = 32;
// Pointers to the key/IV for this thread.
const uint8_t *block_in = in + idx * in_block_size;
const uint8_t *block_offset_in = in + idx_offs * in_block_size;
// Load key (16 bytes) and IV (16 bytes) into local arrays.
unsigned int key_local[4];
unsigned int iv_local[4];
const uint8_t *key_in = block_in;
const uint8_t *iv_in = block_offset_in;
memcpy(key_local, key_in, 16);
memcpy(iv_local, iv_in, 16);
//long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared,
d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared);
// compare with matches
unsigned long long in_val = bswap64(res);
unsigned int flag = 0;
for (int i = 0; i < numComb; i++) {
// diff is 0 if the masked input equals the expected value
unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i];
// Compute a branch-free match: returns nonzero if diff is zero.
flag |= (1 - ((diff | -diff) >> 63));
}
if (flag) {
//we found it, now find out exactly which one did we find
for (int i = 0; i < numComb; i++) {
if ((in_val & shared_masks[i]) == shared_values[i]) {
//idx: max 1 billion, use 30 bits
//offset: max 64 million, use 26
//numComb: max 128, use 7
unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1);
*out_flag = encoded_offset_and_index;
break;
}
}
}
}
void decode_offset_and_index(unsigned long long encoded_offset_and_index,
size_t *offset, size_t *index, size_t *matchPos)
{
//idx: max 1 billion, use 30 bits
//offset: max 64 million, use 26
//matchPos: max 128, use 7 bits
//final 1 bit to ensure we have true value
//how it was encoded:
//unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1);
//decode:
*index = (encoded_offset_and_index >> 34) & 0x3fffffff;
*offset = (encoded_offset_and_index >> 8) & 0x3ffffff;
*matchPos = (encoded_offset_and_index >> 1) & 0x7f;
}
__global__ void encrypt_and_search(const uint8_t *in, uint8_t *out, uint32_t k1, uint32_t k2, uint32_t k3, uint32_t k4,
int *out_flag,
const unsigned long long *masks,
const unsigned long long *values,
int numComb,
int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
// for (int i =0; i < 16; i++) {
// printf("%02x", in[i]);
// }
// printf("\n");
// Each input block: first 16 bytes = Key, next 16 bytes = IV.
const int in_block_size = 32; // 16-byte IV + zeroes
const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // block size
// Pointers to the key/IV for this thread.
const uint8_t *block_in = in + idx * in_block_size;
// Load key (16 bytes) and IV (16 bytes) into local arrays.
__shared__ unsigned int key_local[4];
__shared__ unsigned char gf2_table_shared[256];
__shared__ unsigned char gf3_table_shared[256];
__shared__ unsigned char d_s_box_shared[256];
__shared__ unsigned int d_amul0_shared[256];
__shared__ unsigned int d_amul1_shared[256];
__shared__ unsigned int d_amul2_shared[256];
__shared__ unsigned int d_amul3_shared[256];
__shared__ unsigned long long shared_masks[MAX_MATCHES];
__shared__ unsigned long long shared_values[MAX_MATCHES];
unsigned int iv_local[4];
if (threadIdx.x == 0) {
key_local[0] = k1;
key_local[1] = k2;
key_local[2] = k3;
key_local[3] = k4;
//copy gf2_table from const
memcpy(gf2_table_shared, gf2_table, 256);
memcpy(gf3_table_shared, gf3_table, 256);
memcpy(d_s_box_shared, d_s_box, 256);
memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int));
memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int));
memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int));
memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int));
for (int i = 0; i < numComb; i++)
{
shared_masks[i] = masks[i];
shared_values[i] = values[i];
}
}
__syncthreads(); // Make sure the data is loaded before use
const uint8_t *iv_in = block_in;
memcpy(iv_local, iv_in,16);
//long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared,
d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared);
// compare with matches
unsigned long long in_val = bswap64(res);
unsigned int flag = 0;
for (int i = 0; i < numComb; i++)
{
if ((in_val & masks[i]) == values[i])
{
*out_flag = 1;
flag = 1;
break;
}
}
#if 1
#ifdef NUM_MATCHES
#pragma unroll
for (int i = 0; i < NUM_MATCHES; i++) {
#else
for (int i = 0; i < numComb; i++) {
#endif
// diff is 0 if the masked input equals the expected value
unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i];
// Compute a branch-free match: returns nonzero if diff is zero.
flag |= (1 - ((diff | -diff) >> 63));
}
if (flag)
*out_flag = flag;
#endif
if (flag)
{
// Write the output ciphertext.
int out_offset = idx * out_block_size;
#pragma unroll
for (int i = 0; i < 8; i++) // only copy changed
{
out[out_offset + i] = (res >> (56 - i * 8)) & 0xFF;
}
}
}
int do_bruteforce_offset(const char *filename)
{
printf("Bruteforce Offset\n");
json config;
std::string json_file = filename;
std::string checkpoint = json_file + ".checkpoint.json";
std::ifstream f(filename);
if (!f.is_open())
{
fprintf(stderr, "Error: Could not open config file %s\n", filename);
return 1;
}
try
{
f >> config;
}
catch (json::parse_error &e)
{
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
return 1;
}
size_t *offsets = 0;
size_t offset_count = 0;
//open offset.txt
FILE *f_offset = fopen("offset.txt", "r");
if (f_offset == NULL)
{
fprintf(stderr, "Error: Could not open offset file\n");
return 1;
}
//read line by line, convert to integer
char line[256];
while (fgets(line, sizeof(line), f_offset))
{
offsets = (size_t *)realloc(offsets, (offset_count + 1) * sizeof(size_t));
offsets[offset_count] = strtoull(line, NULL, 10);
offset_count++;
}
fclose(f_offset);
printf("Offset count: %zu\n", offset_count);
//print first and last offset
printf("First offset: %zu\n", offsets[0]);
printf("Last offset: %zu\n", offsets[offset_count - 1]);
size_t num;
uint64_t t_start;
//size_t enc_count;
size_t offset;
size_t matches_size = 0;
uint64_t *matches = 0;
uint64_t *masks = 0;
try
{
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
num = config["count"].get<size_t>(); // stop at T3 + count ns
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
//enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
// "mathes": [
// {
// "plaintext": "0x00000000",
// "encrypted": "0x00000000",
// "bitmask": "0xffffffff"
// },
// {
// "plaintext": "0x00000000",
// "encrypted": "0x00000001",
// "bitmask": "0xffffffff"
// }
// ]
// parse matches, and put it in array (match = plaintext^encrypted)
if (config.contains("matches"))
{
matches_size = config["matches"].size();
assert(matches_size < MAX_MATCHES);
assert(matches_size > 0);
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
for (size_t i = 0; i < matches_size; i++)
{
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
matches[i] = plaintext ^ encrypted;
masks[i] = bitmask;
printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]);
}
}
}
catch (json::exception &e)
{
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
return 1;
}
printf("Configuration:\n");
printf("num: %zu\n", num);
printf("t_start: %lu\n", t_start);
//printf("enc_count: %zu\n", enc_count);
printf("offset: %zu\n", offset);
//printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
size_t input_size = num * 19 * sizeof(uint8_t);
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
// Allocate host memory.
uint8_t *h_input = (uint8_t *)malloc(input_size);
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
//size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t);
//uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size);
uint64_t start_fill = get_time_in_nanosecond();
fill_input(h_input, t_start, num); // GENERATE TS
uint64_t end_fill = get_time_in_nanosecond();
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
// Allocate device memory.
uint8_t *d_input, *d_output;
//uint8_t *d_output_enc;
unsigned long long *d_found;
unsigned long long *d_matches;
unsigned long long *d_masks;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
// if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess)
// {
// fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
// return 1;
// }
if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
return 1;
}
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
return 1;
}
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
return 1;
}
unsigned long long zero = 0;
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
// copy matches and masks
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
uint64_t start = get_time_in_nanosecond();
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
// Launch kernel.
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize;
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting timestamp -> random calculation...\n");
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
uint64_t end = get_time_in_nanosecond();
int limit = num - offset;
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
printf("Limit: %d\n", limit);
start = get_time_in_nanosecond();
uint64_t start_enc = get_time_in_nanosecond();
// set gridsize based on limit
gridSize = (limit + blockSize - 1) / blockSize;
for (size_t i = 0; i < offset_count; i++)
{
printf("Starting encryption and search at offset %zu (%zu)\n", i, offsets[i]);
encrypt_and_search_offset<<<gridSize, blockSize>>>(d_output, offsets[i], d_found, d_masks, d_matches, matches_size, limit);
//check for error
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
if (i > 0 && (i % 1) == 0)
{
uint64_t end_enc = get_time_in_nanosecond();
printf("Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
start_enc = get_time_in_nanosecond();
printf("Progress: %zu/%zu (testing: %zu)\n", i, offset_count, offsets[i]);
cudaDeviceSynchronize();
unsigned long long found = 0;
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
if (found)
{
//unsigned long long encoded_offset_and_index = (unsigned long long)offset << 31 | idx << 1 | 1;
size_t t_offset, t_index, match_pos;
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
printf("Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos);
//write to output.txt
FILE *f = fopen("output.txt", "a");
if (f == NULL)
{
printf("Error opening file!\n");
exit(1);
}
fprintf(f, "Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos);
fclose(f);
// reset for next
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
// err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err));
// return 1;
// }
}
//write checkpoint
config["start_timestamp"] = t_start + i;
config["index"] = i;
std::ofstream o(checkpoint);
o << std::setw(4) << config << std::endl;
}
}
cudaDeviceSynchronize();
long long found = 0;
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
if (found)
{
size_t t_offset, t_index, match_pos;
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
printf("Found at offset=%zu ts = %zu + %zu filename = %s match indxex = %zu\n", t_offset, t_start, t_index, filename, match_pos);
}
uint64_t end_enc = get_time_in_nanosecond();
printf("Enc Offset list Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
return 0;
}
int do_bruteforce_new(const char *filename)
{
json config;
std::string json_file = filename;
std::string checkpoint = json_file + ".checkpoint.json";
std::ifstream f(filename);
if (!f.is_open())
{
fprintf(stderr, "Error: Could not open config file %s\n", filename);
return 1;
}
try
{
f >> config;
}
catch (json::parse_error &e)
{
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
return 1;
}
size_t num;
uint64_t t_start;
size_t enc_count;
size_t offset;
size_t matches_size = 0;
uint64_t *matches = 0;
uint64_t *masks = 0;
char **matches_filename = 0;
try
{
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
num = config["count"].get<size_t>(); // stop at T3 + count ns
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
// parse matches, and put it in array (match = plaintext^encrypted)
if (config.contains("matches"))
{
matches_size = config["matches"].size();
assert(matches_size < MAX_MATCHES);
assert(matches_size > 0);
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
matches_filename = (char **)malloc(matches_size * sizeof(char *));
for (size_t i = 0; i < matches_size; i++)
{
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
std::string filename = config["matches"][i]["filename"].get<std::string>();
//strdup filename
matches_filename[i] = strdup(filename.c_str());
matches[i] = plaintext ^ encrypted;
masks[i] = bitmask;
printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]);
}
}
}
catch (json::exception &e)
{
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
return 1;
}
printf("Configuration:\n");
printf("num: %zu\n", num);
printf("t_start: %lu\n", t_start);
printf("t_end %lu\n", t_start + num);
printf("enc_count: %zu\n", enc_count);
printf("offset: %zu\n", offset);
printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("GPU RAM for random %.2f MB\n", output_size / 1024.0 / 1024.0);
// Allocate device memory.
uint8_t *d_output;
unsigned long long *d_found;
unsigned long long *d_matches;
unsigned long long *d_masks;
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
if (cudaMalloc(&d_found, sizeof(unsigned long long )) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
return 1;
}
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
return 1;
}
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
return 1;
}
unsigned long long zero = 0;
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
// copy matches and masks
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
uint64_t start = get_time_in_nanosecond();
// Launch kernel.
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize;
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting timestamp -> random calculation...: %zu nanoseconds\n", num);
multihash_kernel_in_memory<<<gridSize, blockSize>>>(d_output, t_start, num);
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
uint64_t end = get_time_in_nanosecond();
int limit = num - enc_count - offset;
printf("Total Time to translate timestamp to random %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
printf("Limit: %d\n", limit);
start = get_time_in_nanosecond();
uint64_t start_enc = get_time_in_nanosecond();
// set gridsize based on limit
gridSize = (limit + blockSize - 1) / blockSize;
uint64_t elapsed = 0;
for (size_t i = 0; i < enc_count; i++)
{
// printf("Starting encryption and search at offset %d\n", i);
encrypt_and_search_offset<<<gridSize, blockSize>>>(d_output, offset + i, d_found, d_masks, d_matches, matches_size, limit);
// print error
// cudaError_t err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
// return 1;
// }
#define SKIP_CHECK 100
if (i > 0 && (i % SKIP_CHECK) == 0)
{
uint64_t end_enc = get_time_in_nanosecond();
printf("Enc Time (%zu): %f ms\n", i, (end_enc - start_enc) / 1000000.0);
elapsed += (end_enc - start_enc);
printf("%.2f minutes elapsed, ", elapsed / 1000000000.0 / 60.0);
//remaining
double remaining = (enc_count - i) * ((end_enc - start_enc)/SKIP_CHECK);
printf("%.2f minutes remaining (%.2f hours). ", remaining / 1000000000.0 / 60.0, remaining / 1000000000.0 / 3600.0);
start_enc = get_time_in_nanosecond();
printf("Progress: %zu/%zu (testing: %zu)\n", i, enc_count, offset + i);
cudaDeviceSynchronize();
// //print error
// err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err));
// return 1;
// }
unsigned long long found = 0;
cudaMemcpy(&found, d_found, sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
// err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err));
// return 1;
// }
if (found)
{
size_t t_offset, t_index, match_pos;
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
printf("Found at offset=%zu ts = %zu + %zu config_file = %s match_index %zu, file : %s\n", t_offset, t_start, t_index, filename, match_pos,
matches_filename[match_pos]);
printf("Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos,
matches_filename[match_pos]);
//write to output.txt
FILE *f = fopen("output.txt", "a");
if (f == NULL)
{
printf("Error opening file!\n");
exit(1);
}
fprintf(f, "Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos,
matches_filename[match_pos]);
fclose(f);
// reset for next
cudaMemcpy(d_found, &zero, sizeof(unsigned long long ), cudaMemcpyHostToDevice);
// err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err));
// return 1;
// }
}
//write checkpoint
config["offset"] = offset + i;
config["index"] = i;
std::ofstream o(checkpoint);
o << std::setw(4) << config << std::endl;
}
}
cudaDeviceSynchronize();
// //print error
// err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err));
// return 1;
// }
unsigned long long found = 0;
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err));
return 1;
}
if (found)
{
size_t t_offset, t_index, match_pos;
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
printf("Found at offset=%zu ts = %zu + %zu filename = %s\n", t_offset, t_start, t_index, filename);
}
uint64_t end_enc = get_time_in_nanosecond();
printf("Offset Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
return 0;
}
int do_bruteforce(const char *filename)
{
json config;
std::string json_file = filename;
std::string checkpoint = json_file + "checkpoint.json";
std::ifstream f(filename);
if (!f.is_open())
{
fprintf(stderr, "Error: Could not open config file %s\n", filename);
return 1;
}
try
{
f >> config;
}
catch (json::parse_error &e)
{
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
return 1;
}
size_t num;
uint64_t t_start;
size_t enc_count;
size_t offset;
size_t matches_size = 0;
uint64_t *matches = 0;
uint64_t *masks = 0;
try
{
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
num = config["count"].get<size_t>(); // stop at T3 + count ns
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
// parse matches, and put it in array (match = plaintext^encrypted)
if (config.contains("matches"))
{
matches_size = config["matches"].size();
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
for (size_t i = 0; i < matches_size; i++)
{
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
matches[i] = plaintext ^ encrypted;
printf("Match %zu: %016lx\n", i, matches[i]);
masks[i] = bitmask;
printf("Mask %zu: %016lx\n", i, masks[i]);
}
}
}
catch (json::exception &e)
{
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
return 1;
}
printf("Configuration:\n");
printf("num: %zu\n", num);
printf("t_start: %lu\n", t_start);
printf("enc_count: %zu\n", enc_count);
printf("offset: %zu\n", offset);
printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
size_t input_size = num * 19 * sizeof(uint8_t);
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
// Allocate host memory.
uint8_t *h_input = (uint8_t *)malloc(input_size);
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t);
uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size);
uint64_t start_fill = get_time_in_nanosecond();
fill_input(h_input, t_start, num); // GENERATE TS
uint64_t end_fill = get_time_in_nanosecond();
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
// Allocate device memory.
uint8_t *d_input, *d_output, *d_output_enc;
int *d_found;
unsigned long long *d_matches;
unsigned long long *d_masks;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
return 1;
}
if (cudaMalloc(&d_found, sizeof(int)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
return 1;
}
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
return 1;
}
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
return 1;
}
long zero = 0;
cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice);
// copy matches and masks
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
uint64_t start = get_time_in_nanosecond();
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
// Launch kernel.
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize;
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting timestamp -> random calculation...\n");
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
uint64_t end = get_time_in_nanosecond();
// //print first hash
printf("First hash: ");
for (int i = 0; i < 16; i++)
{
printf("%02x", h_output[i]);
}
printf("\n");
// //print 2nd hash
printf("+offset hash: ");
for (int i = 0; i < 16; i++)
{
printf("%02x", h_output[SHA256_DIGEST_SIZE * offset + i]);
}
printf("\n");
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
size_t limit = num - enc_count - offset;
printf("Limit: %zu\n", limit);
start = get_time_in_nanosecond();
int gridSizeEnc = (enc_count + blockSize - 1) / blockSize;
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpuIndex);
int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block
blockSize = maxThreadsPerBlock; // Dynamically set blockSize
gridSizeEnc = (enc_count + blockSize - 1) / blockSize; // Ensure full coverage
printf("Encryption Using blockSize = %d, gridSize = %d\n", blockSize, gridSizeEnc);
uint64_t start_enc = get_time_in_nanosecond();
for (size_t i = 0; i < limit; i++)
{
// launch test_kcipher2_kernel_single
uint32_t *host_i32 = (uint32_t *)(h_output + i * 32);
uint32_t k1 = host_i32[0];
uint32_t k2 = host_i32[1];
uint32_t k3 = host_i32[2];
uint32_t k4 = host_i32[3];
if (i > 0 && i % 10000 == 0)
{
uint64_t end_enc = get_time_in_nanosecond();
printf("10000 ns Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
printf("Speed for 10000 ns: %f enc per second\n", 10000 / ((end_enc - start_enc) / 1000000000.0));
start_enc = get_time_in_nanosecond();
printf("Processing %zu: T3 = %lu\n", i, t_start + i);
fflush(stdout);
// printf("Current key: %08x %08x %08x %08x\n", k1, k2, k3, k4);
}
if ((i % 100000) == 0)
{
// create checkpoint
// modify json with current time
config["start_timestamp"] = t_start + i;
std::ofstream o(checkpoint);
o << std::setw(4) << config << std::endl;
}
encrypt_and_search<<<gridSizeEnc, blockSize>>>(d_output + (offset + i) * 32, d_output_enc, k1, k2, k3, k4, d_found, d_masks, d_matches, matches_size, enc_count);
cudaDeviceSynchronize();
// copy match
int found;
cudaMemcpy(&found, d_found, sizeof(int), cudaMemcpyDeviceToHost);
if (found)
{
// only copy the large block if we found a match
cudaMemcpy(h_output_enc, d_output_enc, enc_output_size, cudaMemcpyDeviceToHost);
// reset found flag for next match
cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice);
// reset d_output_enc
cudaMemset(d_output_enc, 0, enc_output_size);
uint64_t *host_i64 = (uint64_t *)(h_output_enc);
for (size_t j = 0; j < enc_count; j++)
{
for (size_t k = 0; k < matches_size; k++)
{
if ((*host_i64 & masks[k]) == matches[k])
{
printf("Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j);
// open file and write to it
FILE *out = fopen("output.txt", "a");
if (out != NULL)
{
fprintf(out, "Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j);
fclose(out);
}
}
}
host_i64 += 4;
}
}
}
end = get_time_in_nanosecond();
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f enc per second\n", enc_count / ((end - start) / 1000000000.0));
// Cleanup.
cudaFree(d_input);
cudaFree(d_output);
free(h_input);
free(h_output);
return 0;
}
//this will leak fd, but we don't care
uint8_t *load_random(const char *filename, size_t *size)
{
//mmap the file
int fd;
struct stat sb;
uint8_t *addr;
fd = open(filename, O_RDONLY);
if (fd == -1)
{
perror("open");
return 0;
}
//get file size using stat
if (fstat(fd, &sb) == -1)
{
close(fd);
perror("fstat");
return 0;
}
*size = sb.st_size;
addr = (uint8_t *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (addr == MAP_FAILED)
{
close(fd);
perror("mmap");
return 0;
}
return addr;
}
uint8_t *hex_to_bytes(const char *search_hex)
{
if (strlen(search_hex)!=32) {
printf("Search hex must be 32 bytes hex characters\n");
return 0;
}
uint8_t * search = (uint8_t *)malloc(16);
//convert search_hex (32 hex character, no space) to search
for (int i = 0; i < 16; i++)
{
char tmp[3];
tmp[0] = search_hex[i * 2];
tmp[1] = search_hex[i * 2 + 1];
tmp[2] = 0;
search[i] = strtol(tmp, NULL, 16);
}
printf("Searching: ");
//hexdump to verify
for (int i = 0; i < 16; i++)
{
printf("%02x", search[i]);
}
printf("\n");
//for every 4 byte do: __builtin_bswap32
uint32_t *search_i32 = (uint32_t *)search;
for (int i = 0; i < 4; i++)
{
search_i32[i] = __builtin_bswap32(search_i32[i]);
}
return search;
}
int search_random(uint8_t *addr, size_t size, const char *search_hex)
{
if (!addr || !search_hex) {
return -1;
}
uint8_t *search = hex_to_bytes(search_hex);
if (!search) {
return -1;
}
//now search the file using memmem
void * pos = memmem(addr, size, search, 16);
int found_pos = 1;
if (pos ) {
printf("Found at index %ld\n", ((uint8_t *)pos - addr)/16);
found_pos = ((uint8_t *)pos - addr)/16;
} else {
printf("Not found\n");
}
return found_pos;
}
//hash computation in GPU, chacha8 in CPU
int test_chacha8_speed(size_t num)
{
printf("Test random + Chacha8 : %lu\n", num);
size_t input_size = num * 19 * sizeof(uint8_t);
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
// Allocate host memory.
uint8_t *h_input = (uint8_t *)malloc(input_size);
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
uint64_t start_fill = get_time_in_nanosecond();
fill_input(h_input, t_start, num);
uint64_t end_fill = get_time_in_nanosecond();
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
// Allocate device memory.
uint8_t *d_input, *d_output;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
uint64_t start = get_time_in_nanosecond();
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting timestamp -> random calculation...\n");
fflush(stdout);
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
uint64_t end = get_time_in_nanosecond();
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
// Cleanup.
cudaFree(d_input);
cudaFree(d_output);
uint8_t input[64];
uint8_t output[64];
memset(input, 0, 64);
start = get_time_in_nanosecond();
for (size_t i = 0; i < num; i++) {
uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE;
uint8_t *iv = h_output + i * SHA256_DIGEST_SIZE;
chacha8_ctx ctx;
chacha8_keysetup(&ctx, seed, iv);
chacha8_get_keystream_oneblock(&ctx, output);
}
end = get_time_in_nanosecond();
printf("Enc Total Time: %f ms\n", (end - start) / 1000000.0);
//speed
printf("Speed: %f enc per second\n", num / ((end - start) / 1000000000.0));
return 0;
}
void hexdump(const char *title, const uint8_t *data, size_t size)
{
printf("%s: ", title);
const uint8_t *p = data;
for (size_t i = 0; i < size; i++) {
printf("%02x ", p[i]);
}
printf("\n");
}
int bruteforce_chacha(const char *filename)
{
std::string json_file = filename;
std::string checkpoint = json_file + "checkpoint.json";
uint64_t t3_ts;
uint64_t t3_t1_offset;
uint64_t t1_t2_start_offset;
uint64_t t1_t2_end_offset;
uint64_t encrypted, plaintext, value;
//parse JSON
std::ifstream f(filename);
if (!f.is_open())
{
fprintf(stderr, "Error: Could not open config file %s\n", filename);
return 1;
}
json config;
try
{
f >> config;
t3_ts = config["t3_ts"].get<uint64_t>(); //start of kcipher2
t3_t1_offset = config["t3_t1_offset"].get<uint64_t>(); //how far from T3 do we want to start our timestamp
t1_t2_start_offset = config["t1_t2_start_offset"].get<uint64_t>(); //start offset to test for t1-t2
t1_t2_end_offset = config["t1_t2_end_offset"].get<uint64_t>(); //end offset to test for t1-t2
encrypted = std::stoull(config["encrypted"].get<std::string>(), 0, 16);
printf("Encrypted: %016lx\n", encrypted);
plaintext = std::stoull(config["plaintext"].get<std::string>(), 0, 16);
printf("Plaintext: %016lx\n", plaintext);
value = encrypted ^ plaintext;
printf("Value: %016lx\n", value);
}
catch (json::parse_error &e)
{
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
return 1;
}
size_t num = t3_t1_offset + t1_t2_end_offset;
size_t input_size = num * 19 * sizeof(uint8_t);
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
// Allocate host memory.
uint8_t *h_input = (uint8_t *)malloc(input_size);
uint64_t t_start = t3_ts - t3_t1_offset;
printf("t_start: %lu\n", t_start);
uint64_t t_end = t_start + num;
printf("t_end: %lu\n", t_end);
// output
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
uint8_t *h_output = (uint8_t *)malloc(output_size);
uint64_t start_fill = get_time_in_nanosecond();
fill_input(h_input, t_start, num);
uint64_t end_fill = get_time_in_nanosecond();
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
// Allocate device memory.
uint8_t *d_input, *d_output;
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
return 1;
}
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
return 1;
}
uint64_t start = get_time_in_nanosecond();
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
int blockSize = 256;
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
printf("Starting seed calculation...\n");
fflush(stdout);
multihash_kernel_noswap<<<gridSize, blockSize>>>(d_input, d_output, num);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
return 1;
}
cudaDeviceSynchronize();
uint64_t end1 = get_time_in_nanosecond();
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
uint64_t end = get_time_in_nanosecond();
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
// print speed per second
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
// Cleanup.
cudaFree(d_input);
size_t num_loop = t3_t1_offset;
blockSize = 256;
gridSize = (num_loop + blockSize - 1) / blockSize; // Ensure full coverage
unsigned long long *d_found;
if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess)
{
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
return 1;
}
start = get_time_in_nanosecond();
for (size_t offset = t1_t2_start_offset; offset < t1_t2_end_offset; offset++)
{
size_t n = offset - t1_t2_start_offset;
if ((n % 500)==0) {
end1 = get_time_in_nanosecond();
printf("Time: %f ms\n", (end1 - start) / 1000000.0);
double offset_per_second = n / ((end1 - start) / 1000000000.0);
printf("Speed: %f offsets per second\n", offset_per_second);
printf("Processing offset %zu elapsed %f minutes\n", offset, (end1 - start) / 1000000000.0 / 60.0);
//remaining
double remaining = (t1_t2_end_offset - offset) / offset_per_second;
printf("Remaining: %zu minutes\n", (size_t)(remaining / 60.0));
}
#if 0
printf("ts = %zu\n", t_start);
printf("Tp Offset: %zu ts = %zu\n", offset, t_start + offset);
uint8_t output[64];
uint64_t *val = (uint64_t *)output;
for (size_t i = 0; i < num_loop; i++ ) {
uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE;
//hexdump("Seed", seed, 32);
uint8_t *iv = h_output + (i + offset) * SHA256_DIGEST_SIZE;
//hexdump("IV", iv, 16);
chacha8_ctx ctx;
chacha8_keysetup(&ctx, seed, iv);
chacha8_get_keystream_oneblock(&ctx, output);
//hexdump output
hexdump("Output", output, 64);
if (*val == value) {
printf("Found at time %zu offset = %zu\n", i, offset);
//save to file output.txt
FILE *out = fopen("output.txt", "a");
if (out != NULL)
{
fprintf(out, "Found at time %zu offset = %zu\n", i, offset);
fclose(out);
}
return 0;
}
//break;
}
//break;
#else
//test using GPU
chacha8_encrypt_and_match<<<gridSize, blockSize>>>(d_output, offset, d_found, value, num_loop);
// //check for errors
// cudaError_t err = cudaGetLastError();
// if (err != cudaSuccess)
// {
// fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
// return 1;
// }
//sync
if (n % 1000 == 0) {
cudaDeviceSynchronize();
unsigned long long found = 0;
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
if (found) {
// unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx;
size_t t_offset, t_index;
t_offset = found >> 32;
t_index = found & 0xffffffff;
printf("Found at offset %zu index %zu\n", t_offset, t_index + t_start);
//write to file
FILE *out = fopen("output.txt", "a");
if (out != NULL)
{
fprintf(out, "Found at offset %zu index %zu\n", t_offset, t_index + t_start);
fclose(out);
}
return 1;
}
}
#endif
}
return 0;
}
int main(int argc, char *argv[])
{
size_t num = 128 * 1000;
int smCount;
cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, gpuIndex);
printf("SM count: %d\n", smCount);
if (argc > 1)
{
if (strcmp(argv[1], "random") == 0) //test generate random from timestamp
{
if (argc > 2)
{
num = atoll(argv[2]);
}
return test_generate_random_only(num);
}
if (strcmp(argv[1], "random-gpu") == 0) //test generate random from timestamp, using GPU
{
if (argc > 2)
{
num = atoll(argv[2]);
}
return test_generate_random_only_in_gpu(num);
}
if (strcmp(argv[1], "saverandom") == 0)
{
uint64_t start_time = get_time_in_nanosecond();
if (argc > 1)
{
start_time = atoll(argv[2]);
char tmp[16];
snprintf(tmp, sizeof(tmp), "%lu", start_time);
if (strlen(tmp)<19) {
start_time = start_time * 1000000000;
}
}
char filename[256];
snprintf(filename, 256, "random_%lu.bin", start_time);
return save_random(start_time, filename);
}
if (strcmp(argv[1], "search") == 0) {
if (argc > 3) {
const char *filename = argv[2];
const char *search_hex = argv[3];
size_t size;
uint8_t *addr = load_random(filename, &size);
if (addr) {
int res = search_random(addr, size, search_hex);
if (argc > 4) {
const char *search_hex2 = argv[4];
int res2 = search_random(addr, size, search_hex2);
printf("Diff: %d\n", res2-res);
}
}
} else {
printf("usage: search random.bin 16_byte_hex_sequece_no_space\n");
}
}
if (strcmp(argv[1], "enc") == 0) //test encryption only
{
if (argc > 2)
{
num = atoll(argv[2]);
}
return test_encryption_only(num);
}
if (strcmp(argv[1], "chacha8") == 0) //test chacha8 only
{
if (argc > 2)
{
num = atoll(argv[2]);
}
return test_chacha8_speed(num);
}
if (strcmp(argv[1], "runchacha") == 0 || strcmp(argv[1], "runchacha8") == 0) //run chacha8 bruteforce
{
if (argc > 2)
{
if (argc > 3)
{
gpuIndex = atoi(argv[3]);
}
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (gpuIndex < 0 || gpuIndex >= deviceCount)
{
printf("Invalid GPU index %d\n", gpuIndex);
return EXIT_FAILURE;
}
cudaSetDevice(gpuIndex);
printf("Using GPU %d\n", gpuIndex);
return bruteforce_chacha(argv[2]);
} else {
printf("Usage: runchacha config.json\n");
}
}
//main bruteforce loop
if (strcmp(argv[1], "run") == 0 || strcmp(argv[1], "run2")==0 || strcmp(argv[1], "run3") == 0)
{
// read config from JSON file
if (argc > 2)
{
if (argc > 3)
{
gpuIndex = atoi(argv[3]);
}
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (gpuIndex < 0 || gpuIndex >= deviceCount)
{
printf("Invalid GPU index %d\n", gpuIndex);
return EXIT_FAILURE;
}
cudaSetDevice(gpuIndex);
printf("Using GPU %d\n", gpuIndex);
if (strcmp(argv[1], "run") == 0)
{
return do_bruteforce(argv[2]); //this is the slow method
}
else if (strcmp(argv[1], "run2") == 0)
{
return do_bruteforce_new(argv[2]); //this is the faster method
} else if (strcmp(argv[1], "run3") == 0) {
return do_bruteforce_offset(argv[2]); //this will read values from offset.txt instead of using ranges
}
}
printf("Please specify the JSON config file\n");
}
}
return 0;
}