3282 lines
100 KiB
Plaintext
3282 lines
100 KiB
Plaintext
#include <cuda_runtime.h>
|
||
#include <stdio.h>
|
||
#include <stdint.h>
|
||
#include <stdlib.h>
|
||
#include <fstream>
|
||
#include <unistd.h>
|
||
//stat
|
||
#include <sys/types.h>
|
||
#include <sys/stat.h>
|
||
//mmap
|
||
#include <sys/mman.h>
|
||
#include <fcntl.h>
|
||
#include "json.hpp"
|
||
#include "test-ts.h"
|
||
#include "akira-bruteforce.h"
|
||
#include "chacha8.h"
|
||
|
||
//by design, max matches is 127
|
||
#define MAX_MATCHES 32
|
||
|
||
using json = nlohmann::json;
|
||
|
||
int gpuIndex = 0;
|
||
|
||
#define SHA256_DIGEST_SIZE 32
|
||
|
||
// --- SHA-256 Device Implementation ---
|
||
|
||
// Rotate right.
|
||
__device__ __forceinline__ uint32_t rotr(uint32_t x, uint32_t n)
|
||
{
|
||
//return (x >> n) | (x << (32 - n));
|
||
return __funnelshift_r(x, x, n );
|
||
|
||
}
|
||
|
||
|
||
#define SHR(x, n) ((x) >> (n))
|
||
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
|
||
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
|
||
#define Sigma0(x) (rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22))
|
||
#define Sigma1(x) (rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25))
|
||
#define sigma0(x) (rotr(x, 7) ^ rotr(x, 18) ^ SHR(x, 3))
|
||
#define sigma1(x) (rotr(x, 17) ^ rotr(x, 19) ^ SHR(x, 10))
|
||
|
||
// Process one 64-byte block.
|
||
__device__ __forceinline__ void sha256_transform(const uint8_t *data, uint32_t state[8], const uint32_t *k)
|
||
{
|
||
uint32_t w[64];
|
||
#pragma unroll
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
w[i] = ((uint32_t)data[i * 4] << 24) |
|
||
((uint32_t)data[i * 4 + 1] << 16) |
|
||
((uint32_t)data[i * 4 + 2] << 8) |
|
||
((uint32_t)data[i * 4 + 3]);
|
||
}
|
||
for (int i = 16; i < 64; i++)
|
||
{
|
||
w[i] = sigma1(w[i - 2]) + w[i - 7] + sigma0(w[i - 15]) + w[i - 16];
|
||
}
|
||
|
||
uint32_t a = state[0];
|
||
uint32_t b = state[1];
|
||
uint32_t c = state[2];
|
||
uint32_t d = state[3];
|
||
uint32_t e = state[4];
|
||
uint32_t f = state[5];
|
||
uint32_t g = state[6];
|
||
uint32_t h = state[7];
|
||
|
||
for (int i = 0; i < 64; i++)
|
||
{
|
||
uint32_t T1 = h + Sigma1(e) + Ch(e, f, g) + k[i] + w[i];
|
||
uint32_t T2 = Sigma0(a) + Maj(a, b, c);
|
||
h = g;
|
||
g = f;
|
||
f = e;
|
||
e = d + T1;
|
||
d = c;
|
||
c = b;
|
||
b = a;
|
||
a = T1 + T2;
|
||
}
|
||
|
||
state[0] += a;
|
||
state[1] += b;
|
||
state[2] += c;
|
||
state[3] += d;
|
||
state[4] += e;
|
||
state[5] += f;
|
||
state[6] += g;
|
||
state[7] += h;
|
||
}
|
||
|
||
// For a fixed 19-byte input, the padded message is exactly 64 bytes.
|
||
// Padding: message (19 bytes) || 0x80 || (zeros up to byte 56) || [0x0000000000000098]
|
||
// (19*8 = 152, or 0x98)
|
||
__device__ void sha256_hash_19(const uint8_t *msg, uint8_t *digest, const uint32_t *k)
|
||
{
|
||
uint32_t state[8] = {
|
||
0x6a09e667,
|
||
0xbb67ae85,
|
||
0x3c6ef372,
|
||
0xa54ff53a,
|
||
0x510e527f,
|
||
0x9b05688c,
|
||
0x1f83d9ab,
|
||
0x5be0cd19};
|
||
|
||
|
||
uint8_t block[64];
|
||
|
||
#pragma unroll
|
||
for (int i = 0; i < 19; i++)
|
||
block[i] = msg[i];
|
||
block[19] = 0x80;
|
||
#pragma unroll
|
||
for (int i = 20; i < 56; i++)
|
||
block[i] = 0;
|
||
// Append message length in bits: 152 = 0x0000000000000098 (big-endian)
|
||
block[56] = 0;
|
||
block[57] = 0;
|
||
block[58] = 0;
|
||
block[59] = 0;
|
||
block[60] = 0;
|
||
block[61] = 0;
|
||
block[62] = 0;
|
||
block[63] = 152;
|
||
|
||
sha256_transform(block, state, k);
|
||
|
||
#pragma unroll
|
||
for (int i = 0; i < 8; i++)
|
||
{
|
||
digest[i * 4] = (state[i] >> 24) & 0xff;
|
||
digest[i * 4 + 1] = (state[i] >> 16) & 0xff;
|
||
digest[i * 4 + 2] = (state[i] >> 8) & 0xff;
|
||
digest[i * 4 + 3] = state[i] & 0xff;
|
||
}
|
||
}
|
||
|
||
// For a fixed 68-byte input, the padded message consists of 2 blocks.
|
||
// Block 1: first 64 bytes of the input.
|
||
// Block 2: remaining 4 bytes || 0x80 || zeros until byte 56 || [bit length = 544 bits]
|
||
__device__ void sha256_hash_68(const uint8_t * msg, uint8_t *digest, const uint32_t * k)
|
||
{
|
||
uint32_t state[8] = {
|
||
0x6a09e667,
|
||
0xbb67ae85,
|
||
0x3c6ef372,
|
||
0xa54ff53a,
|
||
0x510e527f,
|
||
0x9b05688c,
|
||
0x1f83d9ab,
|
||
0x5be0cd19};
|
||
|
||
uint8_t block2[64];
|
||
|
||
// Block 1: copy first 64 bytes.
|
||
|
||
//uint8_t block[64];
|
||
#pragma unroll
|
||
for (int i = 0; i < 64; i++)
|
||
block2[i] = msg[i];
|
||
sha256_transform(block2, state, k);
|
||
|
||
|
||
// Block 2:
|
||
// First 4 bytes: remainder of message.
|
||
#pragma unroll
|
||
for (int i = 0; i < 4; i++)
|
||
block2[i] = msg[64 + i];
|
||
block2[4] = 0x80;
|
||
#pragma unroll
|
||
for (int i = 5; i < 56; i++)
|
||
block2[i] = 0;
|
||
// Append message length in bits: 68*8 = 544 = 0x0000000000000220 (big-endian)
|
||
block2[56] = 0;
|
||
block2[57] = 0;
|
||
block2[58] = 0;
|
||
block2[59] = 0;
|
||
block2[60] = 0;
|
||
block2[61] = 0;
|
||
block2[62] = 0x02;
|
||
block2[63] = 0x20;
|
||
|
||
sha256_transform(block2, state, k);
|
||
|
||
#pragma unroll
|
||
for (int i = 0; i < 8; i++)
|
||
{
|
||
digest[i * 4] = (state[i] >> 24) & 0xff;
|
||
digest[i * 4 + 1] = (state[i] >> 16) & 0xff;
|
||
digest[i * 4 + 2] = (state[i] >> 8) & 0xff;
|
||
digest[i * 4 + 3] = state[i] & 0xff;
|
||
}
|
||
}
|
||
|
||
// Minimal AES-256 context and helper functions.
|
||
typedef struct
|
||
{
|
||
uint32_t rk[60]; // 60 words for 14 rounds + initial key
|
||
} aes256_ctx;
|
||
|
||
__device__ __forceinline__ uint32_t SubWord(uint32_t word, const uint8_t *sbox)
|
||
{
|
||
return ((uint32_t)sbox[(word >> 24)] << 24) |
|
||
((uint32_t)sbox[(word >> 16) & 0xff] << 16) |
|
||
((uint32_t)sbox[(word >> 8) & 0xff] << 8) |
|
||
((uint32_t)sbox[word & 0xff]);
|
||
}
|
||
|
||
__device__ inline uint32_t RotWord(uint32_t word)
|
||
{
|
||
//return (word << 8) | (word >> 24);
|
||
return __funnelshift_l(word, word, 8);
|
||
|
||
}
|
||
|
||
// Key expansion for AES-256.
|
||
__device__ void aes256_set_encrypt_key(const uint8_t userKey[32], aes256_ctx *ctx, const uint8_t *sbox, const uint32_t *Rcon)
|
||
{
|
||
int i = 0;
|
||
// Copy the 256-bit key into the first 8 words.
|
||
for (i = 0; i < 8; i++)
|
||
{
|
||
ctx->rk[i] = ((uint32_t)userKey[4 * i] << 24) |
|
||
((uint32_t)userKey[4 * i + 1] << 16) |
|
||
((uint32_t)userKey[4 * i + 2] << 8) |
|
||
((uint32_t)userKey[4 * i + 3]);
|
||
}
|
||
int rcon_i = 0;
|
||
|
||
for (i = 8; i < 60; i++)
|
||
{
|
||
uint32_t temp = ctx->rk[i - 1];
|
||
if ((i % 8) == 0)
|
||
{
|
||
temp = SubWord(RotWord(temp), sbox) ^ Rcon[rcon_i++];
|
||
}
|
||
else if ((i % 8) == 4)
|
||
{
|
||
temp = SubWord(temp, sbox);
|
||
}
|
||
ctx->rk[i] = ctx->rk[i - 8] ^ temp;
|
||
}
|
||
}
|
||
|
||
|
||
// AES block encryption for a single 16-byte block.
|
||
__device__ void aes256_encrypt(const aes256_ctx *ctx, const uint8_t in[16], uint8_t out[16], const uint8_t *sbox, const uint8_t *xtime)
|
||
{
|
||
uint8_t state[16];
|
||
// Copy input into state.
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
state[i] = in[i];
|
||
}
|
||
// Initial AddRoundKey.
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
uint32_t rk = ctx->rk[i];
|
||
state[4 * i + 0] ^= (uint8_t)(rk >> 24);
|
||
state[4 * i + 1] ^= (uint8_t)(rk >> 16);
|
||
state[4 * i + 2] ^= (uint8_t)(rk >> 8);
|
||
state[4 * i + 3] ^= (uint8_t)(rk);
|
||
}
|
||
// Main rounds.
|
||
for (int round = 1; round < 14; round++)
|
||
{
|
||
// SubBytes.
|
||
for (int i = 0; i < 16; i++)
|
||
state[i] = sbox[state[i]];
|
||
// ShiftRows.
|
||
uint8_t tmp[16];
|
||
tmp[0] = state[0];
|
||
tmp[1] = state[5];
|
||
tmp[2] = state[10];
|
||
tmp[3] = state[15];
|
||
|
||
tmp[4] = state[4];
|
||
tmp[5] = state[9];
|
||
tmp[6] = state[14];
|
||
tmp[7] = state[3];
|
||
|
||
tmp[8] = state[8];
|
||
tmp[9] = state[13];
|
||
tmp[10] = state[2];
|
||
tmp[11] = state[7];
|
||
|
||
tmp[12] = state[12];
|
||
tmp[13] = state[1];
|
||
tmp[14] = state[6];
|
||
tmp[15] = state[11];
|
||
|
||
// MixColumns.
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
int col = 4 * i;
|
||
uint8_t a0 = tmp[col + 0], a1 = tmp[col + 1],
|
||
a2 = tmp[col + 2], a3 = tmp[col + 3];
|
||
uint8_t r0 = xtime[a0] ^ (a1 ^ xtime[a1]) ^ a2 ^ a3;
|
||
uint8_t r1 = a0 ^ xtime[a1] ^ (a2 ^ xtime[a2]) ^ a3;
|
||
uint8_t r2 = a0 ^ a1 ^ xtime[a2] ^ (a3 ^ xtime[a3]);
|
||
uint8_t r3 = (a0 ^ xtime[a0]) ^ a1 ^ a2 ^ xtime[a3];
|
||
tmp[col + 0] = r0;
|
||
tmp[col + 1] = r1;
|
||
tmp[col + 2] = r2;
|
||
tmp[col + 3] = r3;
|
||
}
|
||
// Copy back to state.
|
||
for (int i = 0; i < 16; i++)
|
||
state[i] = tmp[i];
|
||
// AddRoundKey.
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
uint32_t rk = ctx->rk[round * 4 + i];
|
||
state[4 * i + 0] ^= (uint8_t)(rk >> 24);
|
||
state[4 * i + 1] ^= (uint8_t)(rk >> 16);
|
||
state[4 * i + 2] ^= (uint8_t)(rk >> 8);
|
||
state[4 * i + 3] ^= (uint8_t)(rk);
|
||
}
|
||
}
|
||
// Final round (no MixColumns).
|
||
// SubBytes.
|
||
for (int i = 0; i < 16; i++)
|
||
state[i] = sbox[state[i]];
|
||
// ShiftRows.
|
||
uint8_t tmp[16];
|
||
tmp[0] = state[0];
|
||
tmp[1] = state[5];
|
||
tmp[2] = state[10];
|
||
tmp[3] = state[15];
|
||
|
||
tmp[4] = state[4];
|
||
tmp[5] = state[9];
|
||
tmp[6] = state[14];
|
||
tmp[7] = state[3];
|
||
|
||
tmp[8] = state[8];
|
||
tmp[9] = state[13];
|
||
tmp[10] = state[2];
|
||
tmp[11] = state[7];
|
||
|
||
tmp[12] = state[12];
|
||
tmp[13] = state[1];
|
||
tmp[14] = state[6];
|
||
tmp[15] = state[11];
|
||
// Final AddRoundKey.
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
uint32_t rk = ctx->rk[14 * 4 + i];
|
||
tmp[4 * i + 0] ^= (uint8_t)(rk >> 24);
|
||
tmp[4 * i + 1] ^= (uint8_t)(rk >> 16);
|
||
tmp[4 * i + 2] ^= (uint8_t)(rk >> 8);
|
||
tmp[4 * i + 3] ^= (uint8_t)(rk);
|
||
}
|
||
// Write result.
|
||
for (int i = 0; i < 16; i++)
|
||
out[i] = tmp[i];
|
||
}
|
||
|
||
// kcipher2 state structure.
|
||
typedef struct
|
||
{
|
||
unsigned int A[5];
|
||
unsigned int B[11];
|
||
unsigned int L1, R1, L2, R2;
|
||
} kcipher2_state;
|
||
|
||
//---------------------------------------------------------------------------
|
||
// Device functions for kcipher2
|
||
//---------------------------------------------------------------------------
|
||
|
||
__device__ __forceinline__ unsigned int nlf(unsigned int a, unsigned int b, unsigned int c, unsigned int d)
|
||
{
|
||
return (a + b) ^ c ^ d;
|
||
}
|
||
|
||
__device__ __forceinline__ unsigned char gf_multiply_by_2(unsigned char t)
|
||
{
|
||
// return gf2_table[t];
|
||
unsigned int lq = t << 1;
|
||
if (lq & 0x100)
|
||
lq ^= 0x011B;
|
||
return ((unsigned char)lq) ^ 0xFF;
|
||
}
|
||
|
||
__device__ __forceinline__ unsigned char gf_multiply_by_3(unsigned char t)
|
||
{
|
||
// return gf3_table[t];
|
||
unsigned int lq = (t << 1) ^ t;
|
||
if (lq & 0x100)
|
||
lq ^= 0x011B;
|
||
return ((unsigned char)lq) ^ 0xFF;
|
||
}
|
||
|
||
__device__ unsigned int sub_k2(unsigned int in)
|
||
{
|
||
unsigned char w0 = in & 0xFF;
|
||
unsigned char w1 = (in >> 8) & 0xFF;
|
||
unsigned char w2 = (in >> 16) & 0xFF;
|
||
unsigned char w3 = (in >> 24) & 0xFF;
|
||
|
||
unsigned char t0 = d_s_box[w0];
|
||
unsigned char t1 = d_s_box[w1];
|
||
unsigned char t2 = d_s_box[w2];
|
||
unsigned char t3 = d_s_box[w3];
|
||
|
||
unsigned char q0 = gf_multiply_by_2(t0) ^ gf_multiply_by_3(t1) ^ t2 ^ t3;
|
||
unsigned char q1 = t0 ^ gf_multiply_by_2(t1) ^ gf_multiply_by_3(t2) ^ t3;
|
||
unsigned char q2 = t0 ^ t1 ^ gf_multiply_by_2(t2) ^ gf_multiply_by_3(t3);
|
||
unsigned char q3 = gf_multiply_by_3(t0) ^ t1 ^ t2 ^ gf_multiply_by_2(t3);
|
||
|
||
return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0;
|
||
}
|
||
|
||
__device__ void setup_state_values(const unsigned int *key, const unsigned int *iv, kcipher2_state *state)
|
||
{
|
||
unsigned int IK[12];
|
||
IK[0] = key[0];
|
||
IK[1] = key[1];
|
||
IK[2] = key[2];
|
||
IK[3] = key[3];
|
||
|
||
IK[4] = IK[0] ^ sub_k2((IK[3] << 8) ^ (IK[3] >> 24)) ^ 0x01000000;
|
||
IK[5] = IK[1] ^ IK[4];
|
||
IK[6] = IK[2] ^ IK[5];
|
||
IK[7] = IK[3] ^ IK[6];
|
||
IK[8] = IK[4] ^ sub_k2((IK[7] << 8) ^ (IK[7] >> 24)) ^ 0x02000000;
|
||
|
||
IK[9] = IK[5] ^ IK[8];
|
||
IK[10] = IK[6] ^ IK[9];
|
||
IK[11] = IK[7] ^ IK[10];
|
||
|
||
state->A[0] = IK[4];
|
||
state->A[1] = IK[3];
|
||
state->A[2] = IK[2];
|
||
state->A[3] = IK[1];
|
||
state->A[4] = IK[0];
|
||
|
||
state->B[0] = IK[10];
|
||
state->B[1] = IK[11];
|
||
state->B[2] = iv[0];
|
||
state->B[3] = iv[1];
|
||
state->B[4] = IK[8];
|
||
state->B[5] = IK[9];
|
||
state->B[6] = iv[2];
|
||
state->B[7] = iv[3];
|
||
state->B[8] = IK[7];
|
||
state->B[9] = IK[5];
|
||
state->B[10] = IK[6];
|
||
|
||
state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000;
|
||
}
|
||
|
||
__device__ void next_INIT(kcipher2_state *state)
|
||
{
|
||
unsigned int temp2;
|
||
unsigned int nL1 = sub_k2(state->R2 + state->B[4]);
|
||
unsigned int nR1 = sub_k2(state->L2 + state->B[9]);
|
||
unsigned int nL2 = sub_k2(state->L1);
|
||
unsigned int nR2 = sub_k2(state->R1);
|
||
|
||
unsigned int nA[5];
|
||
nA[0] = state->A[1];
|
||
nA[1] = state->A[2];
|
||
nA[2] = state->A[3];
|
||
nA[3] = state->A[4];
|
||
|
||
unsigned int nB[11];
|
||
nB[0] = state->B[1];
|
||
nB[1] = state->B[2];
|
||
nB[2] = state->B[3];
|
||
nB[3] = state->B[4];
|
||
nB[4] = state->B[5];
|
||
nB[5] = state->B[6];
|
||
nB[6] = state->B[7];
|
||
nB[7] = state->B[8];
|
||
nB[8] = state->B[9];
|
||
nB[9] = state->B[10];
|
||
|
||
unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF];
|
||
nA[4] = temp1 ^ state->A[3];
|
||
nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]);
|
||
|
||
if (state->A[2] & 0x40000000)
|
||
temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
|
||
else
|
||
temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
|
||
|
||
// branchless version (not faster)
|
||
// unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000));
|
||
// unsigned int candidate1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
|
||
// unsigned int candidate2 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
|
||
// temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask);
|
||
|
||
if (state->A[2] & 0x80000000)
|
||
temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
|
||
else
|
||
temp2 = state->B[8];
|
||
|
||
// branchless version (not faster)
|
||
// unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000));
|
||
// unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
|
||
// unsigned int candidate4 = state->B[8];
|
||
// temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2);
|
||
|
||
nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2;
|
||
nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]);
|
||
|
||
state->A[0] = nA[0];
|
||
state->A[1] = nA[1];
|
||
state->A[2] = nA[2];
|
||
state->A[3] = nA[3];
|
||
state->A[4] = nA[4];
|
||
|
||
state->B[0] = nB[0];
|
||
state->B[1] = nB[1];
|
||
state->B[2] = nB[2];
|
||
state->B[3] = nB[3];
|
||
state->B[4] = nB[4];
|
||
state->B[5] = nB[5];
|
||
state->B[6] = nB[6];
|
||
state->B[7] = nB[7];
|
||
state->B[8] = nB[8];
|
||
state->B[9] = nB[9];
|
||
state->B[10] = nB[10];
|
||
|
||
state->L1 = nL1;
|
||
state->R1 = nR1;
|
||
state->L2 = nL2;
|
||
state->R2 = nR2;
|
||
}
|
||
|
||
__device__ unsigned int sub_k2_shared(unsigned int in, const unsigned char *gf2_table,
|
||
const unsigned char *gf3_table,
|
||
const unsigned char *d_s_box)
|
||
{
|
||
unsigned char w0 = in & 0xFF;
|
||
unsigned char w1 = (in >> 8) & 0xFF;
|
||
unsigned char w2 = (in >> 16) & 0xFF;
|
||
unsigned char w3 = (in >> 24) & 0xFF;
|
||
|
||
unsigned char t0 = d_s_box[w0];
|
||
unsigned char t1 = d_s_box[w1];
|
||
unsigned char t2 = d_s_box[w2];
|
||
unsigned char t3 = d_s_box[w3];
|
||
|
||
unsigned char q0 = gf2_table[t0] ^ gf3_table[t1] ^ t2 ^ t3;
|
||
unsigned char q1 = t0 ^ gf2_table[t1] ^ gf3_table[t2] ^ t3;
|
||
unsigned char q2 = t0 ^ t1 ^ gf2_table[t2] ^ gf3_table[t3];
|
||
unsigned char q3 = gf3_table[t0] ^ t1 ^ t2 ^ gf2_table[t3];
|
||
|
||
return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0;
|
||
}
|
||
|
||
|
||
__device__ void setup_state_values_shared(const unsigned int *key, const unsigned int *iv, kcipher2_state *state,
|
||
const unsigned char *gf2_table,
|
||
const unsigned char *gf3_table,
|
||
const unsigned char *d_s_box)
|
||
{
|
||
unsigned int IK[12];
|
||
IK[0] = key[0];
|
||
IK[1] = key[1];
|
||
IK[2] = key[2];
|
||
IK[3] = key[3];
|
||
|
||
IK[4] = IK[0] ^ sub_k2_shared((IK[3] << 8) ^ (IK[3] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x01000000;
|
||
IK[5] = IK[1] ^ IK[4];
|
||
IK[6] = IK[2] ^ IK[5];
|
||
IK[7] = IK[3] ^ IK[6];
|
||
IK[8] = IK[4] ^ sub_k2_shared((IK[7] << 8) ^ (IK[7] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x02000000;
|
||
|
||
IK[9] = IK[5] ^ IK[8];
|
||
IK[10] = IK[6] ^ IK[9];
|
||
IK[11] = IK[7] ^ IK[10];
|
||
|
||
state->A[0] = IK[4];
|
||
state->A[1] = IK[3];
|
||
state->A[2] = IK[2];
|
||
state->A[3] = IK[1];
|
||
state->A[4] = IK[0];
|
||
|
||
state->B[0] = IK[10];
|
||
state->B[1] = IK[11];
|
||
state->B[2] = iv[0];
|
||
state->B[3] = iv[1];
|
||
state->B[4] = IK[8];
|
||
state->B[5] = IK[9];
|
||
state->B[6] = iv[2];
|
||
state->B[7] = iv[3];
|
||
state->B[8] = IK[7];
|
||
state->B[9] = IK[5];
|
||
state->B[10] = IK[6];
|
||
|
||
state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000;
|
||
}
|
||
|
||
|
||
|
||
__device__ void next_INIT_shared(kcipher2_state *state,
|
||
const unsigned char *gf2_table,
|
||
const unsigned char *gf3_table,
|
||
const unsigned char *d_s_box,
|
||
const unsigned int *d_amul0,
|
||
const unsigned int *d_amul1,
|
||
const unsigned int *d_amul2,
|
||
const unsigned int *d_amul3)
|
||
{
|
||
unsigned int temp2;
|
||
unsigned int nL1 = sub_k2_shared(state->R2 + state->B[4], gf2_table, gf3_table, d_s_box);
|
||
unsigned int nR1 = sub_k2_shared(state->L2 + state->B[9], gf2_table, gf3_table, d_s_box);
|
||
unsigned int nL2 = sub_k2_shared(state->L1, gf2_table, gf3_table, d_s_box);
|
||
unsigned int nR2 = sub_k2_shared(state->R1, gf2_table, gf3_table, d_s_box);
|
||
|
||
unsigned int nA[5];
|
||
nA[0] = state->A[1];
|
||
nA[1] = state->A[2];
|
||
nA[2] = state->A[3];
|
||
nA[3] = state->A[4];
|
||
|
||
unsigned int nB[11];
|
||
nB[0] = state->B[1];
|
||
nB[1] = state->B[2];
|
||
nB[2] = state->B[3];
|
||
nB[3] = state->B[4];
|
||
nB[4] = state->B[5];
|
||
nB[5] = state->B[6];
|
||
nB[6] = state->B[7];
|
||
nB[7] = state->B[8];
|
||
nB[8] = state->B[9];
|
||
nB[9] = state->B[10];
|
||
|
||
unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF];
|
||
nA[4] = temp1 ^ state->A[3];
|
||
nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]);
|
||
|
||
if (state->A[2] & 0x40000000)
|
||
temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF];
|
||
else
|
||
temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF];
|
||
|
||
// branchless version (not faster)
|
||
// unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000));
|
||
// int b0 = state->B[0] << 8;
|
||
// int b0_24 = (state->B[0] >> 24) & 0xFF;
|
||
// unsigned int candidate1 = (b0) ^ d_amul1[b0_24];
|
||
// unsigned int candidate2 = (b0) ^ d_amul2[b0_24];
|
||
// temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask);
|
||
|
||
if (state->A[2] & 0x80000000)
|
||
temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
|
||
else
|
||
temp2 = state->B[8];
|
||
|
||
// branchless version (not faster)
|
||
// unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000));
|
||
// unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF];
|
||
// unsigned int candidate4 = state->B[8];
|
||
// temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2);
|
||
|
||
nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2;
|
||
nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]);
|
||
|
||
state->A[0] = nA[0];
|
||
state->A[1] = nA[1];
|
||
state->A[2] = nA[2];
|
||
state->A[3] = nA[3];
|
||
state->A[4] = nA[4];
|
||
|
||
state->B[0] = nB[0];
|
||
state->B[1] = nB[1];
|
||
state->B[2] = nB[2];
|
||
state->B[3] = nB[3];
|
||
state->B[4] = nB[4];
|
||
state->B[5] = nB[5];
|
||
state->B[6] = nB[6];
|
||
state->B[7] = nB[7];
|
||
state->B[8] = nB[8];
|
||
state->B[9] = nB[9];
|
||
state->B[10] = nB[10];
|
||
|
||
state->L1 = nL1;
|
||
state->R1 = nR1;
|
||
state->L2 = nL2;
|
||
state->R2 = nR2;
|
||
}
|
||
|
||
|
||
__device__ __forceinline__ unsigned long long bswap64(unsigned long long x)
|
||
{
|
||
uint32_t hi = x >> 32; // Upper 32 bits
|
||
uint32_t lo = x & 0xFFFFFFFF; // Lower 32 bits
|
||
// Swap bytes in each 32-bit part using __byte_perm
|
||
hi = __byte_perm(hi, 0, 0x0123);
|
||
lo = __byte_perm(lo, 0, 0x0123);
|
||
|
||
// Swap the high and low parts and combine them back
|
||
return ((uint64_t)lo << 32) | hi;
|
||
}
|
||
|
||
__device__ unsigned long long kcipher2_encrypt_1_zero_block(const unsigned int *key, const unsigned int *iv)
|
||
{
|
||
kcipher2_state state;
|
||
setup_state_values(key, iv, &state);
|
||
for (unsigned char i = 0; i < 24; i++)
|
||
next_INIT(&state);
|
||
|
||
unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]);
|
||
unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]);
|
||
return (((unsigned long long)zh) << 32) | zl;
|
||
}
|
||
|
||
__device__ unsigned long long kcipher2_encrypt_1_zero_block_shared(const unsigned int *key, const unsigned int *iv,
|
||
const unsigned char *gf2_table,
|
||
const unsigned char *gf3_table,
|
||
const unsigned char *d_s_box,
|
||
const unsigned int *d_amul0,
|
||
const unsigned int *d_amul1,
|
||
const unsigned int *d_amul2,
|
||
const unsigned int *d_amul3
|
||
)
|
||
{
|
||
kcipher2_state state;
|
||
setup_state_values_shared(key, iv, &state, gf2_table, gf3_table, d_s_box);
|
||
for (unsigned char i = 0; i < 24; i++)
|
||
next_INIT_shared(&state, gf2_table, gf3_table, d_s_box, d_amul0, d_amul1, d_amul2, d_amul3);
|
||
|
||
unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]);
|
||
unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]);
|
||
return (((unsigned long long)zh) << 32) | zl;
|
||
}
|
||
|
||
//---------------------------------------------------------------------------
|
||
// End of file
|
||
//---------------------------------------------------------------------------
|
||
|
||
|
||
|
||
// --- Multihash Kernel ---
|
||
//
|
||
// Each thread computes:
|
||
// digest0 = SHA256(19-byte input)
|
||
// v0 = digest0
|
||
// for i=1..1499:
|
||
// buffer = digest(i-1) || v0 || [i as 4-byte big-endian]
|
||
// digest(i) = SHA256(buffer)
|
||
// aes256_encrypt(&key_ctx, counter, counter);
|
||
// Encrypt the (now updated) counter to produce the final output.
|
||
// aes256_encrypt(&key_ctx, counter, digest);
|
||
|
||
// this kernel only output 16 byte random number
|
||
// but we output 32 byte block, so its like this <16 byte random> <16 byte empty> <16 byte random> <16 byte empty> ...
|
||
// This will make it faster
|
||
|
||
__global__ void multihash_kernel(const uint8_t *input, uint8_t *output, size_t num)
|
||
{
|
||
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= num)
|
||
return;
|
||
|
||
|
||
__shared__ uint32_t sha256_k[64];
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ uint8_t aes_sbox[256];
|
||
__shared__ uint32_t aes_Rcon[7];
|
||
|
||
if (threadIdx.x == 0) {
|
||
memcpy(sha256_k, k, sizeof(sha256_k));
|
||
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
|
||
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
|
||
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
|
||
|
||
// Each input message is 19 bytes.
|
||
const uint8_t *data = input + idx * 19;
|
||
uint8_t digest[SHA256_DIGEST_SIZE];
|
||
uint8_t v0[SHA256_DIGEST_SIZE];
|
||
|
||
// First round: hash the 19-byte input.
|
||
// sha256_hash(data, 19, digest);
|
||
|
||
sha256_hash_19(data, digest, sha256_k);
|
||
|
||
// Save initial digest as v0.
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
{
|
||
v0[i] = digest[i];
|
||
}
|
||
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
|
||
|
||
// Copy v0.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[32 + j] = v0[j];
|
||
}
|
||
|
||
// Write counter (big-endian).
|
||
// buffer[64] = (i >> 24) & 0xff;
|
||
// buffer[65] = (i >> 16) & 0xff;
|
||
// max is 1500
|
||
buffer[64] = 0;
|
||
buffer[65] = 0;
|
||
|
||
// 1,500 rounds total (first one is done above).
|
||
for (int i = 1; i < 1500; i++)
|
||
{
|
||
// Copy current digest
|
||
#pragma unroll
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[j] = digest[j];
|
||
}
|
||
|
||
buffer[66] = (i >> 8) & 0xff;
|
||
buffer[67] = i & 0xff;
|
||
|
||
// sha256_hash(buffer, 68, digest);
|
||
sha256_hash_68(buffer, digest, sha256_k);
|
||
}
|
||
#define AES_BLOCK_SIZE 16
|
||
// -------- AES post-processing on final digest --------
|
||
// Use final digest as the AES key.
|
||
aes256_ctx key_ctx;
|
||
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
|
||
uint8_t counter[16] = {0}; // clear counter
|
||
|
||
// Encrypt the counter block.
|
||
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
|
||
// Encrypt the (now updated) counter to produce the final output.
|
||
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
|
||
|
||
// Write final (AES‑encrypted) digest to output.
|
||
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
|
||
|
||
uint32_t * out_u32 = (uint32_t *)out;
|
||
uint32_t * digest_u32 = (uint32_t *)digest;
|
||
|
||
//byte swap32 bit every 4 byte
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++)
|
||
{
|
||
out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123);
|
||
}
|
||
|
||
}
|
||
|
||
|
||
__global__ void multihash_kernel_in_memory(uint8_t *output, uint64_t t_start, size_t num)
|
||
{
|
||
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= num)
|
||
return;
|
||
|
||
|
||
__shared__ uint32_t sha256_k[64];
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ uint8_t aes_sbox[256];
|
||
__shared__ uint32_t aes_Rcon[7];
|
||
|
||
if (threadIdx.x == 0) {
|
||
memcpy(sha256_k, k, sizeof(sha256_k));
|
||
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
|
||
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
|
||
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
uint64_t t = t_start + idx;
|
||
|
||
// Each input message is 19 bytes.
|
||
uint8_t data[19];
|
||
for (int i =0; i < 19; i++) {
|
||
data[18-i]='0' + (t%10);
|
||
t /= 10;
|
||
}
|
||
|
||
uint8_t digest[SHA256_DIGEST_SIZE];
|
||
// uint8_t v0[SHA256_DIGEST_SIZE];
|
||
|
||
// First round: hash the 19-byte input.
|
||
// sha256_hash(data, 19, digest);
|
||
|
||
sha256_hash_19(data, digest, sha256_k);
|
||
|
||
// Save initial digest as v0.
|
||
// for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
// {
|
||
// v0[i] = digest[i];
|
||
// }
|
||
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
|
||
|
||
// Copy v0.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[32 + j] = digest[j];
|
||
}
|
||
|
||
// Write counter (big-endian).
|
||
// buffer[64] = (i >> 24) & 0xff;
|
||
// buffer[65] = (i >> 16) & 0xff;
|
||
// max is 1500
|
||
buffer[64] = 0;
|
||
buffer[65] = 0;
|
||
|
||
// 1,500 rounds total (first one is done above).
|
||
for (int i = 1; i < 1500; i++)
|
||
{
|
||
// Copy current digest.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[j] = digest[j];
|
||
}
|
||
|
||
buffer[66] = (i >> 8) & 0xff;
|
||
buffer[67] = i & 0xff;
|
||
|
||
// sha256_hash(buffer, 68, digest);
|
||
sha256_hash_68(buffer, digest, sha256_k);
|
||
}
|
||
#define AES_BLOCK_SIZE 16
|
||
// -------- AES post-processing on final digest --------
|
||
// Use final digest as the AES key.
|
||
aes256_ctx key_ctx;
|
||
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
|
||
uint8_t counter[16] = {0}; // clear counter
|
||
|
||
// Encrypt the counter block.
|
||
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
|
||
// Encrypt the (now updated) counter to produce the final output.
|
||
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
|
||
|
||
// Write final (AES‑encrypted) digest to output.
|
||
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
|
||
|
||
uint32_t * out_u32 = (uint32_t *)out;
|
||
uint32_t * digest_u32 = (uint32_t *)digest;
|
||
|
||
//byte swap32 bit every 4 byte
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++)
|
||
{
|
||
out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123);
|
||
}
|
||
|
||
}
|
||
|
||
//this is for chacha8 key, so no need to swap, and we nseed the 32 byte block
|
||
__global__ void multihash_kernel_in_memory_no_swap(uint8_t *output, uint64_t t_start, size_t num)
|
||
{
|
||
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= num)
|
||
return;
|
||
|
||
|
||
__shared__ uint32_t sha256_k[64];
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ uint8_t aes_sbox[256];
|
||
__shared__ uint32_t aes_Rcon[7];
|
||
|
||
if (threadIdx.x == 0) {
|
||
memcpy(sha256_k, k, sizeof(sha256_k));
|
||
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
|
||
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
|
||
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
uint64_t t = t_start + idx;
|
||
|
||
// Each input message is 19 bytes.
|
||
uint8_t data[19];
|
||
for (int i =0; i < 19; i++) {
|
||
data[18-i]='0' + (t%10);
|
||
t /= 10;
|
||
}
|
||
|
||
uint8_t digest[SHA256_DIGEST_SIZE];
|
||
// uint8_t v0[SHA256_DIGEST_SIZE];
|
||
|
||
// First round: hash the 19-byte input.
|
||
// sha256_hash(data, 19, digest);
|
||
|
||
sha256_hash_19(data, digest, sha256_k);
|
||
|
||
// Save initial digest as v0.
|
||
// for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
// {
|
||
// v0[i] = digest[i];
|
||
// }
|
||
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
|
||
|
||
// Copy v0.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[32 + j] = digest[j];
|
||
}
|
||
|
||
// Write counter (big-endian).
|
||
// buffer[64] = (i >> 24) & 0xff;
|
||
// buffer[65] = (i >> 16) & 0xff;
|
||
// max is 1500
|
||
buffer[64] = 0;
|
||
buffer[65] = 0;
|
||
|
||
// 1,500 rounds total (first one is done above).
|
||
for (int i = 1; i < 1500; i++)
|
||
{
|
||
// Copy current digest.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[j] = digest[j];
|
||
}
|
||
|
||
buffer[66] = (i >> 8) & 0xff;
|
||
buffer[67] = i & 0xff;
|
||
|
||
// sha256_hash(buffer, 68, digest);
|
||
sha256_hash_68(buffer, digest, sha256_k);
|
||
}
|
||
#define AES_BLOCK_SIZE 16
|
||
// -------- AES post-processing on final digest --------
|
||
// Use final digest as the AES key.
|
||
aes256_ctx key_ctx;
|
||
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
|
||
uint8_t counter[16] = {0}; // clear counter
|
||
|
||
// Encrypt the counter block.
|
||
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
|
||
// Encrypt the (now updated) counter to produce the final output.
|
||
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
|
||
|
||
// Write final (AES‑encrypted) digest to output.
|
||
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
|
||
|
||
#pragma unroll
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
{
|
||
out[i] = digest[i];
|
||
}
|
||
}
|
||
|
||
|
||
|
||
//chacha8 key doesn't need swapping
|
||
__global__ void multihash_kernel_noswap(const uint8_t *input, uint8_t *output, size_t num)
|
||
{
|
||
ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= num)
|
||
return;
|
||
|
||
|
||
__shared__ uint32_t sha256_k[64];
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ uint8_t aes_sbox[256];
|
||
__shared__ uint32_t aes_Rcon[7];
|
||
|
||
if (threadIdx.x == 0) {
|
||
memcpy(sha256_k, k, sizeof(sha256_k));
|
||
memcpy(aes_sbox, sbox, sizeof(aes_sbox));
|
||
memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon));
|
||
memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared));
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
|
||
|
||
// Each input message is 19 bytes.
|
||
const uint8_t *data = input + idx * 19;
|
||
uint8_t digest[SHA256_DIGEST_SIZE];
|
||
uint8_t v0[SHA256_DIGEST_SIZE];
|
||
|
||
// First round: hash the 19-byte input.
|
||
// sha256_hash(data, 19, digest);
|
||
|
||
sha256_hash_19(data, digest, sha256_k);
|
||
|
||
// Save initial digest as v0.
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
{
|
||
v0[i] = digest[i];
|
||
}
|
||
uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter)
|
||
|
||
// Copy v0.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[32 + j] = v0[j];
|
||
}
|
||
|
||
// Write counter (big-endian).
|
||
// buffer[64] = (i >> 24) & 0xff;
|
||
// buffer[65] = (i >> 16) & 0xff;
|
||
// max is 1500
|
||
buffer[64] = 0;
|
||
buffer[65] = 0;
|
||
|
||
// 1,500 rounds total (first one is done above).
|
||
for (int i = 1; i < 1500; i++)
|
||
{
|
||
// Copy current digest.
|
||
for (int j = 0; j < SHA256_DIGEST_SIZE; j++)
|
||
{
|
||
buffer[j] = digest[j];
|
||
}
|
||
|
||
buffer[66] = (i >> 8) & 0xff;
|
||
buffer[67] = i & 0xff;
|
||
|
||
// sha256_hash(buffer, 68, digest);
|
||
sha256_hash_68(buffer, digest, sha256_k);
|
||
}
|
||
#define AES_BLOCK_SIZE 16
|
||
// -------- AES post-processing on final digest --------
|
||
// Use final digest as the AES key.
|
||
aes256_ctx key_ctx;
|
||
aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon);
|
||
uint8_t counter[16] = {0}; // clear counter
|
||
|
||
// Encrypt the counter block.
|
||
aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared);
|
||
// Encrypt the (now updated) counter to produce the final output.
|
||
aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared);
|
||
|
||
// Write final (AES‑encrypted) digest to output.
|
||
uint8_t *out = output + idx * SHA256_DIGEST_SIZE;
|
||
|
||
#pragma unroll
|
||
for (int i = 0; i < SHA256_DIGEST_SIZE; i++)
|
||
{
|
||
out[i] = digest[i];
|
||
}
|
||
}
|
||
|
||
|
||
|
||
uint64_t get_time_in_nanosecond()
|
||
{
|
||
struct timespec time;
|
||
clock_gettime(CLOCK_MONOTONIC, &time);
|
||
return (uint64_t)time.tv_sec * 1000000000 + (uint64_t)time.tv_nsec;
|
||
}
|
||
|
||
#define TEST_ENCRYPT_BLOCK_SIZE 32
|
||
|
||
// Test kernel: each thread processes one key/IV pair.
|
||
// this is to test raw encryption speed
|
||
__global__ void test_kcipher2_kernel(const uint8_t *in, uint8_t *out, size_t N)
|
||
{
|
||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= N)
|
||
return;
|
||
|
||
// Each input block: first 16 bytes = Key, next 16 bytes = IV.
|
||
const int in_block_size = 32; // 16-byte key + 16-byte IV
|
||
const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // 128-byte ciphertext
|
||
|
||
// Pointers to the key/IV for this thread.
|
||
const uint8_t *block_in = in + idx * in_block_size;
|
||
|
||
// Load key (16 bytes) and IV (16 bytes) into local arrays.
|
||
unsigned int key_local[4];
|
||
unsigned int iv_local[4];
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
int base_key = i * 4;
|
||
key_local[i] = ((unsigned int)block_in[base_key] << 24) |
|
||
((unsigned int)block_in[base_key + 1] << 16) |
|
||
((unsigned int)block_in[base_key + 2] << 8) |
|
||
((unsigned int)block_in[base_key + 3]);
|
||
}
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
int base_iv = 16 + i * 4;
|
||
iv_local[i] = ((unsigned int)block_in[base_iv] << 24) |
|
||
((unsigned int)block_in[base_iv + 1] << 16) |
|
||
((unsigned int)block_in[base_iv + 2] << 8) |
|
||
((unsigned int)block_in[base_iv + 3]);
|
||
}
|
||
|
||
unsigned char ciphertext[8];
|
||
|
||
long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
|
||
// copy res to ciphertext
|
||
for (int i = 0; i < 8; i++)
|
||
{
|
||
ciphertext[i] = (res >> (56 - i * 8)) & 0xFF;
|
||
}
|
||
|
||
// Write the output ciphertext.
|
||
size_t out_offset = idx * out_block_size;
|
||
for (int i = 0; i < 8; i++)
|
||
{
|
||
out[out_offset + i] = ciphertext[i];
|
||
}
|
||
}
|
||
|
||
void fill_input(uint8_t *h_input, uint64_t start, size_t num)
|
||
{
|
||
char buffer[20];
|
||
snprintf(buffer, 20, "%019lu", start);
|
||
for (size_t i = 0; i < num; i++)
|
||
{
|
||
// copy buffer to input
|
||
memcpy(h_input + i * 19, buffer, 19);
|
||
// increment buffer, starting from last index, until it reaches '9'
|
||
for (int j = 18; j >= 0; j--)
|
||
{
|
||
if (buffer[j] == '9')
|
||
{
|
||
buffer[j] = '0';
|
||
}
|
||
else
|
||
{
|
||
buffer[j]++;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
int test_encryption_only(size_t num)
|
||
{
|
||
printf("Count %zu \n", num);
|
||
size_t input_size = num * TEST_ENCRYPT_BLOCK_SIZE;
|
||
|
||
// Query device properties
|
||
cudaDeviceProp prop;
|
||
cudaGetDeviceProperties(&prop, gpuIndex);
|
||
|
||
int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block
|
||
int blockSize = maxThreadsPerBlock; // Dynamically set blockSize
|
||
// int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
size_t output_size = num * TEST_ENCRYPT_BLOCK_SIZE;
|
||
|
||
printf("GPU needed memory: %.2f MB\n", (input_size + output_size) / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
// test the blank/zero encryption
|
||
// memset(h_input, 0, input_size);
|
||
// fill with random
|
||
fill_input(h_input, TEST_TIMESTAMP, num * (32 / 19));
|
||
|
||
uint8_t *h_output_enc = (uint8_t *)malloc(output_size);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output_enc;
|
||
|
||
printf("Allocating device memory...%.2f Mb\n", (input_size + output_size) / 1024.0 / 1024.0);
|
||
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output_enc, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
|
||
return 1;
|
||
}
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
// copy to device
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
test_kcipher2_kernel<<<gridSize, blockSize>>>(d_input, d_output_enc, num);
|
||
cudaDeviceSynchronize();
|
||
|
||
// debug encryption output
|
||
cudaMemcpy(h_output_enc, d_output_enc, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
|
||
printf("Enc Time: %f ms\n", (end - start) / 1000000.0);
|
||
|
||
// print speed
|
||
printf("Enc Speed: %f enc per second\n", num / ((end - start) / 1000000000.0));
|
||
|
||
// print first plaintext
|
||
printf("Plaintext for first input: ");
|
||
for (size_t i = 0; i < 19; i++)
|
||
{
|
||
printf("%c", h_input[i]);
|
||
}
|
||
printf("\n");
|
||
|
||
// For demonstration, print the first output digest in hexadecimal.
|
||
printf("Encrypted for first input: ");
|
||
for (size_t i = 0; i < TEST_ENCRYPT_BLOCK_SIZE; i++)
|
||
{
|
||
printf("%02x", h_output_enc[i]);
|
||
}
|
||
printf("\n");
|
||
// print last
|
||
// print plaintext
|
||
printf("Plaintext for last input: ");
|
||
for (size_t i = 19 * num - 19; i < 19 * num; i++)
|
||
{
|
||
printf("%c", h_input[i]);
|
||
}
|
||
printf("\n");
|
||
printf("Encrypted for last input: offs <%zu>: ", num * TEST_ENCRYPT_BLOCK_SIZE - TEST_ENCRYPT_BLOCK_SIZE);
|
||
for (size_t i = TEST_ENCRYPT_BLOCK_SIZE * num - TEST_ENCRYPT_BLOCK_SIZE; i < TEST_ENCRYPT_BLOCK_SIZE * num; i++)
|
||
{
|
||
printf("%02x", h_output_enc[i]);
|
||
}
|
||
printf("\n");
|
||
|
||
// Cleanup.
|
||
cudaFree(d_input);
|
||
cudaFree(d_output_enc);
|
||
free(h_input);
|
||
free(h_output_enc);
|
||
|
||
return 0;
|
||
}
|
||
|
||
#define DIGEST_SIZE 32
|
||
#define KEY_IV_SIZE 16 // we use first 16 bytes for key/iv
|
||
#define KCIPHER_OUT_SIZE 8
|
||
|
||
// KCIPHER2, single kernel
|
||
|
||
// --- Host Code ---
|
||
|
||
//for testing only, it will save all random number generated from a given second
|
||
|
||
int save_random(uint64_t start_time, const char *filename)
|
||
{
|
||
printf("Saving all seeds for %lu to %s\n", start_time, filename);
|
||
|
||
uint64_t timer_start = get_time_in_nanosecond();
|
||
|
||
//do it for every 10 million
|
||
size_t num = 10*1000*1000;
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
uint8_t *h_output_non_zeroes = (uint8_t *)malloc(num * 16);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output;
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
FILE *f = fopen(filename, "wb");
|
||
if (f == NULL)
|
||
{
|
||
fprintf(stderr, "Error: Cannot open file %s\n", filename);
|
||
return 1;
|
||
}
|
||
|
||
for (size_t i = 0; i < 100; i++) {
|
||
printf("Saving %zu %%\r", i);fflush(stdout);
|
||
fill_input(h_input, start_time + i * num, num);
|
||
//copy to CUDA
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
cudaDeviceSynchronize();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
//copy every 16 bytes
|
||
for (size_t j = 0; j < num; j++)
|
||
{
|
||
memcpy(h_output_non_zeroes + j * 16, h_output + j * DIGEST_SIZE, 16);
|
||
}
|
||
fwrite(h_output_non_zeroes, 16, num, f);
|
||
|
||
}
|
||
fclose(f);
|
||
uint64_t timer_end = get_time_in_nanosecond();
|
||
|
||
printf("DONE saved to %s: total time %.2f second \n", filename, (timer_end - timer_start) / 1000000000.0);
|
||
|
||
return 0;
|
||
}
|
||
|
||
int test_generate_random_only_in_gpu(size_t num)
|
||
{
|
||
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
|
||
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
uint8_t *d_output;
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
uint64_t start = get_time_in_nanosecond();
|
||
multihash_kernel_in_memory<<<gridSize, blockSize>>>(d_output, t_start, num);
|
||
cudaDeviceSynchronize();
|
||
uint64_t end = get_time_in_nanosecond();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
// print first and last
|
||
printf("First random ");
|
||
for (int i = 0; i < DIGEST_SIZE; i++)
|
||
{
|
||
printf("%02x", h_output[i]);
|
||
}
|
||
printf("\n");
|
||
printf("Last random ");
|
||
for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++)
|
||
{
|
||
printf("%02x", h_output[i]);
|
||
}
|
||
printf("\n");
|
||
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
printf("Speed: %f hashes per second\n", num / ((end - start) / 1000000000.0));
|
||
|
||
|
||
return 0;
|
||
}
|
||
|
||
int test_generate_random_only(size_t num)
|
||
{
|
||
printf("Test generate random only: %lu\n", num);
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate host memory.
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
|
||
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
uint64_t start_fill = get_time_in_nanosecond();
|
||
|
||
fill_input(h_input, t_start, num);
|
||
uint64_t end_fill = get_time_in_nanosecond();
|
||
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output;
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting timestamp -> random calculation...\n");
|
||
fflush(stdout);
|
||
|
||
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
|
||
// print first and last
|
||
printf("First random ");
|
||
for (int i = 0; i < DIGEST_SIZE; i++)
|
||
{
|
||
printf("%02x", h_output[i]);
|
||
}
|
||
printf("\n");
|
||
printf("Last random ");
|
||
for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++)
|
||
{
|
||
printf("%02x", h_output[i]);
|
||
}
|
||
printf("\n");
|
||
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
// Cleanup.
|
||
cudaFree(d_input);
|
||
cudaFree(d_output);
|
||
free(h_input);
|
||
free(h_output);
|
||
return 0;
|
||
}
|
||
|
||
|
||
#define U8TO32_LITTLE(p) (*(const uint32_t *)(p))
|
||
|
||
typedef struct {
|
||
uint32_t a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p;
|
||
} BLOCK;
|
||
|
||
|
||
// Device helper: rotate left 32-bit value
|
||
__device__ __forceinline__ uint32_t _rotl(uint32_t x, int n) {
|
||
return (x << n) | (x >> (32 - n));
|
||
}
|
||
|
||
|
||
#define QROUND(a, b, c, d) \
|
||
d = _rotl(d ^ (a += b), 16); \
|
||
b = _rotl(b ^ (c += d), 12); \
|
||
d = _rotl(d ^ (a += b), 8); \
|
||
b = _rotl(b ^ (c += d), 7)
|
||
#define FROUND \
|
||
QROUND(x.d, x.h, x.l, x.p); \
|
||
QROUND(x.c, x.g, x.k, x.o); \
|
||
QROUND(x.b, x.f, x.j, x.n); \
|
||
QROUND(x.a, x.e, x.i, x.m); \
|
||
QROUND(x.a, x.f, x.k, x.p); \
|
||
QROUND(x.b, x.g, x.l, x.m); \
|
||
QROUND(x.c, x.h, x.i, x.n); \
|
||
QROUND(x.d, x.e, x.j, x.o)
|
||
#define FFINAL \
|
||
x.a += j.a; \
|
||
x.b += j.b; \
|
||
x.c += j.c; \
|
||
x.d += j.d; \
|
||
x.e += j.e; \
|
||
x.f += j.f; \
|
||
x.g += j.g; \
|
||
x.h += j.h; \
|
||
x.i += j.i; \
|
||
x.j += j.j; \
|
||
x.k += j.k; \
|
||
x.l += j.l; \
|
||
x.m += j.m; \
|
||
x.n += j.n; \
|
||
x.o += j.o; \
|
||
x.p += j.p
|
||
|
||
//in contains key/nonce
|
||
//offset is the match offset
|
||
//out_flag is the flag: ts << 32 | offset
|
||
__global__ void chacha8_encrypt_and_match(const uint8_t *in, size_t offset,
|
||
unsigned long long *out_flag,
|
||
const unsigned long long value,
|
||
int N)
|
||
{
|
||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
||
if (idx >= N)
|
||
{
|
||
return;
|
||
}
|
||
|
||
size_t idx_offs = idx + offset;
|
||
|
||
if (idx_offs >= N)
|
||
{
|
||
return;
|
||
}
|
||
|
||
const int in_block_size = 32;
|
||
|
||
// Pointers to the key/IV for this thread.
|
||
const uint8_t *block_in = in + idx * in_block_size;
|
||
const uint8_t *block_offset_in = in + idx_offs * in_block_size;
|
||
|
||
const uint8_t *k = block_in;
|
||
const uint8_t *iv = block_offset_in;
|
||
const char constants[16] = {'e', 'x', 'p', 'a', 'n', 'd', ' ', '1', '6', '-', 'b', 'y', 't', 'e', ' ', 'k'};
|
||
|
||
|
||
uint32_t state[16];
|
||
state[4] = U8TO32_LITTLE(k + 0);
|
||
state[5] = U8TO32_LITTLE(k + 4);
|
||
state[6] = U8TO32_LITTLE(k + 8);
|
||
state[7] = U8TO32_LITTLE(k + 12);
|
||
state[8] = U8TO32_LITTLE(k + 0);
|
||
state[9] = U8TO32_LITTLE(k + 4);
|
||
state[10] = U8TO32_LITTLE(k + 8);
|
||
state[11] = U8TO32_LITTLE(k + 12);
|
||
state[0] = U8TO32_LITTLE(constants + 0);
|
||
state[1] = U8TO32_LITTLE(constants + 4);
|
||
state[2] = U8TO32_LITTLE(constants + 8);
|
||
state[3] = U8TO32_LITTLE(constants + 12);
|
||
state[12] = 0;
|
||
state[13] = 0;
|
||
state[14] = U8TO32_LITTLE(iv + 0);
|
||
state[15] = U8TO32_LITTLE(iv + 4);
|
||
|
||
BLOCK x;
|
||
BLOCK j;
|
||
|
||
memcpy(&j, state, sizeof(BLOCK)); //j is for final addition
|
||
j.m = 0;
|
||
j.n = 0;
|
||
|
||
memcpy(&x, &j, sizeof(BLOCK)); //FROUND will modify x
|
||
|
||
FROUND;
|
||
FROUND;
|
||
FROUND;
|
||
FROUND;
|
||
FFINAL;
|
||
|
||
uint64_t *result = (uint64_t *)&x;
|
||
if (*result == value) {
|
||
unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx;
|
||
*out_flag = encoded_offset_and_index;
|
||
}
|
||
|
||
}
|
||
|
||
__global__ void encrypt_and_search_offset(const uint8_t *in, size_t offset,
|
||
unsigned long long *out_flag,
|
||
const unsigned long long *masks,
|
||
const unsigned long long *values,
|
||
int numComb,
|
||
int N)
|
||
{
|
||
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
|
||
if (idx >= N)
|
||
{
|
||
return;
|
||
}
|
||
|
||
size_t idx_offs = idx + offset;
|
||
|
||
if (idx_offs >= N)
|
||
{
|
||
return;
|
||
}
|
||
|
||
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ unsigned char gf3_table_shared[256];
|
||
__shared__ unsigned char d_s_box_shared[256];
|
||
__shared__ unsigned int d_amul0_shared[256];
|
||
__shared__ unsigned int d_amul1_shared[256];
|
||
__shared__ unsigned int d_amul2_shared[256];
|
||
__shared__ unsigned int d_amul3_shared[256];
|
||
__shared__ unsigned long long shared_masks[MAX_MATCHES];
|
||
__shared__ unsigned long long shared_values[MAX_MATCHES];
|
||
|
||
if (threadIdx.x == 0) {
|
||
//copy gf2_table from const
|
||
memcpy(gf2_table_shared, gf2_table, 256);
|
||
memcpy(gf3_table_shared, gf3_table, 256);
|
||
memcpy(d_s_box_shared, d_s_box, 256);
|
||
memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int));
|
||
memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int));
|
||
memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int));
|
||
memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int));
|
||
for (int i = 0; i < numComb; i++)
|
||
{
|
||
shared_masks[i] = masks[i];
|
||
shared_values[i] = values[i];
|
||
}
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
|
||
|
||
const int in_block_size = 32;
|
||
|
||
// Pointers to the key/IV for this thread.
|
||
const uint8_t *block_in = in + idx * in_block_size;
|
||
const uint8_t *block_offset_in = in + idx_offs * in_block_size;
|
||
|
||
// Load key (16 bytes) and IV (16 bytes) into local arrays.
|
||
unsigned int key_local[4];
|
||
unsigned int iv_local[4];
|
||
const uint8_t *key_in = block_in;
|
||
|
||
|
||
const uint8_t *iv_in = block_offset_in;
|
||
|
||
memcpy(key_local, key_in, 16);
|
||
|
||
|
||
memcpy(iv_local, iv_in, 16);
|
||
|
||
//long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
|
||
long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared,
|
||
d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared);
|
||
|
||
|
||
// compare with matches
|
||
unsigned long long in_val = bswap64(res);
|
||
|
||
unsigned int flag = 0;
|
||
|
||
for (int i = 0; i < numComb; i++) {
|
||
// diff is 0 if the masked input equals the expected value
|
||
unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i];
|
||
// Compute a branch-free match: returns nonzero if diff is zero.
|
||
flag |= (1 - ((diff | -diff) >> 63));
|
||
}
|
||
if (flag) {
|
||
//we found it, now find out exactly which one did we find
|
||
for (int i = 0; i < numComb; i++) {
|
||
if ((in_val & shared_masks[i]) == shared_values[i]) {
|
||
//idx: max 1 billion, use 30 bits
|
||
//offset: max 64 million, use 26
|
||
//numComb: max 128, use 7
|
||
unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1);
|
||
|
||
*out_flag = encoded_offset_and_index;
|
||
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
}
|
||
|
||
void decode_offset_and_index(unsigned long long encoded_offset_and_index,
|
||
size_t *offset, size_t *index, size_t *matchPos)
|
||
{
|
||
//idx: max 1 billion, use 30 bits
|
||
//offset: max 64 million, use 26
|
||
//matchPos: max 128, use 7 bits
|
||
//final 1 bit to ensure we have true value
|
||
//how it was encoded:
|
||
//unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1);
|
||
//decode:
|
||
*index = (encoded_offset_and_index >> 34) & 0x3fffffff;
|
||
*offset = (encoded_offset_and_index >> 8) & 0x3ffffff;
|
||
*matchPos = (encoded_offset_and_index >> 1) & 0x7f;
|
||
}
|
||
|
||
__global__ void encrypt_and_search(const uint8_t *in, uint8_t *out, uint32_t k1, uint32_t k2, uint32_t k3, uint32_t k4,
|
||
int *out_flag,
|
||
const unsigned long long *masks,
|
||
const unsigned long long *values,
|
||
int numComb,
|
||
int N)
|
||
{
|
||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||
if (idx >= N)
|
||
return;
|
||
|
||
// for (int i =0; i < 16; i++) {
|
||
// printf("%02x", in[i]);
|
||
// }
|
||
// printf("\n");
|
||
|
||
// Each input block: first 16 bytes = Key, next 16 bytes = IV.
|
||
const int in_block_size = 32; // 16-byte IV + zeroes
|
||
const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // block size
|
||
|
||
// Pointers to the key/IV for this thread.
|
||
const uint8_t *block_in = in + idx * in_block_size;
|
||
|
||
// Load key (16 bytes) and IV (16 bytes) into local arrays.
|
||
__shared__ unsigned int key_local[4];
|
||
|
||
__shared__ unsigned char gf2_table_shared[256];
|
||
__shared__ unsigned char gf3_table_shared[256];
|
||
__shared__ unsigned char d_s_box_shared[256];
|
||
__shared__ unsigned int d_amul0_shared[256];
|
||
__shared__ unsigned int d_amul1_shared[256];
|
||
__shared__ unsigned int d_amul2_shared[256];
|
||
__shared__ unsigned int d_amul3_shared[256];
|
||
__shared__ unsigned long long shared_masks[MAX_MATCHES];
|
||
__shared__ unsigned long long shared_values[MAX_MATCHES];
|
||
|
||
unsigned int iv_local[4];
|
||
if (threadIdx.x == 0) {
|
||
key_local[0] = k1;
|
||
key_local[1] = k2;
|
||
key_local[2] = k3;
|
||
key_local[3] = k4;
|
||
//copy gf2_table from const
|
||
memcpy(gf2_table_shared, gf2_table, 256);
|
||
memcpy(gf3_table_shared, gf3_table, 256);
|
||
memcpy(d_s_box_shared, d_s_box, 256);
|
||
memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int));
|
||
memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int));
|
||
memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int));
|
||
memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int));
|
||
for (int i = 0; i < numComb; i++)
|
||
{
|
||
shared_masks[i] = masks[i];
|
||
shared_values[i] = values[i];
|
||
}
|
||
}
|
||
__syncthreads(); // Make sure the data is loaded before use
|
||
|
||
const uint8_t *iv_in = block_in;
|
||
|
||
|
||
memcpy(iv_local, iv_in,16);
|
||
|
||
//long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local);
|
||
long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared,
|
||
d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared);
|
||
|
||
// compare with matches
|
||
unsigned long long in_val = bswap64(res);
|
||
unsigned int flag = 0;
|
||
for (int i = 0; i < numComb; i++)
|
||
{
|
||
if ((in_val & masks[i]) == values[i])
|
||
{
|
||
*out_flag = 1;
|
||
flag = 1;
|
||
break;
|
||
}
|
||
}
|
||
|
||
|
||
#if 1
|
||
#ifdef NUM_MATCHES
|
||
#pragma unroll
|
||
for (int i = 0; i < NUM_MATCHES; i++) {
|
||
#else
|
||
for (int i = 0; i < numComb; i++) {
|
||
#endif
|
||
|
||
// diff is 0 if the masked input equals the expected value
|
||
unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i];
|
||
// Compute a branch-free match: returns nonzero if diff is zero.
|
||
flag |= (1 - ((diff | -diff) >> 63));
|
||
}
|
||
if (flag)
|
||
*out_flag = flag;
|
||
|
||
#endif
|
||
|
||
if (flag)
|
||
{
|
||
// Write the output ciphertext.
|
||
int out_offset = idx * out_block_size;
|
||
#pragma unroll
|
||
for (int i = 0; i < 8; i++) // only copy changed
|
||
{
|
||
out[out_offset + i] = (res >> (56 - i * 8)) & 0xFF;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
|
||
int do_bruteforce_offset(const char *filename)
|
||
{
|
||
printf("Bruteforce Offset\n");
|
||
json config;
|
||
|
||
std::string json_file = filename;
|
||
|
||
std::string checkpoint = json_file + ".checkpoint.json";
|
||
|
||
std::ifstream f(filename);
|
||
if (!f.is_open())
|
||
{
|
||
fprintf(stderr, "Error: Could not open config file %s\n", filename);
|
||
return 1;
|
||
}
|
||
|
||
try
|
||
{
|
||
f >> config;
|
||
}
|
||
catch (json::parse_error &e)
|
||
{
|
||
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
size_t *offsets = 0;
|
||
size_t offset_count = 0;
|
||
//open offset.txt
|
||
FILE *f_offset = fopen("offset.txt", "r");
|
||
if (f_offset == NULL)
|
||
{
|
||
fprintf(stderr, "Error: Could not open offset file\n");
|
||
return 1;
|
||
}
|
||
//read line by line, convert to integer
|
||
char line[256];
|
||
while (fgets(line, sizeof(line), f_offset))
|
||
{
|
||
offsets = (size_t *)realloc(offsets, (offset_count + 1) * sizeof(size_t));
|
||
offsets[offset_count] = strtoull(line, NULL, 10);
|
||
offset_count++;
|
||
}
|
||
fclose(f_offset);
|
||
printf("Offset count: %zu\n", offset_count);
|
||
//print first and last offset
|
||
printf("First offset: %zu\n", offsets[0]);
|
||
printf("Last offset: %zu\n", offsets[offset_count - 1]);
|
||
|
||
|
||
size_t num;
|
||
uint64_t t_start;
|
||
//size_t enc_count;
|
||
size_t offset;
|
||
|
||
size_t matches_size = 0;
|
||
uint64_t *matches = 0;
|
||
uint64_t *masks = 0;
|
||
|
||
try
|
||
{
|
||
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
|
||
|
||
num = config["count"].get<size_t>(); // stop at T3 + count ns
|
||
|
||
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
|
||
|
||
//enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
|
||
|
||
// "mathes": [
|
||
// {
|
||
// "plaintext": "0x00000000",
|
||
// "encrypted": "0x00000000",
|
||
// "bitmask": "0xffffffff"
|
||
// },
|
||
// {
|
||
// "plaintext": "0x00000000",
|
||
// "encrypted": "0x00000001",
|
||
// "bitmask": "0xffffffff"
|
||
// }
|
||
// ]
|
||
// parse matches, and put it in array (match = plaintext^encrypted)
|
||
if (config.contains("matches"))
|
||
{
|
||
matches_size = config["matches"].size();
|
||
assert(matches_size < MAX_MATCHES);
|
||
assert(matches_size > 0);
|
||
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
for (size_t i = 0; i < matches_size; i++)
|
||
{
|
||
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
|
||
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
|
||
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
|
||
matches[i] = plaintext ^ encrypted;
|
||
masks[i] = bitmask;
|
||
printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]);
|
||
}
|
||
|
||
}
|
||
}
|
||
catch (json::exception &e)
|
||
{
|
||
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
printf("Configuration:\n");
|
||
printf("num: %zu\n", num);
|
||
printf("t_start: %lu\n", t_start);
|
||
//printf("enc_count: %zu\n", enc_count);
|
||
printf("offset: %zu\n", offset);
|
||
|
||
//printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate host memory.
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
//size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t);
|
||
|
||
//uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size);
|
||
|
||
uint64_t start_fill = get_time_in_nanosecond();
|
||
|
||
fill_input(h_input, t_start, num); // GENERATE TS
|
||
|
||
uint64_t end_fill = get_time_in_nanosecond();
|
||
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output;
|
||
//uint8_t *d_output_enc;
|
||
unsigned long long *d_found;
|
||
unsigned long long *d_matches;
|
||
unsigned long long *d_masks;
|
||
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
// if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
|
||
// return 1;
|
||
// }
|
||
if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
|
||
return 1;
|
||
}
|
||
|
||
unsigned long long zero = 0;
|
||
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
|
||
|
||
// copy matches and masks
|
||
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
// Launch kernel.
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize;
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting timestamp -> random calculation...\n");
|
||
|
||
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
|
||
int limit = num - offset;
|
||
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
|
||
printf("Limit: %d\n", limit);
|
||
start = get_time_in_nanosecond();
|
||
|
||
uint64_t start_enc = get_time_in_nanosecond();
|
||
|
||
// set gridsize based on limit
|
||
gridSize = (limit + blockSize - 1) / blockSize;
|
||
|
||
for (size_t i = 0; i < offset_count; i++)
|
||
{
|
||
printf("Starting encryption and search at offset %zu (%zu)\n", i, offsets[i]);
|
||
encrypt_and_search_offset<<<gridSize, blockSize>>>(d_output, offsets[i], d_found, d_masks, d_matches, matches_size, limit);
|
||
//check for error
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
|
||
if (i > 0 && (i % 1) == 0)
|
||
{
|
||
uint64_t end_enc = get_time_in_nanosecond();
|
||
printf("Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
|
||
start_enc = get_time_in_nanosecond();
|
||
printf("Progress: %zu/%zu (testing: %zu)\n", i, offset_count, offsets[i]);
|
||
cudaDeviceSynchronize();
|
||
|
||
unsigned long long found = 0;
|
||
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
|
||
if (found)
|
||
{
|
||
//unsigned long long encoded_offset_and_index = (unsigned long long)offset << 31 | idx << 1 | 1;
|
||
|
||
size_t t_offset, t_index, match_pos;
|
||
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
|
||
|
||
printf("Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos);
|
||
//write to output.txt
|
||
FILE *f = fopen("output.txt", "a");
|
||
if (f == NULL)
|
||
{
|
||
printf("Error opening file!\n");
|
||
exit(1);
|
||
}
|
||
fprintf(f, "Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos);
|
||
fclose(f);
|
||
// reset for next
|
||
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
|
||
// err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
}
|
||
//write checkpoint
|
||
config["start_timestamp"] = t_start + i;
|
||
config["index"] = i;
|
||
std::ofstream o(checkpoint);
|
||
o << std::setw(4) << config << std::endl;
|
||
}
|
||
}
|
||
|
||
cudaDeviceSynchronize();
|
||
|
||
long long found = 0;
|
||
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
if (found)
|
||
{
|
||
size_t t_offset, t_index, match_pos;
|
||
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
|
||
printf("Found at offset=%zu ts = %zu + %zu filename = %s match indxex = %zu\n", t_offset, t_start, t_index, filename, match_pos);
|
||
|
||
}
|
||
|
||
uint64_t end_enc = get_time_in_nanosecond();
|
||
printf("Enc Offset list Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
|
||
|
||
return 0;
|
||
}
|
||
|
||
int do_bruteforce_new(const char *filename)
|
||
{
|
||
json config;
|
||
|
||
std::string json_file = filename;
|
||
|
||
std::string checkpoint = json_file + ".checkpoint.json";
|
||
|
||
std::ifstream f(filename);
|
||
if (!f.is_open())
|
||
{
|
||
fprintf(stderr, "Error: Could not open config file %s\n", filename);
|
||
return 1;
|
||
}
|
||
|
||
try
|
||
{
|
||
f >> config;
|
||
}
|
||
catch (json::parse_error &e)
|
||
{
|
||
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
size_t num;
|
||
uint64_t t_start;
|
||
size_t enc_count;
|
||
size_t offset;
|
||
|
||
size_t matches_size = 0;
|
||
uint64_t *matches = 0;
|
||
uint64_t *masks = 0;
|
||
char **matches_filename = 0;
|
||
|
||
try
|
||
{
|
||
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
|
||
|
||
num = config["count"].get<size_t>(); // stop at T3 + count ns
|
||
|
||
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
|
||
|
||
enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
|
||
|
||
// parse matches, and put it in array (match = plaintext^encrypted)
|
||
if (config.contains("matches"))
|
||
{
|
||
matches_size = config["matches"].size();
|
||
|
||
assert(matches_size < MAX_MATCHES);
|
||
assert(matches_size > 0);
|
||
|
||
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
matches_filename = (char **)malloc(matches_size * sizeof(char *));
|
||
for (size_t i = 0; i < matches_size; i++)
|
||
{
|
||
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
|
||
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
|
||
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
|
||
std::string filename = config["matches"][i]["filename"].get<std::string>();
|
||
//strdup filename
|
||
matches_filename[i] = strdup(filename.c_str());
|
||
matches[i] = plaintext ^ encrypted;
|
||
masks[i] = bitmask;
|
||
printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]);
|
||
|
||
}
|
||
}
|
||
}
|
||
catch (json::exception &e)
|
||
{
|
||
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
printf("Configuration:\n");
|
||
printf("num: %zu\n", num);
|
||
printf("t_start: %lu\n", t_start);
|
||
printf("t_end %lu\n", t_start + num);
|
||
printf("enc_count: %zu\n", enc_count);
|
||
printf("offset: %zu\n", offset);
|
||
|
||
printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("GPU RAM for random %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_output;
|
||
unsigned long long *d_found;
|
||
unsigned long long *d_matches;
|
||
unsigned long long *d_masks;
|
||
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_found, sizeof(unsigned long long )) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
|
||
return 1;
|
||
}
|
||
|
||
unsigned long long zero = 0;
|
||
cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice);
|
||
|
||
// copy matches and masks
|
||
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
|
||
// Launch kernel.
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize;
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting timestamp -> random calculation...: %zu nanoseconds\n", num);
|
||
|
||
multihash_kernel_in_memory<<<gridSize, blockSize>>>(d_output, t_start, num);
|
||
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
|
||
|
||
int limit = num - enc_count - offset;
|
||
|
||
printf("Total Time to translate timestamp to random %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
|
||
printf("Limit: %d\n", limit);
|
||
start = get_time_in_nanosecond();
|
||
|
||
uint64_t start_enc = get_time_in_nanosecond();
|
||
|
||
// set gridsize based on limit
|
||
gridSize = (limit + blockSize - 1) / blockSize;
|
||
|
||
uint64_t elapsed = 0;
|
||
|
||
for (size_t i = 0; i < enc_count; i++)
|
||
{
|
||
// printf("Starting encryption and search at offset %d\n", i);
|
||
encrypt_and_search_offset<<<gridSize, blockSize>>>(d_output, offset + i, d_found, d_masks, d_matches, matches_size, limit);
|
||
// print error
|
||
// cudaError_t err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
#define SKIP_CHECK 100
|
||
if (i > 0 && (i % SKIP_CHECK) == 0)
|
||
{
|
||
uint64_t end_enc = get_time_in_nanosecond();
|
||
printf("Enc Time (%zu): %f ms\n", i, (end_enc - start_enc) / 1000000.0);
|
||
|
||
elapsed += (end_enc - start_enc);
|
||
printf("%.2f minutes elapsed, ", elapsed / 1000000000.0 / 60.0);
|
||
//remaining
|
||
double remaining = (enc_count - i) * ((end_enc - start_enc)/SKIP_CHECK);
|
||
printf("%.2f minutes remaining (%.2f hours). ", remaining / 1000000000.0 / 60.0, remaining / 1000000000.0 / 3600.0);
|
||
|
||
start_enc = get_time_in_nanosecond();
|
||
printf("Progress: %zu/%zu (testing: %zu)\n", i, enc_count, offset + i);
|
||
|
||
cudaDeviceSynchronize();
|
||
// //print error
|
||
// err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
|
||
unsigned long long found = 0;
|
||
cudaMemcpy(&found, d_found, sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
|
||
// err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
if (found)
|
||
{
|
||
size_t t_offset, t_index, match_pos;
|
||
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
|
||
printf("Found at offset=%zu ts = %zu + %zu config_file = %s match_index %zu, file : %s\n", t_offset, t_start, t_index, filename, match_pos,
|
||
matches_filename[match_pos]);
|
||
|
||
|
||
printf("Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos,
|
||
matches_filename[match_pos]);
|
||
//write to output.txt
|
||
FILE *f = fopen("output.txt", "a");
|
||
if (f == NULL)
|
||
{
|
||
printf("Error opening file!\n");
|
||
exit(1);
|
||
}
|
||
fprintf(f, "Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos,
|
||
matches_filename[match_pos]);
|
||
fclose(f);
|
||
// reset for next
|
||
cudaMemcpy(d_found, &zero, sizeof(unsigned long long ), cudaMemcpyHostToDevice);
|
||
// err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
}
|
||
|
||
//write checkpoint
|
||
config["offset"] = offset + i;
|
||
config["index"] = i;
|
||
std::ofstream o(checkpoint);
|
||
o << std::setw(4) << config << std::endl;
|
||
}
|
||
}
|
||
|
||
cudaDeviceSynchronize();
|
||
// //print error
|
||
// err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
|
||
unsigned long long found = 0;
|
||
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
if (found)
|
||
{
|
||
size_t t_offset, t_index, match_pos;
|
||
decode_offset_and_index(found, &t_offset, &t_index, &match_pos);
|
||
printf("Found at offset=%zu ts = %zu + %zu filename = %s\n", t_offset, t_start, t_index, filename);
|
||
|
||
}
|
||
|
||
uint64_t end_enc = get_time_in_nanosecond();
|
||
printf("Offset Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
|
||
|
||
return 0;
|
||
}
|
||
|
||
int do_bruteforce(const char *filename)
|
||
{
|
||
json config;
|
||
|
||
std::string json_file = filename;
|
||
|
||
std::string checkpoint = json_file + "checkpoint.json";
|
||
|
||
std::ifstream f(filename);
|
||
if (!f.is_open())
|
||
{
|
||
fprintf(stderr, "Error: Could not open config file %s\n", filename);
|
||
return 1;
|
||
}
|
||
|
||
try
|
||
{
|
||
f >> config;
|
||
}
|
||
catch (json::parse_error &e)
|
||
{
|
||
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
size_t num;
|
||
uint64_t t_start;
|
||
size_t enc_count;
|
||
size_t offset;
|
||
|
||
size_t matches_size = 0;
|
||
uint64_t *matches = 0;
|
||
uint64_t *masks = 0;
|
||
|
||
try
|
||
{
|
||
t_start = config["start_timestamp"].get<uint64_t>(); // start T3
|
||
|
||
num = config["count"].get<size_t>(); // stop at T3 + count ns
|
||
|
||
offset = config["offset"].get<size_t>(); // we start at T3 + offset ns
|
||
|
||
enc_count = config["brute_force_time_range"].get<size_t>(); // we stop at T3 + offset + enc_count ns
|
||
|
||
// parse matches, and put it in array (match = plaintext^encrypted)
|
||
if (config.contains("matches"))
|
||
{
|
||
matches_size = config["matches"].size();
|
||
matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t));
|
||
for (size_t i = 0; i < matches_size; i++)
|
||
{
|
||
uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get<std::string>(), 0, 16);
|
||
uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get<std::string>(), 0, 16);
|
||
uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get<std::string>(), 0, 16);
|
||
matches[i] = plaintext ^ encrypted;
|
||
printf("Match %zu: %016lx\n", i, matches[i]);
|
||
masks[i] = bitmask;
|
||
printf("Mask %zu: %016lx\n", i, masks[i]);
|
||
}
|
||
}
|
||
}
|
||
catch (json::exception &e)
|
||
{
|
||
fprintf(stderr, "Error reading JSON values: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
printf("Configuration:\n");
|
||
printf("num: %zu\n", num);
|
||
printf("t_start: %lu\n", t_start);
|
||
printf("enc_count: %zu\n", enc_count);
|
||
printf("offset: %zu\n", offset);
|
||
|
||
printf("Brute forcing: %zu enc count %zu\n", num, enc_count);
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate host memory.
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t);
|
||
|
||
uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size);
|
||
|
||
uint64_t start_fill = get_time_in_nanosecond();
|
||
|
||
fill_input(h_input, t_start, num); // GENERATE TS
|
||
|
||
uint64_t end_fill = get_time_in_nanosecond();
|
||
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output, *d_output_enc;
|
||
int *d_found;
|
||
unsigned long long *d_matches;
|
||
unsigned long long *d_masks;
|
||
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_found, sizeof(int)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_matches\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_masks\n");
|
||
return 1;
|
||
}
|
||
|
||
long zero = 0;
|
||
cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
||
|
||
// copy matches and masks
|
||
cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice);
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
// Launch kernel.
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize;
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting timestamp -> random calculation...\n");
|
||
|
||
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
|
||
// //print first hash
|
||
printf("First hash: ");
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
printf("%02x", h_output[i]);
|
||
}
|
||
printf("\n");
|
||
// //print 2nd hash
|
||
printf("+offset hash: ");
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
printf("%02x", h_output[SHA256_DIGEST_SIZE * offset + i]);
|
||
}
|
||
printf("\n");
|
||
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
|
||
size_t limit = num - enc_count - offset;
|
||
printf("Limit: %zu\n", limit);
|
||
start = get_time_in_nanosecond();
|
||
|
||
int gridSizeEnc = (enc_count + blockSize - 1) / blockSize;
|
||
|
||
cudaDeviceProp prop;
|
||
cudaGetDeviceProperties(&prop, gpuIndex);
|
||
|
||
int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block
|
||
blockSize = maxThreadsPerBlock; // Dynamically set blockSize
|
||
gridSizeEnc = (enc_count + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
printf("Encryption Using blockSize = %d, gridSize = %d\n", blockSize, gridSizeEnc);
|
||
|
||
uint64_t start_enc = get_time_in_nanosecond();
|
||
|
||
for (size_t i = 0; i < limit; i++)
|
||
{
|
||
|
||
// launch test_kcipher2_kernel_single
|
||
uint32_t *host_i32 = (uint32_t *)(h_output + i * 32);
|
||
|
||
uint32_t k1 = host_i32[0];
|
||
uint32_t k2 = host_i32[1];
|
||
uint32_t k3 = host_i32[2];
|
||
uint32_t k4 = host_i32[3];
|
||
|
||
|
||
if (i > 0 && i % 10000 == 0)
|
||
{
|
||
uint64_t end_enc = get_time_in_nanosecond();
|
||
printf("10000 ns Time: %f ms\n", (end_enc - start_enc) / 1000000.0);
|
||
printf("Speed for 10000 ns: %f enc per second\n", 10000 / ((end_enc - start_enc) / 1000000000.0));
|
||
start_enc = get_time_in_nanosecond();
|
||
|
||
printf("Processing %zu: T3 = %lu\n", i, t_start + i);
|
||
fflush(stdout);
|
||
// printf("Current key: %08x %08x %08x %08x\n", k1, k2, k3, k4);
|
||
}
|
||
|
||
if ((i % 100000) == 0)
|
||
{
|
||
// create checkpoint
|
||
// modify json with current time
|
||
config["start_timestamp"] = t_start + i;
|
||
std::ofstream o(checkpoint);
|
||
o << std::setw(4) << config << std::endl;
|
||
}
|
||
|
||
encrypt_and_search<<<gridSizeEnc, blockSize>>>(d_output + (offset + i) * 32, d_output_enc, k1, k2, k3, k4, d_found, d_masks, d_matches, matches_size, enc_count);
|
||
|
||
cudaDeviceSynchronize();
|
||
|
||
// copy match
|
||
int found;
|
||
cudaMemcpy(&found, d_found, sizeof(int), cudaMemcpyDeviceToHost);
|
||
if (found)
|
||
{
|
||
|
||
// only copy the large block if we found a match
|
||
cudaMemcpy(h_output_enc, d_output_enc, enc_output_size, cudaMemcpyDeviceToHost);
|
||
|
||
// reset found flag for next match
|
||
cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
||
// reset d_output_enc
|
||
cudaMemset(d_output_enc, 0, enc_output_size);
|
||
|
||
uint64_t *host_i64 = (uint64_t *)(h_output_enc);
|
||
for (size_t j = 0; j < enc_count; j++)
|
||
{
|
||
for (size_t k = 0; k < matches_size; k++)
|
||
{
|
||
if ((*host_i64 & masks[k]) == matches[k])
|
||
{
|
||
printf("Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j);
|
||
// open file and write to it
|
||
FILE *out = fopen("output.txt", "a");
|
||
if (out != NULL)
|
||
{
|
||
fprintf(out, "Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j);
|
||
fclose(out);
|
||
}
|
||
}
|
||
}
|
||
|
||
host_i64 += 4;
|
||
}
|
||
}
|
||
}
|
||
|
||
end = get_time_in_nanosecond();
|
||
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f enc per second\n", enc_count / ((end - start) / 1000000000.0));
|
||
|
||
// Cleanup.
|
||
cudaFree(d_input);
|
||
cudaFree(d_output);
|
||
free(h_input);
|
||
free(h_output);
|
||
return 0;
|
||
}
|
||
|
||
//this will leak fd, but we don't care
|
||
uint8_t *load_random(const char *filename, size_t *size)
|
||
{
|
||
//mmap the file
|
||
int fd;
|
||
struct stat sb;
|
||
uint8_t *addr;
|
||
fd = open(filename, O_RDONLY);
|
||
if (fd == -1)
|
||
{
|
||
perror("open");
|
||
return 0;
|
||
}
|
||
//get file size using stat
|
||
if (fstat(fd, &sb) == -1)
|
||
{
|
||
close(fd);
|
||
perror("fstat");
|
||
return 0;
|
||
}
|
||
*size = sb.st_size;
|
||
addr = (uint8_t *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
|
||
if (addr == MAP_FAILED)
|
||
{
|
||
close(fd);
|
||
perror("mmap");
|
||
return 0;
|
||
}
|
||
return addr;
|
||
}
|
||
|
||
uint8_t *hex_to_bytes(const char *search_hex)
|
||
{
|
||
if (strlen(search_hex)!=32) {
|
||
printf("Search hex must be 32 bytes hex characters\n");
|
||
return 0;
|
||
}
|
||
|
||
uint8_t * search = (uint8_t *)malloc(16);
|
||
|
||
//convert search_hex (32 hex character, no space) to search
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
char tmp[3];
|
||
tmp[0] = search_hex[i * 2];
|
||
tmp[1] = search_hex[i * 2 + 1];
|
||
tmp[2] = 0;
|
||
search[i] = strtol(tmp, NULL, 16);
|
||
}
|
||
|
||
printf("Searching: ");
|
||
//hexdump to verify
|
||
for (int i = 0; i < 16; i++)
|
||
{
|
||
printf("%02x", search[i]);
|
||
}
|
||
printf("\n");
|
||
|
||
//for every 4 byte do: __builtin_bswap32
|
||
uint32_t *search_i32 = (uint32_t *)search;
|
||
for (int i = 0; i < 4; i++)
|
||
{
|
||
search_i32[i] = __builtin_bswap32(search_i32[i]);
|
||
}
|
||
return search;
|
||
}
|
||
|
||
int search_random(uint8_t *addr, size_t size, const char *search_hex)
|
||
{
|
||
if (!addr || !search_hex) {
|
||
return -1;
|
||
}
|
||
|
||
uint8_t *search = hex_to_bytes(search_hex);
|
||
if (!search) {
|
||
return -1;
|
||
}
|
||
|
||
//now search the file using memmem
|
||
void * pos = memmem(addr, size, search, 16);
|
||
int found_pos = 1;
|
||
if (pos ) {
|
||
printf("Found at index %ld\n", ((uint8_t *)pos - addr)/16);
|
||
found_pos = ((uint8_t *)pos - addr)/16;
|
||
|
||
} else {
|
||
printf("Not found\n");
|
||
}
|
||
return found_pos;
|
||
}
|
||
|
||
//hash computation in GPU, chacha8 in CPU
|
||
int test_chacha8_speed(size_t num)
|
||
{
|
||
printf("Test random + Chacha8 : %lu\n", num);
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate host memory.
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
|
||
uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
uint64_t start_fill = get_time_in_nanosecond();
|
||
|
||
fill_input(h_input, t_start, num);
|
||
uint64_t end_fill = get_time_in_nanosecond();
|
||
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output;
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting timestamp -> random calculation...\n");
|
||
fflush(stdout);
|
||
|
||
multihash_kernel<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
// Cleanup.
|
||
cudaFree(d_input);
|
||
cudaFree(d_output);
|
||
uint8_t input[64];
|
||
uint8_t output[64];
|
||
memset(input, 0, 64);
|
||
start = get_time_in_nanosecond();
|
||
for (size_t i = 0; i < num; i++) {
|
||
uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE;
|
||
uint8_t *iv = h_output + i * SHA256_DIGEST_SIZE;
|
||
chacha8_ctx ctx;
|
||
chacha8_keysetup(&ctx, seed, iv);
|
||
chacha8_get_keystream_oneblock(&ctx, output);
|
||
|
||
}
|
||
end = get_time_in_nanosecond();
|
||
printf("Enc Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
//speed
|
||
printf("Speed: %f enc per second\n", num / ((end - start) / 1000000000.0));
|
||
|
||
|
||
return 0;
|
||
}
|
||
|
||
void hexdump(const char *title, const uint8_t *data, size_t size)
|
||
{
|
||
printf("%s: ", title);
|
||
const uint8_t *p = data;
|
||
for (size_t i = 0; i < size; i++) {
|
||
printf("%02x ", p[i]);
|
||
}
|
||
printf("\n");
|
||
}
|
||
|
||
|
||
int bruteforce_chacha(const char *filename)
|
||
{
|
||
std::string json_file = filename;
|
||
std::string checkpoint = json_file + "checkpoint.json";
|
||
uint64_t t3_ts;
|
||
uint64_t t3_t1_offset;
|
||
uint64_t t1_t2_start_offset;
|
||
uint64_t t1_t2_end_offset;
|
||
uint64_t encrypted, plaintext, value;
|
||
|
||
//parse JSON
|
||
std::ifstream f(filename);
|
||
if (!f.is_open())
|
||
{
|
||
fprintf(stderr, "Error: Could not open config file %s\n", filename);
|
||
return 1;
|
||
}
|
||
json config;
|
||
try
|
||
{
|
||
f >> config;
|
||
t3_ts = config["t3_ts"].get<uint64_t>(); //start of kcipher2
|
||
t3_t1_offset = config["t3_t1_offset"].get<uint64_t>(); //how far from T3 do we want to start our timestamp
|
||
t1_t2_start_offset = config["t1_t2_start_offset"].get<uint64_t>(); //start offset to test for t1-t2
|
||
t1_t2_end_offset = config["t1_t2_end_offset"].get<uint64_t>(); //end offset to test for t1-t2
|
||
encrypted = std::stoull(config["encrypted"].get<std::string>(), 0, 16);
|
||
printf("Encrypted: %016lx\n", encrypted);
|
||
plaintext = std::stoull(config["plaintext"].get<std::string>(), 0, 16);
|
||
printf("Plaintext: %016lx\n", plaintext);
|
||
value = encrypted ^ plaintext;
|
||
printf("Value: %016lx\n", value);
|
||
}
|
||
catch (json::parse_error &e)
|
||
{
|
||
fprintf(stderr, "Error parsing JSON: %s\n", e.what());
|
||
return 1;
|
||
}
|
||
|
||
|
||
|
||
size_t num = t3_t1_offset + t1_t2_end_offset;
|
||
|
||
size_t input_size = num * 19 * sizeof(uint8_t);
|
||
printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0);
|
||
|
||
// Allocate host memory.
|
||
uint8_t *h_input = (uint8_t *)malloc(input_size);
|
||
|
||
uint64_t t_start = t3_ts - t3_t1_offset;
|
||
printf("t_start: %lu\n", t_start);
|
||
uint64_t t_end = t_start + num;
|
||
printf("t_end: %lu\n", t_end);
|
||
|
||
// output
|
||
size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t);
|
||
printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0);
|
||
|
||
uint8_t *h_output = (uint8_t *)malloc(output_size);
|
||
|
||
uint64_t start_fill = get_time_in_nanosecond();
|
||
|
||
fill_input(h_input, t_start, num);
|
||
uint64_t end_fill = get_time_in_nanosecond();
|
||
printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0);
|
||
|
||
// Allocate device memory.
|
||
uint8_t *d_input, *d_output;
|
||
if (cudaMalloc(&d_input, input_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_input\n");
|
||
return 1;
|
||
}
|
||
if (cudaMalloc(&d_output, output_size) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_output\n");
|
||
return 1;
|
||
}
|
||
|
||
uint64_t start = get_time_in_nanosecond();
|
||
|
||
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
|
||
|
||
int blockSize = 256;
|
||
int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize);
|
||
|
||
printf("Starting seed calculation...\n");
|
||
fflush(stdout);
|
||
|
||
multihash_kernel_noswap<<<gridSize, blockSize>>>(d_input, d_output, num);
|
||
|
||
cudaError_t err = cudaGetLastError();
|
||
if (err != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
return 1;
|
||
}
|
||
|
||
cudaDeviceSynchronize();
|
||
uint64_t end1 = get_time_in_nanosecond();
|
||
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
|
||
|
||
uint64_t end = get_time_in_nanosecond();
|
||
printf("Total Time: %f ms\n", (end - start) / 1000000.0);
|
||
// print speed per second
|
||
printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0));
|
||
// Cleanup.
|
||
cudaFree(d_input);
|
||
|
||
size_t num_loop = t3_t1_offset;
|
||
|
||
blockSize = 256;
|
||
gridSize = (num_loop + blockSize - 1) / blockSize; // Ensure full coverage
|
||
|
||
unsigned long long *d_found;
|
||
if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess)
|
||
{
|
||
fprintf(stderr, "Error: cudaMalloc failed for d_found\n");
|
||
return 1;
|
||
}
|
||
|
||
start = get_time_in_nanosecond();
|
||
|
||
for (size_t offset = t1_t2_start_offset; offset < t1_t2_end_offset; offset++)
|
||
{
|
||
size_t n = offset - t1_t2_start_offset;
|
||
if ((n % 500)==0) {
|
||
end1 = get_time_in_nanosecond();
|
||
printf("Time: %f ms\n", (end1 - start) / 1000000.0);
|
||
double offset_per_second = n / ((end1 - start) / 1000000000.0);
|
||
printf("Speed: %f offsets per second\n", offset_per_second);
|
||
printf("Processing offset %zu elapsed %f minutes\n", offset, (end1 - start) / 1000000000.0 / 60.0);
|
||
//remaining
|
||
double remaining = (t1_t2_end_offset - offset) / offset_per_second;
|
||
printf("Remaining: %zu minutes\n", (size_t)(remaining / 60.0));
|
||
}
|
||
#if 0
|
||
printf("ts = %zu\n", t_start);
|
||
printf("Tp Offset: %zu ts = %zu\n", offset, t_start + offset);
|
||
uint8_t output[64];
|
||
uint64_t *val = (uint64_t *)output;
|
||
for (size_t i = 0; i < num_loop; i++ ) {
|
||
uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE;
|
||
//hexdump("Seed", seed, 32);
|
||
uint8_t *iv = h_output + (i + offset) * SHA256_DIGEST_SIZE;
|
||
//hexdump("IV", iv, 16);
|
||
chacha8_ctx ctx;
|
||
chacha8_keysetup(&ctx, seed, iv);
|
||
chacha8_get_keystream_oneblock(&ctx, output);
|
||
//hexdump output
|
||
hexdump("Output", output, 64);
|
||
if (*val == value) {
|
||
printf("Found at time %zu offset = %zu\n", i, offset);
|
||
//save to file output.txt
|
||
FILE *out = fopen("output.txt", "a");
|
||
if (out != NULL)
|
||
{
|
||
fprintf(out, "Found at time %zu offset = %zu\n", i, offset);
|
||
fclose(out);
|
||
}
|
||
|
||
return 0;
|
||
}
|
||
//break;
|
||
}
|
||
//break;
|
||
#else
|
||
//test using GPU
|
||
|
||
chacha8_encrypt_and_match<<<gridSize, blockSize>>>(d_output, offset, d_found, value, num_loop);
|
||
// //check for errors
|
||
// cudaError_t err = cudaGetLastError();
|
||
// if (err != cudaSuccess)
|
||
// {
|
||
// fprintf(stderr, "Error: %s\n", cudaGetErrorString(err));
|
||
// return 1;
|
||
// }
|
||
//sync
|
||
if (n % 1000 == 0) {
|
||
cudaDeviceSynchronize();
|
||
unsigned long long found = 0;
|
||
cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost);
|
||
if (found) {
|
||
// unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx;
|
||
size_t t_offset, t_index;
|
||
t_offset = found >> 32;
|
||
t_index = found & 0xffffffff;
|
||
printf("Found at offset %zu index %zu\n", t_offset, t_index + t_start);
|
||
//write to file
|
||
FILE *out = fopen("output.txt", "a");
|
||
if (out != NULL)
|
||
{
|
||
fprintf(out, "Found at offset %zu index %zu\n", t_offset, t_index + t_start);
|
||
fclose(out);
|
||
}
|
||
return 1;
|
||
}
|
||
}
|
||
#endif
|
||
}
|
||
|
||
return 0;
|
||
|
||
}
|
||
|
||
|
||
|
||
int main(int argc, char *argv[])
|
||
{
|
||
size_t num = 128 * 1000;
|
||
|
||
int smCount;
|
||
cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, gpuIndex);
|
||
printf("SM count: %d\n", smCount);
|
||
|
||
if (argc > 1)
|
||
{
|
||
if (strcmp(argv[1], "random") == 0) //test generate random from timestamp
|
||
{
|
||
if (argc > 2)
|
||
{
|
||
num = atoll(argv[2]);
|
||
}
|
||
|
||
return test_generate_random_only(num);
|
||
}
|
||
if (strcmp(argv[1], "random-gpu") == 0) //test generate random from timestamp, using GPU
|
||
{
|
||
if (argc > 2)
|
||
{
|
||
num = atoll(argv[2]);
|
||
}
|
||
|
||
return test_generate_random_only_in_gpu(num);
|
||
}
|
||
|
||
if (strcmp(argv[1], "saverandom") == 0)
|
||
{
|
||
uint64_t start_time = get_time_in_nanosecond();
|
||
if (argc > 1)
|
||
{
|
||
start_time = atoll(argv[2]);
|
||
char tmp[16];
|
||
snprintf(tmp, sizeof(tmp), "%lu", start_time);
|
||
if (strlen(tmp)<19) {
|
||
start_time = start_time * 1000000000;
|
||
}
|
||
|
||
}
|
||
char filename[256];
|
||
snprintf(filename, 256, "random_%lu.bin", start_time);
|
||
|
||
return save_random(start_time, filename);
|
||
}
|
||
if (strcmp(argv[1], "search") == 0) {
|
||
if (argc > 3) {
|
||
const char *filename = argv[2];
|
||
const char *search_hex = argv[3];
|
||
size_t size;
|
||
uint8_t *addr = load_random(filename, &size);
|
||
if (addr) {
|
||
int res = search_random(addr, size, search_hex);
|
||
if (argc > 4) {
|
||
const char *search_hex2 = argv[4];
|
||
int res2 = search_random(addr, size, search_hex2);
|
||
printf("Diff: %d\n", res2-res);
|
||
}
|
||
}
|
||
} else {
|
||
printf("usage: search random.bin 16_byte_hex_sequece_no_space\n");
|
||
}
|
||
}
|
||
|
||
if (strcmp(argv[1], "enc") == 0) //test encryption only
|
||
{
|
||
if (argc > 2)
|
||
{
|
||
num = atoll(argv[2]);
|
||
}
|
||
return test_encryption_only(num);
|
||
}
|
||
|
||
if (strcmp(argv[1], "chacha8") == 0) //test chacha8 only
|
||
{
|
||
if (argc > 2)
|
||
{
|
||
num = atoll(argv[2]);
|
||
}
|
||
return test_chacha8_speed(num);
|
||
}
|
||
|
||
if (strcmp(argv[1], "runchacha") == 0 || strcmp(argv[1], "runchacha8") == 0) //run chacha8 bruteforce
|
||
{
|
||
if (argc > 2)
|
||
{
|
||
if (argc > 3)
|
||
{
|
||
gpuIndex = atoi(argv[3]);
|
||
}
|
||
|
||
int deviceCount;
|
||
cudaGetDeviceCount(&deviceCount);
|
||
|
||
if (gpuIndex < 0 || gpuIndex >= deviceCount)
|
||
{
|
||
printf("Invalid GPU index %d\n", gpuIndex);
|
||
return EXIT_FAILURE;
|
||
}
|
||
cudaSetDevice(gpuIndex);
|
||
printf("Using GPU %d\n", gpuIndex);
|
||
|
||
return bruteforce_chacha(argv[2]);
|
||
} else {
|
||
printf("Usage: runchacha config.json\n");
|
||
}
|
||
}
|
||
//main bruteforce loop
|
||
if (strcmp(argv[1], "run") == 0 || strcmp(argv[1], "run2")==0 || strcmp(argv[1], "run3") == 0)
|
||
{
|
||
// read config from JSON file
|
||
if (argc > 2)
|
||
{
|
||
if (argc > 3)
|
||
{
|
||
gpuIndex = atoi(argv[3]);
|
||
}
|
||
|
||
int deviceCount;
|
||
cudaGetDeviceCount(&deviceCount);
|
||
|
||
if (gpuIndex < 0 || gpuIndex >= deviceCount)
|
||
{
|
||
printf("Invalid GPU index %d\n", gpuIndex);
|
||
return EXIT_FAILURE;
|
||
}
|
||
cudaSetDevice(gpuIndex);
|
||
printf("Using GPU %d\n", gpuIndex);
|
||
|
||
if (strcmp(argv[1], "run") == 0)
|
||
{
|
||
return do_bruteforce(argv[2]); //this is the slow method
|
||
}
|
||
else if (strcmp(argv[1], "run2") == 0)
|
||
{
|
||
return do_bruteforce_new(argv[2]); //this is the faster method
|
||
} else if (strcmp(argv[1], "run3") == 0) {
|
||
return do_bruteforce_offset(argv[2]); //this will read values from offset.txt instead of using ranges
|
||
}
|
||
}
|
||
printf("Please specify the JSON config file\n");
|
||
}
|
||
}
|
||
|
||
return 0;
|
||
}
|