#include #include #include #include #include #include //stat #include #include //mmap #include #include #include "json.hpp" #include "test-ts.h" #include "akira-bruteforce.h" #include "chacha8.h" //by design, max matches is 127 #define MAX_MATCHES 32 using json = nlohmann::json; int gpuIndex = 0; #define SHA256_DIGEST_SIZE 32 // --- SHA-256 Device Implementation --- // Rotate right. __device__ __forceinline__ uint32_t rotr(uint32_t x, uint32_t n) { //return (x >> n) | (x << (32 - n)); return __funnelshift_r(x, x, n ); } #define SHR(x, n) ((x) >> (n)) #define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) #define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) #define Sigma0(x) (rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22)) #define Sigma1(x) (rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25)) #define sigma0(x) (rotr(x, 7) ^ rotr(x, 18) ^ SHR(x, 3)) #define sigma1(x) (rotr(x, 17) ^ rotr(x, 19) ^ SHR(x, 10)) // Process one 64-byte block. __device__ __forceinline__ void sha256_transform(const uint8_t *data, uint32_t state[8], const uint32_t *k) { uint32_t w[64]; #pragma unroll for (int i = 0; i < 16; i++) { w[i] = ((uint32_t)data[i * 4] << 24) | ((uint32_t)data[i * 4 + 1] << 16) | ((uint32_t)data[i * 4 + 2] << 8) | ((uint32_t)data[i * 4 + 3]); } for (int i = 16; i < 64; i++) { w[i] = sigma1(w[i - 2]) + w[i - 7] + sigma0(w[i - 15]) + w[i - 16]; } uint32_t a = state[0]; uint32_t b = state[1]; uint32_t c = state[2]; uint32_t d = state[3]; uint32_t e = state[4]; uint32_t f = state[5]; uint32_t g = state[6]; uint32_t h = state[7]; for (int i = 0; i < 64; i++) { uint32_t T1 = h + Sigma1(e) + Ch(e, f, g) + k[i] + w[i]; uint32_t T2 = Sigma0(a) + Maj(a, b, c); h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2; } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } // For a fixed 19-byte input, the padded message is exactly 64 bytes. // Padding: message (19 bytes) || 0x80 || (zeros up to byte 56) || [0x0000000000000098] // (19*8 = 152, or 0x98) __device__ void sha256_hash_19(const uint8_t *msg, uint8_t *digest, const uint32_t *k) { uint32_t state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; uint8_t block[64]; #pragma unroll for (int i = 0; i < 19; i++) block[i] = msg[i]; block[19] = 0x80; #pragma unroll for (int i = 20; i < 56; i++) block[i] = 0; // Append message length in bits: 152 = 0x0000000000000098 (big-endian) block[56] = 0; block[57] = 0; block[58] = 0; block[59] = 0; block[60] = 0; block[61] = 0; block[62] = 0; block[63] = 152; sha256_transform(block, state, k); #pragma unroll for (int i = 0; i < 8; i++) { digest[i * 4] = (state[i] >> 24) & 0xff; digest[i * 4 + 1] = (state[i] >> 16) & 0xff; digest[i * 4 + 2] = (state[i] >> 8) & 0xff; digest[i * 4 + 3] = state[i] & 0xff; } } // For a fixed 68-byte input, the padded message consists of 2 blocks. // Block 1: first 64 bytes of the input. // Block 2: remaining 4 bytes || 0x80 || zeros until byte 56 || [bit length = 544 bits] __device__ void sha256_hash_68(const uint8_t * msg, uint8_t *digest, const uint32_t * k) { uint32_t state[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; uint8_t block2[64]; // Block 1: copy first 64 bytes. //uint8_t block[64]; #pragma unroll for (int i = 0; i < 64; i++) block2[i] = msg[i]; sha256_transform(block2, state, k); // Block 2: // First 4 bytes: remainder of message. #pragma unroll for (int i = 0; i < 4; i++) block2[i] = msg[64 + i]; block2[4] = 0x80; #pragma unroll for (int i = 5; i < 56; i++) block2[i] = 0; // Append message length in bits: 68*8 = 544 = 0x0000000000000220 (big-endian) block2[56] = 0; block2[57] = 0; block2[58] = 0; block2[59] = 0; block2[60] = 0; block2[61] = 0; block2[62] = 0x02; block2[63] = 0x20; sha256_transform(block2, state, k); #pragma unroll for (int i = 0; i < 8; i++) { digest[i * 4] = (state[i] >> 24) & 0xff; digest[i * 4 + 1] = (state[i] >> 16) & 0xff; digest[i * 4 + 2] = (state[i] >> 8) & 0xff; digest[i * 4 + 3] = state[i] & 0xff; } } // Minimal AES-256 context and helper functions. typedef struct { uint32_t rk[60]; // 60 words for 14 rounds + initial key } aes256_ctx; __device__ __forceinline__ uint32_t SubWord(uint32_t word, const uint8_t *sbox) { return ((uint32_t)sbox[(word >> 24)] << 24) | ((uint32_t)sbox[(word >> 16) & 0xff] << 16) | ((uint32_t)sbox[(word >> 8) & 0xff] << 8) | ((uint32_t)sbox[word & 0xff]); } __device__ inline uint32_t RotWord(uint32_t word) { //return (word << 8) | (word >> 24); return __funnelshift_l(word, word, 8); } // Key expansion for AES-256. __device__ void aes256_set_encrypt_key(const uint8_t userKey[32], aes256_ctx *ctx, const uint8_t *sbox, const uint32_t *Rcon) { int i = 0; // Copy the 256-bit key into the first 8 words. for (i = 0; i < 8; i++) { ctx->rk[i] = ((uint32_t)userKey[4 * i] << 24) | ((uint32_t)userKey[4 * i + 1] << 16) | ((uint32_t)userKey[4 * i + 2] << 8) | ((uint32_t)userKey[4 * i + 3]); } int rcon_i = 0; for (i = 8; i < 60; i++) { uint32_t temp = ctx->rk[i - 1]; if ((i % 8) == 0) { temp = SubWord(RotWord(temp), sbox) ^ Rcon[rcon_i++]; } else if ((i % 8) == 4) { temp = SubWord(temp, sbox); } ctx->rk[i] = ctx->rk[i - 8] ^ temp; } } // AES block encryption for a single 16-byte block. __device__ void aes256_encrypt(const aes256_ctx *ctx, const uint8_t in[16], uint8_t out[16], const uint8_t *sbox, const uint8_t *xtime) { uint8_t state[16]; // Copy input into state. for (int i = 0; i < 16; i++) { state[i] = in[i]; } // Initial AddRoundKey. for (int i = 0; i < 4; i++) { uint32_t rk = ctx->rk[i]; state[4 * i + 0] ^= (uint8_t)(rk >> 24); state[4 * i + 1] ^= (uint8_t)(rk >> 16); state[4 * i + 2] ^= (uint8_t)(rk >> 8); state[4 * i + 3] ^= (uint8_t)(rk); } // Main rounds. for (int round = 1; round < 14; round++) { // SubBytes. for (int i = 0; i < 16; i++) state[i] = sbox[state[i]]; // ShiftRows. uint8_t tmp[16]; tmp[0] = state[0]; tmp[1] = state[5]; tmp[2] = state[10]; tmp[3] = state[15]; tmp[4] = state[4]; tmp[5] = state[9]; tmp[6] = state[14]; tmp[7] = state[3]; tmp[8] = state[8]; tmp[9] = state[13]; tmp[10] = state[2]; tmp[11] = state[7]; tmp[12] = state[12]; tmp[13] = state[1]; tmp[14] = state[6]; tmp[15] = state[11]; // MixColumns. for (int i = 0; i < 4; i++) { int col = 4 * i; uint8_t a0 = tmp[col + 0], a1 = tmp[col + 1], a2 = tmp[col + 2], a3 = tmp[col + 3]; uint8_t r0 = xtime[a0] ^ (a1 ^ xtime[a1]) ^ a2 ^ a3; uint8_t r1 = a0 ^ xtime[a1] ^ (a2 ^ xtime[a2]) ^ a3; uint8_t r2 = a0 ^ a1 ^ xtime[a2] ^ (a3 ^ xtime[a3]); uint8_t r3 = (a0 ^ xtime[a0]) ^ a1 ^ a2 ^ xtime[a3]; tmp[col + 0] = r0; tmp[col + 1] = r1; tmp[col + 2] = r2; tmp[col + 3] = r3; } // Copy back to state. for (int i = 0; i < 16; i++) state[i] = tmp[i]; // AddRoundKey. for (int i = 0; i < 4; i++) { uint32_t rk = ctx->rk[round * 4 + i]; state[4 * i + 0] ^= (uint8_t)(rk >> 24); state[4 * i + 1] ^= (uint8_t)(rk >> 16); state[4 * i + 2] ^= (uint8_t)(rk >> 8); state[4 * i + 3] ^= (uint8_t)(rk); } } // Final round (no MixColumns). // SubBytes. for (int i = 0; i < 16; i++) state[i] = sbox[state[i]]; // ShiftRows. uint8_t tmp[16]; tmp[0] = state[0]; tmp[1] = state[5]; tmp[2] = state[10]; tmp[3] = state[15]; tmp[4] = state[4]; tmp[5] = state[9]; tmp[6] = state[14]; tmp[7] = state[3]; tmp[8] = state[8]; tmp[9] = state[13]; tmp[10] = state[2]; tmp[11] = state[7]; tmp[12] = state[12]; tmp[13] = state[1]; tmp[14] = state[6]; tmp[15] = state[11]; // Final AddRoundKey. for (int i = 0; i < 4; i++) { uint32_t rk = ctx->rk[14 * 4 + i]; tmp[4 * i + 0] ^= (uint8_t)(rk >> 24); tmp[4 * i + 1] ^= (uint8_t)(rk >> 16); tmp[4 * i + 2] ^= (uint8_t)(rk >> 8); tmp[4 * i + 3] ^= (uint8_t)(rk); } // Write result. for (int i = 0; i < 16; i++) out[i] = tmp[i]; } // kcipher2 state structure. typedef struct { unsigned int A[5]; unsigned int B[11]; unsigned int L1, R1, L2, R2; } kcipher2_state; //--------------------------------------------------------------------------- // Device functions for kcipher2 //--------------------------------------------------------------------------- __device__ __forceinline__ unsigned int nlf(unsigned int a, unsigned int b, unsigned int c, unsigned int d) { return (a + b) ^ c ^ d; } __device__ __forceinline__ unsigned char gf_multiply_by_2(unsigned char t) { // return gf2_table[t]; unsigned int lq = t << 1; if (lq & 0x100) lq ^= 0x011B; return ((unsigned char)lq) ^ 0xFF; } __device__ __forceinline__ unsigned char gf_multiply_by_3(unsigned char t) { // return gf3_table[t]; unsigned int lq = (t << 1) ^ t; if (lq & 0x100) lq ^= 0x011B; return ((unsigned char)lq) ^ 0xFF; } __device__ unsigned int sub_k2(unsigned int in) { unsigned char w0 = in & 0xFF; unsigned char w1 = (in >> 8) & 0xFF; unsigned char w2 = (in >> 16) & 0xFF; unsigned char w3 = (in >> 24) & 0xFF; unsigned char t0 = d_s_box[w0]; unsigned char t1 = d_s_box[w1]; unsigned char t2 = d_s_box[w2]; unsigned char t3 = d_s_box[w3]; unsigned char q0 = gf_multiply_by_2(t0) ^ gf_multiply_by_3(t1) ^ t2 ^ t3; unsigned char q1 = t0 ^ gf_multiply_by_2(t1) ^ gf_multiply_by_3(t2) ^ t3; unsigned char q2 = t0 ^ t1 ^ gf_multiply_by_2(t2) ^ gf_multiply_by_3(t3); unsigned char q3 = gf_multiply_by_3(t0) ^ t1 ^ t2 ^ gf_multiply_by_2(t3); return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0; } __device__ void setup_state_values(const unsigned int *key, const unsigned int *iv, kcipher2_state *state) { unsigned int IK[12]; IK[0] = key[0]; IK[1] = key[1]; IK[2] = key[2]; IK[3] = key[3]; IK[4] = IK[0] ^ sub_k2((IK[3] << 8) ^ (IK[3] >> 24)) ^ 0x01000000; IK[5] = IK[1] ^ IK[4]; IK[6] = IK[2] ^ IK[5]; IK[7] = IK[3] ^ IK[6]; IK[8] = IK[4] ^ sub_k2((IK[7] << 8) ^ (IK[7] >> 24)) ^ 0x02000000; IK[9] = IK[5] ^ IK[8]; IK[10] = IK[6] ^ IK[9]; IK[11] = IK[7] ^ IK[10]; state->A[0] = IK[4]; state->A[1] = IK[3]; state->A[2] = IK[2]; state->A[3] = IK[1]; state->A[4] = IK[0]; state->B[0] = IK[10]; state->B[1] = IK[11]; state->B[2] = iv[0]; state->B[3] = iv[1]; state->B[4] = IK[8]; state->B[5] = IK[9]; state->B[6] = iv[2]; state->B[7] = iv[3]; state->B[8] = IK[7]; state->B[9] = IK[5]; state->B[10] = IK[6]; state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000; } __device__ void next_INIT(kcipher2_state *state) { unsigned int temp2; unsigned int nL1 = sub_k2(state->R2 + state->B[4]); unsigned int nR1 = sub_k2(state->L2 + state->B[9]); unsigned int nL2 = sub_k2(state->L1); unsigned int nR2 = sub_k2(state->R1); unsigned int nA[5]; nA[0] = state->A[1]; nA[1] = state->A[2]; nA[2] = state->A[3]; nA[3] = state->A[4]; unsigned int nB[11]; nB[0] = state->B[1]; nB[1] = state->B[2]; nB[2] = state->B[3]; nB[3] = state->B[4]; nB[4] = state->B[5]; nB[5] = state->B[6]; nB[6] = state->B[7]; nB[7] = state->B[8]; nB[8] = state->B[9]; nB[9] = state->B[10]; unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF]; nA[4] = temp1 ^ state->A[3]; nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]); if (state->A[2] & 0x40000000) temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF]; else temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF]; // branchless version (not faster) // unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000)); // unsigned int candidate1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF]; // unsigned int candidate2 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF]; // temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask); if (state->A[2] & 0x80000000) temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF]; else temp2 = state->B[8]; // branchless version (not faster) // unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000)); // unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF]; // unsigned int candidate4 = state->B[8]; // temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2); nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2; nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]); state->A[0] = nA[0]; state->A[1] = nA[1]; state->A[2] = nA[2]; state->A[3] = nA[3]; state->A[4] = nA[4]; state->B[0] = nB[0]; state->B[1] = nB[1]; state->B[2] = nB[2]; state->B[3] = nB[3]; state->B[4] = nB[4]; state->B[5] = nB[5]; state->B[6] = nB[6]; state->B[7] = nB[7]; state->B[8] = nB[8]; state->B[9] = nB[9]; state->B[10] = nB[10]; state->L1 = nL1; state->R1 = nR1; state->L2 = nL2; state->R2 = nR2; } __device__ unsigned int sub_k2_shared(unsigned int in, const unsigned char *gf2_table, const unsigned char *gf3_table, const unsigned char *d_s_box) { unsigned char w0 = in & 0xFF; unsigned char w1 = (in >> 8) & 0xFF; unsigned char w2 = (in >> 16) & 0xFF; unsigned char w3 = (in >> 24) & 0xFF; unsigned char t0 = d_s_box[w0]; unsigned char t1 = d_s_box[w1]; unsigned char t2 = d_s_box[w2]; unsigned char t3 = d_s_box[w3]; unsigned char q0 = gf2_table[t0] ^ gf3_table[t1] ^ t2 ^ t3; unsigned char q1 = t0 ^ gf2_table[t1] ^ gf3_table[t2] ^ t3; unsigned char q2 = t0 ^ t1 ^ gf2_table[t2] ^ gf3_table[t3]; unsigned char q3 = gf3_table[t0] ^ t1 ^ t2 ^ gf2_table[t3]; return ((unsigned int)q3 << 24) | ((unsigned int)q2 << 16) | ((unsigned int)q1 << 8) | q0; } __device__ void setup_state_values_shared(const unsigned int *key, const unsigned int *iv, kcipher2_state *state, const unsigned char *gf2_table, const unsigned char *gf3_table, const unsigned char *d_s_box) { unsigned int IK[12]; IK[0] = key[0]; IK[1] = key[1]; IK[2] = key[2]; IK[3] = key[3]; IK[4] = IK[0] ^ sub_k2_shared((IK[3] << 8) ^ (IK[3] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x01000000; IK[5] = IK[1] ^ IK[4]; IK[6] = IK[2] ^ IK[5]; IK[7] = IK[3] ^ IK[6]; IK[8] = IK[4] ^ sub_k2_shared((IK[7] << 8) ^ (IK[7] >> 24), gf2_table, gf3_table, d_s_box) ^ 0x02000000; IK[9] = IK[5] ^ IK[8]; IK[10] = IK[6] ^ IK[9]; IK[11] = IK[7] ^ IK[10]; state->A[0] = IK[4]; state->A[1] = IK[3]; state->A[2] = IK[2]; state->A[3] = IK[1]; state->A[4] = IK[0]; state->B[0] = IK[10]; state->B[1] = IK[11]; state->B[2] = iv[0]; state->B[3] = iv[1]; state->B[4] = IK[8]; state->B[5] = IK[9]; state->B[6] = iv[2]; state->B[7] = iv[3]; state->B[8] = IK[7]; state->B[9] = IK[5]; state->B[10] = IK[6]; state->L1 = state->R1 = state->L2 = state->R2 = 0x00000000; } __device__ void next_INIT_shared(kcipher2_state *state, const unsigned char *gf2_table, const unsigned char *gf3_table, const unsigned char *d_s_box, const unsigned int *d_amul0, const unsigned int *d_amul1, const unsigned int *d_amul2, const unsigned int *d_amul3) { unsigned int temp2; unsigned int nL1 = sub_k2_shared(state->R2 + state->B[4], gf2_table, gf3_table, d_s_box); unsigned int nR1 = sub_k2_shared(state->L2 + state->B[9], gf2_table, gf3_table, d_s_box); unsigned int nL2 = sub_k2_shared(state->L1, gf2_table, gf3_table, d_s_box); unsigned int nR2 = sub_k2_shared(state->R1, gf2_table, gf3_table, d_s_box); unsigned int nA[5]; nA[0] = state->A[1]; nA[1] = state->A[2]; nA[2] = state->A[3]; nA[3] = state->A[4]; unsigned int nB[11]; nB[0] = state->B[1]; nB[1] = state->B[2]; nB[2] = state->B[3]; nB[3] = state->B[4]; nB[4] = state->B[5]; nB[5] = state->B[6]; nB[6] = state->B[7]; nB[7] = state->B[8]; nB[8] = state->B[9]; nB[9] = state->B[10]; unsigned int temp1 = (state->A[0] << 8) ^ d_amul0[(state->A[0] >> 24) & 0xFF]; nA[4] = temp1 ^ state->A[3]; nA[4] ^= nlf(state->B[0], state->R2, state->R1, state->A[4]); if (state->A[2] & 0x40000000) temp1 = (state->B[0] << 8) ^ d_amul1[(state->B[0] >> 24) & 0xFF]; else temp1 = (state->B[0] << 8) ^ d_amul2[(state->B[0] >> 24) & 0xFF]; // branchless version (not faster) // unsigned int mask = -(unsigned int)(!!(state->A[2] & 0x40000000)); // int b0 = state->B[0] << 8; // int b0_24 = (state->B[0] >> 24) & 0xFF; // unsigned int candidate1 = (b0) ^ d_amul1[b0_24]; // unsigned int candidate2 = (b0) ^ d_amul2[b0_24]; // temp1 = candidate2 ^ ((candidate1 ^ candidate2) & mask); if (state->A[2] & 0x80000000) temp2 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF]; else temp2 = state->B[8]; // branchless version (not faster) // unsigned int mask2 = -(unsigned int)(!!(state->A[2] & 0x80000000)); // unsigned int candidate3 = (state->B[8] << 8) ^ d_amul3[(state->B[8] >> 24) & 0xFF]; // unsigned int candidate4 = state->B[8]; // temp2 = candidate4 ^ ((candidate3 ^ candidate4) & mask2); nB[10] = temp1 ^ state->B[1] ^ state->B[6] ^ temp2; nB[10] ^= nlf(state->B[10], state->L2, state->L1, state->A[0]); state->A[0] = nA[0]; state->A[1] = nA[1]; state->A[2] = nA[2]; state->A[3] = nA[3]; state->A[4] = nA[4]; state->B[0] = nB[0]; state->B[1] = nB[1]; state->B[2] = nB[2]; state->B[3] = nB[3]; state->B[4] = nB[4]; state->B[5] = nB[5]; state->B[6] = nB[6]; state->B[7] = nB[7]; state->B[8] = nB[8]; state->B[9] = nB[9]; state->B[10] = nB[10]; state->L1 = nL1; state->R1 = nR1; state->L2 = nL2; state->R2 = nR2; } __device__ __forceinline__ unsigned long long bswap64(unsigned long long x) { uint32_t hi = x >> 32; // Upper 32 bits uint32_t lo = x & 0xFFFFFFFF; // Lower 32 bits // Swap bytes in each 32-bit part using __byte_perm hi = __byte_perm(hi, 0, 0x0123); lo = __byte_perm(lo, 0, 0x0123); // Swap the high and low parts and combine them back return ((uint64_t)lo << 32) | hi; } __device__ unsigned long long kcipher2_encrypt_1_zero_block(const unsigned int *key, const unsigned int *iv) { kcipher2_state state; setup_state_values(key, iv, &state); for (unsigned char i = 0; i < 24; i++) next_INIT(&state); unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]); unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]); return (((unsigned long long)zh) << 32) | zl; } __device__ unsigned long long kcipher2_encrypt_1_zero_block_shared(const unsigned int *key, const unsigned int *iv, const unsigned char *gf2_table, const unsigned char *gf3_table, const unsigned char *d_s_box, const unsigned int *d_amul0, const unsigned int *d_amul1, const unsigned int *d_amul2, const unsigned int *d_amul3 ) { kcipher2_state state; setup_state_values_shared(key, iv, &state, gf2_table, gf3_table, d_s_box); for (unsigned char i = 0; i < 24; i++) next_INIT_shared(&state, gf2_table, gf3_table, d_s_box, d_amul0, d_amul1, d_amul2, d_amul3); unsigned int zh = nlf(state.B[10], state.L2, state.L1, state.A[0]); unsigned int zl = nlf(state.B[0], state.R2, state.R1, state.A[4]); return (((unsigned long long)zh) << 32) | zl; } //--------------------------------------------------------------------------- // End of file //--------------------------------------------------------------------------- // --- Multihash Kernel --- // // Each thread computes: // digest0 = SHA256(19-byte input) // v0 = digest0 // for i=1..1499: // buffer = digest(i-1) || v0 || [i as 4-byte big-endian] // digest(i) = SHA256(buffer) // aes256_encrypt(&key_ctx, counter, counter); // Encrypt the (now updated) counter to produce the final output. // aes256_encrypt(&key_ctx, counter, digest); // this kernel only output 16 byte random number // but we output 32 byte block, so its like this <16 byte random> <16 byte empty> <16 byte random> <16 byte empty> ... // This will make it faster __global__ void multihash_kernel(const uint8_t *input, uint8_t *output, size_t num) { ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num) return; __shared__ uint32_t sha256_k[64]; __shared__ unsigned char gf2_table_shared[256]; __shared__ uint8_t aes_sbox[256]; __shared__ uint32_t aes_Rcon[7]; if (threadIdx.x == 0) { memcpy(sha256_k, k, sizeof(sha256_k)); memcpy(aes_sbox, sbox, sizeof(aes_sbox)); memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon)); memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared)); } __syncthreads(); // Make sure the data is loaded before use // Each input message is 19 bytes. const uint8_t *data = input + idx * 19; uint8_t digest[SHA256_DIGEST_SIZE]; uint8_t v0[SHA256_DIGEST_SIZE]; // First round: hash the 19-byte input. // sha256_hash(data, 19, digest); sha256_hash_19(data, digest, sha256_k); // Save initial digest as v0. for (int i = 0; i < SHA256_DIGEST_SIZE; i++) { v0[i] = digest[i]; } uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter) // Copy v0. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[32 + j] = v0[j]; } // Write counter (big-endian). // buffer[64] = (i >> 24) & 0xff; // buffer[65] = (i >> 16) & 0xff; // max is 1500 buffer[64] = 0; buffer[65] = 0; // 1,500 rounds total (first one is done above). for (int i = 1; i < 1500; i++) { // Copy current digest #pragma unroll for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[j] = digest[j]; } buffer[66] = (i >> 8) & 0xff; buffer[67] = i & 0xff; // sha256_hash(buffer, 68, digest); sha256_hash_68(buffer, digest, sha256_k); } #define AES_BLOCK_SIZE 16 // -------- AES post-processing on final digest -------- // Use final digest as the AES key. aes256_ctx key_ctx; aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon); uint8_t counter[16] = {0}; // clear counter // Encrypt the counter block. aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared); // Encrypt the (now updated) counter to produce the final output. aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared); // Write final (AES‑encrypted) digest to output. uint8_t *out = output + idx * SHA256_DIGEST_SIZE; uint32_t * out_u32 = (uint32_t *)out; uint32_t * digest_u32 = (uint32_t *)digest; //byte swap32 bit every 4 byte for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++) { out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123); } } __global__ void multihash_kernel_in_memory(uint8_t *output, uint64_t t_start, size_t num) { ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num) return; __shared__ uint32_t sha256_k[64]; __shared__ unsigned char gf2_table_shared[256]; __shared__ uint8_t aes_sbox[256]; __shared__ uint32_t aes_Rcon[7]; if (threadIdx.x == 0) { memcpy(sha256_k, k, sizeof(sha256_k)); memcpy(aes_sbox, sbox, sizeof(aes_sbox)); memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon)); memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared)); } __syncthreads(); // Make sure the data is loaded before use uint64_t t = t_start + idx; // Each input message is 19 bytes. uint8_t data[19]; for (int i =0; i < 19; i++) { data[18-i]='0' + (t%10); t /= 10; } uint8_t digest[SHA256_DIGEST_SIZE]; // uint8_t v0[SHA256_DIGEST_SIZE]; // First round: hash the 19-byte input. // sha256_hash(data, 19, digest); sha256_hash_19(data, digest, sha256_k); // Save initial digest as v0. // for (int i = 0; i < SHA256_DIGEST_SIZE; i++) // { // v0[i] = digest[i]; // } uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter) // Copy v0. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[32 + j] = digest[j]; } // Write counter (big-endian). // buffer[64] = (i >> 24) & 0xff; // buffer[65] = (i >> 16) & 0xff; // max is 1500 buffer[64] = 0; buffer[65] = 0; // 1,500 rounds total (first one is done above). for (int i = 1; i < 1500; i++) { // Copy current digest. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[j] = digest[j]; } buffer[66] = (i >> 8) & 0xff; buffer[67] = i & 0xff; // sha256_hash(buffer, 68, digest); sha256_hash_68(buffer, digest, sha256_k); } #define AES_BLOCK_SIZE 16 // -------- AES post-processing on final digest -------- // Use final digest as the AES key. aes256_ctx key_ctx; aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon); uint8_t counter[16] = {0}; // clear counter // Encrypt the counter block. aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared); // Encrypt the (now updated) counter to produce the final output. aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared); // Write final (AES‑encrypted) digest to output. uint8_t *out = output + idx * SHA256_DIGEST_SIZE; uint32_t * out_u32 = (uint32_t *)out; uint32_t * digest_u32 = (uint32_t *)digest; //byte swap32 bit every 4 byte for (int i = 0; i < SHA256_DIGEST_SIZE / 4; i++) { out_u32[i] = __byte_perm(digest_u32[i], 0, 0x0123); } } //this is for chacha8 key, so no need to swap, and we nseed the 32 byte block __global__ void multihash_kernel_in_memory_no_swap(uint8_t *output, uint64_t t_start, size_t num) { ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num) return; __shared__ uint32_t sha256_k[64]; __shared__ unsigned char gf2_table_shared[256]; __shared__ uint8_t aes_sbox[256]; __shared__ uint32_t aes_Rcon[7]; if (threadIdx.x == 0) { memcpy(sha256_k, k, sizeof(sha256_k)); memcpy(aes_sbox, sbox, sizeof(aes_sbox)); memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon)); memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared)); } __syncthreads(); // Make sure the data is loaded before use uint64_t t = t_start + idx; // Each input message is 19 bytes. uint8_t data[19]; for (int i =0; i < 19; i++) { data[18-i]='0' + (t%10); t /= 10; } uint8_t digest[SHA256_DIGEST_SIZE]; // uint8_t v0[SHA256_DIGEST_SIZE]; // First round: hash the 19-byte input. // sha256_hash(data, 19, digest); sha256_hash_19(data, digest, sha256_k); // Save initial digest as v0. // for (int i = 0; i < SHA256_DIGEST_SIZE; i++) // { // v0[i] = digest[i]; // } uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter) // Copy v0. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[32 + j] = digest[j]; } // Write counter (big-endian). // buffer[64] = (i >> 24) & 0xff; // buffer[65] = (i >> 16) & 0xff; // max is 1500 buffer[64] = 0; buffer[65] = 0; // 1,500 rounds total (first one is done above). for (int i = 1; i < 1500; i++) { // Copy current digest. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[j] = digest[j]; } buffer[66] = (i >> 8) & 0xff; buffer[67] = i & 0xff; // sha256_hash(buffer, 68, digest); sha256_hash_68(buffer, digest, sha256_k); } #define AES_BLOCK_SIZE 16 // -------- AES post-processing on final digest -------- // Use final digest as the AES key. aes256_ctx key_ctx; aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon); uint8_t counter[16] = {0}; // clear counter // Encrypt the counter block. aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared); // Encrypt the (now updated) counter to produce the final output. aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared); // Write final (AES‑encrypted) digest to output. uint8_t *out = output + idx * SHA256_DIGEST_SIZE; #pragma unroll for (int i = 0; i < SHA256_DIGEST_SIZE; i++) { out[i] = digest[i]; } } //chacha8 key doesn't need swapping __global__ void multihash_kernel_noswap(const uint8_t *input, uint8_t *output, size_t num) { ssize_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= num) return; __shared__ uint32_t sha256_k[64]; __shared__ unsigned char gf2_table_shared[256]; __shared__ uint8_t aes_sbox[256]; __shared__ uint32_t aes_Rcon[7]; if (threadIdx.x == 0) { memcpy(sha256_k, k, sizeof(sha256_k)); memcpy(aes_sbox, sbox, sizeof(aes_sbox)); memcpy(aes_Rcon, Rcon, sizeof(aes_Rcon)); memcpy(gf2_table_shared, gf2_table, sizeof(gf2_table_shared)); } __syncthreads(); // Make sure the data is loaded before use // Each input message is 19 bytes. const uint8_t *data = input + idx * 19; uint8_t digest[SHA256_DIGEST_SIZE]; uint8_t v0[SHA256_DIGEST_SIZE]; // First round: hash the 19-byte input. // sha256_hash(data, 19, digest); sha256_hash_19(data, digest, sha256_k); // Save initial digest as v0. for (int i = 0; i < SHA256_DIGEST_SIZE; i++) { v0[i] = digest[i]; } uint8_t buffer[68]; // 32 bytes (digest) + 32 bytes (v0) + 4 bytes (counter) // Copy v0. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[32 + j] = v0[j]; } // Write counter (big-endian). // buffer[64] = (i >> 24) & 0xff; // buffer[65] = (i >> 16) & 0xff; // max is 1500 buffer[64] = 0; buffer[65] = 0; // 1,500 rounds total (first one is done above). for (int i = 1; i < 1500; i++) { // Copy current digest. for (int j = 0; j < SHA256_DIGEST_SIZE; j++) { buffer[j] = digest[j]; } buffer[66] = (i >> 8) & 0xff; buffer[67] = i & 0xff; // sha256_hash(buffer, 68, digest); sha256_hash_68(buffer, digest, sha256_k); } #define AES_BLOCK_SIZE 16 // -------- AES post-processing on final digest -------- // Use final digest as the AES key. aes256_ctx key_ctx; aes256_set_encrypt_key(digest, &key_ctx, aes_sbox, aes_Rcon); uint8_t counter[16] = {0}; // clear counter // Encrypt the counter block. aes256_encrypt(&key_ctx, counter, counter, aes_sbox, gf2_table_shared); // Encrypt the (now updated) counter to produce the final output. aes256_encrypt(&key_ctx, counter, digest, aes_sbox, gf2_table_shared); // Write final (AES‑encrypted) digest to output. uint8_t *out = output + idx * SHA256_DIGEST_SIZE; #pragma unroll for (int i = 0; i < SHA256_DIGEST_SIZE; i++) { out[i] = digest[i]; } } uint64_t get_time_in_nanosecond() { struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); return (uint64_t)time.tv_sec * 1000000000 + (uint64_t)time.tv_nsec; } #define TEST_ENCRYPT_BLOCK_SIZE 32 // Test kernel: each thread processes one key/IV pair. // this is to test raw encryption speed __global__ void test_kcipher2_kernel(const uint8_t *in, uint8_t *out, size_t N) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) return; // Each input block: first 16 bytes = Key, next 16 bytes = IV. const int in_block_size = 32; // 16-byte key + 16-byte IV const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // 128-byte ciphertext // Pointers to the key/IV for this thread. const uint8_t *block_in = in + idx * in_block_size; // Load key (16 bytes) and IV (16 bytes) into local arrays. unsigned int key_local[4]; unsigned int iv_local[4]; for (int i = 0; i < 4; i++) { int base_key = i * 4; key_local[i] = ((unsigned int)block_in[base_key] << 24) | ((unsigned int)block_in[base_key + 1] << 16) | ((unsigned int)block_in[base_key + 2] << 8) | ((unsigned int)block_in[base_key + 3]); } for (int i = 0; i < 4; i++) { int base_iv = 16 + i * 4; iv_local[i] = ((unsigned int)block_in[base_iv] << 24) | ((unsigned int)block_in[base_iv + 1] << 16) | ((unsigned int)block_in[base_iv + 2] << 8) | ((unsigned int)block_in[base_iv + 3]); } unsigned char ciphertext[8]; long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local); // copy res to ciphertext for (int i = 0; i < 8; i++) { ciphertext[i] = (res >> (56 - i * 8)) & 0xFF; } // Write the output ciphertext. size_t out_offset = idx * out_block_size; for (int i = 0; i < 8; i++) { out[out_offset + i] = ciphertext[i]; } } void fill_input(uint8_t *h_input, uint64_t start, size_t num) { char buffer[20]; snprintf(buffer, 20, "%019lu", start); for (size_t i = 0; i < num; i++) { // copy buffer to input memcpy(h_input + i * 19, buffer, 19); // increment buffer, starting from last index, until it reaches '9' for (int j = 18; j >= 0; j--) { if (buffer[j] == '9') { buffer[j] = '0'; } else { buffer[j]++; break; } } } } int test_encryption_only(size_t num) { printf("Count %zu \n", num); size_t input_size = num * TEST_ENCRYPT_BLOCK_SIZE; // Query device properties cudaDeviceProp prop; cudaGetDeviceProperties(&prop, gpuIndex); int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block int blockSize = maxThreadsPerBlock; // Dynamically set blockSize // int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); size_t output_size = num * TEST_ENCRYPT_BLOCK_SIZE; printf("GPU needed memory: %.2f MB\n", (input_size + output_size) / 1024.0 / 1024.0); uint8_t *h_input = (uint8_t *)malloc(input_size); // test the blank/zero encryption // memset(h_input, 0, input_size); // fill with random fill_input(h_input, TEST_TIMESTAMP, num * (32 / 19)); uint8_t *h_output_enc = (uint8_t *)malloc(output_size); // Allocate device memory. uint8_t *d_input, *d_output_enc; printf("Allocating device memory...%.2f Mb\n", (input_size + output_size) / 1024.0 / 1024.0); if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output_enc, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n"); return 1; } uint64_t start = get_time_in_nanosecond(); // copy to device cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); test_kcipher2_kernel<<>>(d_input, d_output_enc, num); cudaDeviceSynchronize(); // debug encryption output cudaMemcpy(h_output_enc, d_output_enc, output_size, cudaMemcpyDeviceToHost); uint64_t end = get_time_in_nanosecond(); printf("Enc Time: %f ms\n", (end - start) / 1000000.0); // print speed printf("Enc Speed: %f enc per second\n", num / ((end - start) / 1000000000.0)); // print first plaintext printf("Plaintext for first input: "); for (size_t i = 0; i < 19; i++) { printf("%c", h_input[i]); } printf("\n"); // For demonstration, print the first output digest in hexadecimal. printf("Encrypted for first input: "); for (size_t i = 0; i < TEST_ENCRYPT_BLOCK_SIZE; i++) { printf("%02x", h_output_enc[i]); } printf("\n"); // print last // print plaintext printf("Plaintext for last input: "); for (size_t i = 19 * num - 19; i < 19 * num; i++) { printf("%c", h_input[i]); } printf("\n"); printf("Encrypted for last input: offs <%zu>: ", num * TEST_ENCRYPT_BLOCK_SIZE - TEST_ENCRYPT_BLOCK_SIZE); for (size_t i = TEST_ENCRYPT_BLOCK_SIZE * num - TEST_ENCRYPT_BLOCK_SIZE; i < TEST_ENCRYPT_BLOCK_SIZE * num; i++) { printf("%02x", h_output_enc[i]); } printf("\n"); // Cleanup. cudaFree(d_input); cudaFree(d_output_enc); free(h_input); free(h_output_enc); return 0; } #define DIGEST_SIZE 32 #define KEY_IV_SIZE 16 // we use first 16 bytes for key/iv #define KCIPHER_OUT_SIZE 8 // KCIPHER2, single kernel // --- Host Code --- //for testing only, it will save all random number generated from a given second int save_random(uint64_t start_time, const char *filename) { printf("Saving all seeds for %lu to %s\n", start_time, filename); uint64_t timer_start = get_time_in_nanosecond(); //do it for every 10 million size_t num = 10*1000*1000; size_t input_size = num * 19 * sizeof(uint8_t); uint8_t *h_input = (uint8_t *)malloc(input_size); size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); uint8_t *h_output = (uint8_t *)malloc(output_size); uint8_t *h_output_non_zeroes = (uint8_t *)malloc(num * 16); // Allocate device memory. uint8_t *d_input, *d_output; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); FILE *f = fopen(filename, "wb"); if (f == NULL) { fprintf(stderr, "Error: Cannot open file %s\n", filename); return 1; } for (size_t i = 0; i < 100; i++) { printf("Saving %zu %%\r", i);fflush(stdout); fill_input(h_input, start_time + i * num, num); //copy to CUDA cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); multihash_kernel<<>>(d_input, d_output, num); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } cudaDeviceSynchronize(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); //copy every 16 bytes for (size_t j = 0; j < num; j++) { memcpy(h_output_non_zeroes + j * 16, h_output + j * DIGEST_SIZE, 16); } fwrite(h_output_non_zeroes, 16, num, f); } fclose(f); uint64_t timer_end = get_time_in_nanosecond(); printf("DONE saved to %s: total time %.2f second \n", filename, (timer_end - timer_start) / 1000000000.0); return 0; } int test_generate_random_only_in_gpu(size_t num) { uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); uint8_t *d_output; if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage uint64_t start = get_time_in_nanosecond(); multihash_kernel_in_memory<<>>(d_output, t_start, num); cudaDeviceSynchronize(); uint64_t end = get_time_in_nanosecond(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); // print first and last printf("First random "); for (int i = 0; i < DIGEST_SIZE; i++) { printf("%02x", h_output[i]); } printf("\n"); printf("Last random "); for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++) { printf("%02x", h_output[i]); } printf("\n"); printf("Total Time: %f ms\n", (end - start) / 1000000.0); printf("Speed: %f hashes per second\n", num / ((end - start) / 1000000000.0)); return 0; } int test_generate_random_only(size_t num) { printf("Test generate random only: %lu\n", num); size_t input_size = num * 19 * sizeof(uint8_t); printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0); // Allocate host memory. uint8_t *h_input = (uint8_t *)malloc(input_size); uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); uint64_t start_fill = get_time_in_nanosecond(); fill_input(h_input, t_start, num); uint64_t end_fill = get_time_in_nanosecond(); printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0); // Allocate device memory. uint8_t *d_input, *d_output; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } uint64_t start = get_time_in_nanosecond(); cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting timestamp -> random calculation...\n"); fflush(stdout); multihash_kernel<<>>(d_input, d_output, num); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); uint64_t end = get_time_in_nanosecond(); // print first and last printf("First random "); for (int i = 0; i < DIGEST_SIZE; i++) { printf("%02x", h_output[i]); } printf("\n"); printf("Last random "); for (size_t i = DIGEST_SIZE * num - DIGEST_SIZE; i < DIGEST_SIZE * num; i++) { printf("%02x", h_output[i]); } printf("\n"); printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); // Cleanup. cudaFree(d_input); cudaFree(d_output); free(h_input); free(h_output); return 0; } #define U8TO32_LITTLE(p) (*(const uint32_t *)(p)) typedef struct { uint32_t a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p; } BLOCK; // Device helper: rotate left 32-bit value __device__ __forceinline__ uint32_t _rotl(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); } #define QROUND(a, b, c, d) \ d = _rotl(d ^ (a += b), 16); \ b = _rotl(b ^ (c += d), 12); \ d = _rotl(d ^ (a += b), 8); \ b = _rotl(b ^ (c += d), 7) #define FROUND \ QROUND(x.d, x.h, x.l, x.p); \ QROUND(x.c, x.g, x.k, x.o); \ QROUND(x.b, x.f, x.j, x.n); \ QROUND(x.a, x.e, x.i, x.m); \ QROUND(x.a, x.f, x.k, x.p); \ QROUND(x.b, x.g, x.l, x.m); \ QROUND(x.c, x.h, x.i, x.n); \ QROUND(x.d, x.e, x.j, x.o) #define FFINAL \ x.a += j.a; \ x.b += j.b; \ x.c += j.c; \ x.d += j.d; \ x.e += j.e; \ x.f += j.f; \ x.g += j.g; \ x.h += j.h; \ x.i += j.i; \ x.j += j.j; \ x.k += j.k; \ x.l += j.l; \ x.m += j.m; \ x.n += j.n; \ x.o += j.o; \ x.p += j.p //in contains key/nonce //offset is the match offset //out_flag is the flag: ts << 32 | offset __global__ void chacha8_encrypt_and_match(const uint8_t *in, size_t offset, unsigned long long *out_flag, const unsigned long long value, int N) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { return; } size_t idx_offs = idx + offset; if (idx_offs >= N) { return; } const int in_block_size = 32; // Pointers to the key/IV for this thread. const uint8_t *block_in = in + idx * in_block_size; const uint8_t *block_offset_in = in + idx_offs * in_block_size; const uint8_t *k = block_in; const uint8_t *iv = block_offset_in; const char constants[16] = {'e', 'x', 'p', 'a', 'n', 'd', ' ', '1', '6', '-', 'b', 'y', 't', 'e', ' ', 'k'}; uint32_t state[16]; state[4] = U8TO32_LITTLE(k + 0); state[5] = U8TO32_LITTLE(k + 4); state[6] = U8TO32_LITTLE(k + 8); state[7] = U8TO32_LITTLE(k + 12); state[8] = U8TO32_LITTLE(k + 0); state[9] = U8TO32_LITTLE(k + 4); state[10] = U8TO32_LITTLE(k + 8); state[11] = U8TO32_LITTLE(k + 12); state[0] = U8TO32_LITTLE(constants + 0); state[1] = U8TO32_LITTLE(constants + 4); state[2] = U8TO32_LITTLE(constants + 8); state[3] = U8TO32_LITTLE(constants + 12); state[12] = 0; state[13] = 0; state[14] = U8TO32_LITTLE(iv + 0); state[15] = U8TO32_LITTLE(iv + 4); BLOCK x; BLOCK j; memcpy(&j, state, sizeof(BLOCK)); //j is for final addition j.m = 0; j.n = 0; memcpy(&x, &j, sizeof(BLOCK)); //FROUND will modify x FROUND; FROUND; FROUND; FROUND; FFINAL; uint64_t *result = (uint64_t *)&x; if (*result == value) { unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx; *out_flag = encoded_offset_and_index; } } __global__ void encrypt_and_search_offset(const uint8_t *in, size_t offset, unsigned long long *out_flag, const unsigned long long *masks, const unsigned long long *values, int numComb, int N) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { return; } size_t idx_offs = idx + offset; if (idx_offs >= N) { return; } __shared__ unsigned char gf2_table_shared[256]; __shared__ unsigned char gf3_table_shared[256]; __shared__ unsigned char d_s_box_shared[256]; __shared__ unsigned int d_amul0_shared[256]; __shared__ unsigned int d_amul1_shared[256]; __shared__ unsigned int d_amul2_shared[256]; __shared__ unsigned int d_amul3_shared[256]; __shared__ unsigned long long shared_masks[MAX_MATCHES]; __shared__ unsigned long long shared_values[MAX_MATCHES]; if (threadIdx.x == 0) { //copy gf2_table from const memcpy(gf2_table_shared, gf2_table, 256); memcpy(gf3_table_shared, gf3_table, 256); memcpy(d_s_box_shared, d_s_box, 256); memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int)); memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int)); memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int)); memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int)); for (int i = 0; i < numComb; i++) { shared_masks[i] = masks[i]; shared_values[i] = values[i]; } } __syncthreads(); // Make sure the data is loaded before use const int in_block_size = 32; // Pointers to the key/IV for this thread. const uint8_t *block_in = in + idx * in_block_size; const uint8_t *block_offset_in = in + idx_offs * in_block_size; // Load key (16 bytes) and IV (16 bytes) into local arrays. unsigned int key_local[4]; unsigned int iv_local[4]; const uint8_t *key_in = block_in; const uint8_t *iv_in = block_offset_in; memcpy(key_local, key_in, 16); memcpy(iv_local, iv_in, 16); //long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local); long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared, d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared); // compare with matches unsigned long long in_val = bswap64(res); unsigned int flag = 0; for (int i = 0; i < numComb; i++) { // diff is 0 if the masked input equals the expected value unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i]; // Compute a branch-free match: returns nonzero if diff is zero. flag |= (1 - ((diff | -diff) >> 63)); } if (flag) { //we found it, now find out exactly which one did we find for (int i = 0; i < numComb; i++) { if ((in_val & shared_masks[i]) == shared_values[i]) { //idx: max 1 billion, use 30 bits //offset: max 64 million, use 26 //numComb: max 128, use 7 unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1); *out_flag = encoded_offset_and_index; break; } } } } void decode_offset_and_index(unsigned long long encoded_offset_and_index, size_t *offset, size_t *index, size_t *matchPos) { //idx: max 1 billion, use 30 bits //offset: max 64 million, use 26 //matchPos: max 128, use 7 bits //final 1 bit to ensure we have true value //how it was encoded: //unsigned long long encoded_offset_and_index = (unsigned long long)(idx << 34 | offset << 8 | i << 1 | 1); //decode: *index = (encoded_offset_and_index >> 34) & 0x3fffffff; *offset = (encoded_offset_and_index >> 8) & 0x3ffffff; *matchPos = (encoded_offset_and_index >> 1) & 0x7f; } __global__ void encrypt_and_search(const uint8_t *in, uint8_t *out, uint32_t k1, uint32_t k2, uint32_t k3, uint32_t k4, int *out_flag, const unsigned long long *masks, const unsigned long long *values, int numComb, int N) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) return; // for (int i =0; i < 16; i++) { // printf("%02x", in[i]); // } // printf("\n"); // Each input block: first 16 bytes = Key, next 16 bytes = IV. const int in_block_size = 32; // 16-byte IV + zeroes const int out_block_size = TEST_ENCRYPT_BLOCK_SIZE; // block size // Pointers to the key/IV for this thread. const uint8_t *block_in = in + idx * in_block_size; // Load key (16 bytes) and IV (16 bytes) into local arrays. __shared__ unsigned int key_local[4]; __shared__ unsigned char gf2_table_shared[256]; __shared__ unsigned char gf3_table_shared[256]; __shared__ unsigned char d_s_box_shared[256]; __shared__ unsigned int d_amul0_shared[256]; __shared__ unsigned int d_amul1_shared[256]; __shared__ unsigned int d_amul2_shared[256]; __shared__ unsigned int d_amul3_shared[256]; __shared__ unsigned long long shared_masks[MAX_MATCHES]; __shared__ unsigned long long shared_values[MAX_MATCHES]; unsigned int iv_local[4]; if (threadIdx.x == 0) { key_local[0] = k1; key_local[1] = k2; key_local[2] = k3; key_local[3] = k4; //copy gf2_table from const memcpy(gf2_table_shared, gf2_table, 256); memcpy(gf3_table_shared, gf3_table, 256); memcpy(d_s_box_shared, d_s_box, 256); memcpy(d_amul0_shared, d_amul0, 256 * sizeof(int)); memcpy(d_amul1_shared, d_amul1, 256 * sizeof(int)); memcpy(d_amul2_shared, d_amul2, 256 * sizeof(int)); memcpy(d_amul3_shared, d_amul3, 256 * sizeof(int)); for (int i = 0; i < numComb; i++) { shared_masks[i] = masks[i]; shared_values[i] = values[i]; } } __syncthreads(); // Make sure the data is loaded before use const uint8_t *iv_in = block_in; memcpy(iv_local, iv_in,16); //long long res = kcipher2_encrypt_1_zero_block(key_local, iv_local); long long res = kcipher2_encrypt_1_zero_block_shared(key_local, iv_local, gf2_table_shared, gf3_table_shared, d_s_box_shared, d_amul0_shared, d_amul1_shared, d_amul2_shared, d_amul3_shared); // compare with matches unsigned long long in_val = bswap64(res); unsigned int flag = 0; for (int i = 0; i < numComb; i++) { if ((in_val & masks[i]) == values[i]) { *out_flag = 1; flag = 1; break; } } #if 1 #ifdef NUM_MATCHES #pragma unroll for (int i = 0; i < NUM_MATCHES; i++) { #else for (int i = 0; i < numComb; i++) { #endif // diff is 0 if the masked input equals the expected value unsigned long long diff = (in_val & shared_masks[i]) ^ shared_values[i]; // Compute a branch-free match: returns nonzero if diff is zero. flag |= (1 - ((diff | -diff) >> 63)); } if (flag) *out_flag = flag; #endif if (flag) { // Write the output ciphertext. int out_offset = idx * out_block_size; #pragma unroll for (int i = 0; i < 8; i++) // only copy changed { out[out_offset + i] = (res >> (56 - i * 8)) & 0xFF; } } } int do_bruteforce_offset(const char *filename) { printf("Bruteforce Offset\n"); json config; std::string json_file = filename; std::string checkpoint = json_file + ".checkpoint.json"; std::ifstream f(filename); if (!f.is_open()) { fprintf(stderr, "Error: Could not open config file %s\n", filename); return 1; } try { f >> config; } catch (json::parse_error &e) { fprintf(stderr, "Error parsing JSON: %s\n", e.what()); return 1; } size_t *offsets = 0; size_t offset_count = 0; //open offset.txt FILE *f_offset = fopen("offset.txt", "r"); if (f_offset == NULL) { fprintf(stderr, "Error: Could not open offset file\n"); return 1; } //read line by line, convert to integer char line[256]; while (fgets(line, sizeof(line), f_offset)) { offsets = (size_t *)realloc(offsets, (offset_count + 1) * sizeof(size_t)); offsets[offset_count] = strtoull(line, NULL, 10); offset_count++; } fclose(f_offset); printf("Offset count: %zu\n", offset_count); //print first and last offset printf("First offset: %zu\n", offsets[0]); printf("Last offset: %zu\n", offsets[offset_count - 1]); size_t num; uint64_t t_start; //size_t enc_count; size_t offset; size_t matches_size = 0; uint64_t *matches = 0; uint64_t *masks = 0; try { t_start = config["start_timestamp"].get(); // start T3 num = config["count"].get(); // stop at T3 + count ns offset = config["offset"].get(); // we start at T3 + offset ns //enc_count = config["brute_force_time_range"].get(); // we stop at T3 + offset + enc_count ns // "mathes": [ // { // "plaintext": "0x00000000", // "encrypted": "0x00000000", // "bitmask": "0xffffffff" // }, // { // "plaintext": "0x00000000", // "encrypted": "0x00000001", // "bitmask": "0xffffffff" // } // ] // parse matches, and put it in array (match = plaintext^encrypted) if (config.contains("matches")) { matches_size = config["matches"].size(); assert(matches_size < MAX_MATCHES); assert(matches_size > 0); matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); for (size_t i = 0; i < matches_size; i++) { uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get(), 0, 16); uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get(), 0, 16); uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get(), 0, 16); matches[i] = plaintext ^ encrypted; masks[i] = bitmask; printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]); } } } catch (json::exception &e) { fprintf(stderr, "Error reading JSON values: %s\n", e.what()); return 1; } printf("Configuration:\n"); printf("num: %zu\n", num); printf("t_start: %lu\n", t_start); //printf("enc_count: %zu\n", enc_count); printf("offset: %zu\n", offset); //printf("Brute forcing: %zu enc count %zu\n", num, enc_count); size_t input_size = num * 19 * sizeof(uint8_t); printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0); // Allocate host memory. uint8_t *h_input = (uint8_t *)malloc(input_size); // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); //size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t); //uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size); uint64_t start_fill = get_time_in_nanosecond(); fill_input(h_input, t_start, num); // GENERATE TS uint64_t end_fill = get_time_in_nanosecond(); printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0); // Allocate device memory. uint8_t *d_input, *d_output; //uint8_t *d_output_enc; unsigned long long *d_found; unsigned long long *d_matches; unsigned long long *d_masks; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } // if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess) // { // fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n"); // return 1; // } if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_found\n"); return 1; } if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_matches\n"); return 1; } if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_masks\n"); return 1; } unsigned long long zero = 0; cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice); // copy matches and masks cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); uint64_t start = get_time_in_nanosecond(); cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); // Launch kernel. int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting timestamp -> random calculation...\n"); multihash_kernel<<>>(d_input, d_output, num); cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); uint64_t end = get_time_in_nanosecond(); int limit = num - offset; printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); printf("Limit: %d\n", limit); start = get_time_in_nanosecond(); uint64_t start_enc = get_time_in_nanosecond(); // set gridsize based on limit gridSize = (limit + blockSize - 1) / blockSize; for (size_t i = 0; i < offset_count; i++) { printf("Starting encryption and search at offset %zu (%zu)\n", i, offsets[i]); encrypt_and_search_offset<<>>(d_output, offsets[i], d_found, d_masks, d_matches, matches_size, limit); //check for error cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } if (i > 0 && (i % 1) == 0) { uint64_t end_enc = get_time_in_nanosecond(); printf("Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0); start_enc = get_time_in_nanosecond(); printf("Progress: %zu/%zu (testing: %zu)\n", i, offset_count, offsets[i]); cudaDeviceSynchronize(); unsigned long long found = 0; cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost); if (found) { //unsigned long long encoded_offset_and_index = (unsigned long long)offset << 31 | idx << 1 | 1; size_t t_offset, t_index, match_pos; decode_offset_and_index(found, &t_offset, &t_index, &match_pos); printf("Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos); //write to output.txt FILE *f = fopen("output.txt", "a"); if (f == NULL) { printf("Error opening file!\n"); exit(1); } fprintf(f, "Found at index %zu offset=%zu ts = %zu + %zu filename = %s match_pos = %zu\n", i, t_offset, t_start, t_index, filename, match_pos); fclose(f); // reset for next cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice); // err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err)); // return 1; // } } //write checkpoint config["start_timestamp"] = t_start + i; config["index"] = i; std::ofstream o(checkpoint); o << std::setw(4) << config << std::endl; } } cudaDeviceSynchronize(); long long found = 0; cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } if (found) { size_t t_offset, t_index, match_pos; decode_offset_and_index(found, &t_offset, &t_index, &match_pos); printf("Found at offset=%zu ts = %zu + %zu filename = %s match indxex = %zu\n", t_offset, t_start, t_index, filename, match_pos); } uint64_t end_enc = get_time_in_nanosecond(); printf("Enc Offset list Time: %f ms\n", (end_enc - start_enc) / 1000000.0); return 0; } int do_bruteforce_new(const char *filename) { json config; std::string json_file = filename; std::string checkpoint = json_file + ".checkpoint.json"; std::ifstream f(filename); if (!f.is_open()) { fprintf(stderr, "Error: Could not open config file %s\n", filename); return 1; } try { f >> config; } catch (json::parse_error &e) { fprintf(stderr, "Error parsing JSON: %s\n", e.what()); return 1; } size_t num; uint64_t t_start; size_t enc_count; size_t offset; size_t matches_size = 0; uint64_t *matches = 0; uint64_t *masks = 0; char **matches_filename = 0; try { t_start = config["start_timestamp"].get(); // start T3 num = config["count"].get(); // stop at T3 + count ns offset = config["offset"].get(); // we start at T3 + offset ns enc_count = config["brute_force_time_range"].get(); // we stop at T3 + offset + enc_count ns // parse matches, and put it in array (match = plaintext^encrypted) if (config.contains("matches")) { matches_size = config["matches"].size(); assert(matches_size < MAX_MATCHES); assert(matches_size > 0); matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); matches_filename = (char **)malloc(matches_size * sizeof(char *)); for (size_t i = 0; i < matches_size; i++) { uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get(), 0, 16); uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get(), 0, 16); uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get(), 0, 16); std::string filename = config["matches"][i]["filename"].get(); //strdup filename matches_filename[i] = strdup(filename.c_str()); matches[i] = plaintext ^ encrypted; masks[i] = bitmask; printf("Match %zu: %016lx bitmask %016lx \n", i, matches[i], masks[i]); } } } catch (json::exception &e) { fprintf(stderr, "Error reading JSON values: %s\n", e.what()); return 1; } printf("Configuration:\n"); printf("num: %zu\n", num); printf("t_start: %lu\n", t_start); printf("t_end %lu\n", t_start + num); printf("enc_count: %zu\n", enc_count); printf("offset: %zu\n", offset); printf("Brute forcing: %zu enc count %zu\n", num, enc_count); // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("GPU RAM for random %.2f MB\n", output_size / 1024.0 / 1024.0); // Allocate device memory. uint8_t *d_output; unsigned long long *d_found; unsigned long long *d_matches; unsigned long long *d_masks; if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } if (cudaMalloc(&d_found, sizeof(unsigned long long )) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_found\n"); return 1; } if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_matches\n"); return 1; } if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_masks\n"); return 1; } unsigned long long zero = 0; cudaMemcpy(d_found, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice); // copy matches and masks cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); uint64_t start = get_time_in_nanosecond(); // Launch kernel. int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting timestamp -> random calculation...: %zu nanoseconds\n", num); multihash_kernel_in_memory<<>>(d_output, t_start, num); cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); uint64_t end = get_time_in_nanosecond(); int limit = num - enc_count - offset; printf("Total Time to translate timestamp to random %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); printf("Limit: %d\n", limit); start = get_time_in_nanosecond(); uint64_t start_enc = get_time_in_nanosecond(); // set gridsize based on limit gridSize = (limit + blockSize - 1) / blockSize; uint64_t elapsed = 0; for (size_t i = 0; i < enc_count; i++) { // printf("Starting encryption and search at offset %d\n", i); encrypt_and_search_offset<<>>(d_output, offset + i, d_found, d_masks, d_matches, matches_size, limit); // print error // cudaError_t err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); // return 1; // } #define SKIP_CHECK 100 if (i > 0 && (i % SKIP_CHECK) == 0) { uint64_t end_enc = get_time_in_nanosecond(); printf("Enc Time (%zu): %f ms\n", i, (end_enc - start_enc) / 1000000.0); elapsed += (end_enc - start_enc); printf("%.2f minutes elapsed, ", elapsed / 1000000000.0 / 60.0); //remaining double remaining = (enc_count - i) * ((end_enc - start_enc)/SKIP_CHECK); printf("%.2f minutes remaining (%.2f hours). ", remaining / 1000000000.0 / 60.0, remaining / 1000000000.0 / 3600.0); start_enc = get_time_in_nanosecond(); printf("Progress: %zu/%zu (testing: %zu)\n", i, enc_count, offset + i); cudaDeviceSynchronize(); // //print error // err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err)); // return 1; // } unsigned long long found = 0; cudaMemcpy(&found, d_found, sizeof(unsigned long long ), cudaMemcpyDeviceToHost); // err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err)); // return 1; // } if (found) { size_t t_offset, t_index, match_pos; decode_offset_and_index(found, &t_offset, &t_index, &match_pos); printf("Found at offset=%zu ts = %zu + %zu config_file = %s match_index %zu, file : %s\n", t_offset, t_start, t_index, filename, match_pos, matches_filename[match_pos]); printf("Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos, matches_filename[match_pos]); //write to output.txt FILE *f = fopen("output.txt", "a"); if (f == NULL) { printf("Error opening file!\n"); exit(1); } fprintf(f, "Found at offset %zu found = %llu ts = %zu file = %s, match_index = %zu file: %s\n", t_offset, found, t_start + t_index, filename, match_pos, matches_filename[match_pos]); fclose(f); // reset for next cudaMemcpy(d_found, &zero, sizeof(unsigned long long ), cudaMemcpyHostToDevice); // err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error2: %s\n", cudaGetErrorString(err)); // return 1; // } } //write checkpoint config["offset"] = offset + i; config["index"] = i; std::ofstream o(checkpoint); o << std::setw(4) << config << std::endl; } } cudaDeviceSynchronize(); // //print error // err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error0: %s\n", cudaGetErrorString(err)); // return 1; // } unsigned long long found = 0; cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error1: %s\n", cudaGetErrorString(err)); return 1; } if (found) { size_t t_offset, t_index, match_pos; decode_offset_and_index(found, &t_offset, &t_index, &match_pos); printf("Found at offset=%zu ts = %zu + %zu filename = %s\n", t_offset, t_start, t_index, filename); } uint64_t end_enc = get_time_in_nanosecond(); printf("Offset Enc Time: %f ms\n", (end_enc - start_enc) / 1000000.0); return 0; } int do_bruteforce(const char *filename) { json config; std::string json_file = filename; std::string checkpoint = json_file + "checkpoint.json"; std::ifstream f(filename); if (!f.is_open()) { fprintf(stderr, "Error: Could not open config file %s\n", filename); return 1; } try { f >> config; } catch (json::parse_error &e) { fprintf(stderr, "Error parsing JSON: %s\n", e.what()); return 1; } size_t num; uint64_t t_start; size_t enc_count; size_t offset; size_t matches_size = 0; uint64_t *matches = 0; uint64_t *masks = 0; try { t_start = config["start_timestamp"].get(); // start T3 num = config["count"].get(); // stop at T3 + count ns offset = config["offset"].get(); // we start at T3 + offset ns enc_count = config["brute_force_time_range"].get(); // we stop at T3 + offset + enc_count ns // parse matches, and put it in array (match = plaintext^encrypted) if (config.contains("matches")) { matches_size = config["matches"].size(); matches = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); masks = (uint64_t *)malloc(matches_size * sizeof(uint64_t)); for (size_t i = 0; i < matches_size; i++) { uint64_t plaintext = std::stoull(config["matches"][i]["plaintext"].get(), 0, 16); uint64_t encrypted = std::stoull(config["matches"][i]["encrypted"].get(), 0, 16); uint64_t bitmask = std::stoull(config["matches"][i]["bitmask"].get(), 0, 16); matches[i] = plaintext ^ encrypted; printf("Match %zu: %016lx\n", i, matches[i]); masks[i] = bitmask; printf("Mask %zu: %016lx\n", i, masks[i]); } } } catch (json::exception &e) { fprintf(stderr, "Error reading JSON values: %s\n", e.what()); return 1; } printf("Configuration:\n"); printf("num: %zu\n", num); printf("t_start: %lu\n", t_start); printf("enc_count: %zu\n", enc_count); printf("offset: %zu\n", offset); printf("Brute forcing: %zu enc count %zu\n", num, enc_count); size_t input_size = num * 19 * sizeof(uint8_t); printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0); // Allocate host memory. uint8_t *h_input = (uint8_t *)malloc(input_size); // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); size_t enc_output_size = enc_count * TEST_ENCRYPT_BLOCK_SIZE * sizeof(uint8_t); uint8_t *h_output_enc = (uint8_t *)malloc(enc_output_size); uint64_t start_fill = get_time_in_nanosecond(); fill_input(h_input, t_start, num); // GENERATE TS uint64_t end_fill = get_time_in_nanosecond(); printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0); // Allocate device memory. uint8_t *d_input, *d_output, *d_output_enc; int *d_found; unsigned long long *d_matches; unsigned long long *d_masks; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } if (cudaMalloc(&d_output_enc, enc_output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output_enc\n"); return 1; } if (cudaMalloc(&d_found, sizeof(int)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_found\n"); return 1; } if (cudaMalloc(&d_matches, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_matches\n"); return 1; } if (cudaMalloc(&d_masks, matches_size * sizeof(uint64_t)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_masks\n"); return 1; } long zero = 0; cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice); // copy matches and masks cudaMemcpy(d_matches, matches, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); cudaMemcpy(d_masks, masks, matches_size * sizeof(uint64_t), cudaMemcpyHostToDevice); uint64_t start = get_time_in_nanosecond(); cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); // Launch kernel. int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting timestamp -> random calculation...\n"); multihash_kernel<<>>(d_input, d_output, num); cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); uint64_t end = get_time_in_nanosecond(); // //print first hash printf("First hash: "); for (int i = 0; i < 16; i++) { printf("%02x", h_output[i]); } printf("\n"); // //print 2nd hash printf("+offset hash: "); for (int i = 0; i < 16; i++) { printf("%02x", h_output[SHA256_DIGEST_SIZE * offset + i]); } printf("\n"); printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); size_t limit = num - enc_count - offset; printf("Limit: %zu\n", limit); start = get_time_in_nanosecond(); int gridSizeEnc = (enc_count + blockSize - 1) / blockSize; cudaDeviceProp prop; cudaGetDeviceProperties(&prop, gpuIndex); int maxThreadsPerBlock = prop.maxThreadsPerBlock; // Maximum allowed per block blockSize = maxThreadsPerBlock; // Dynamically set blockSize gridSizeEnc = (enc_count + blockSize - 1) / blockSize; // Ensure full coverage printf("Encryption Using blockSize = %d, gridSize = %d\n", blockSize, gridSizeEnc); uint64_t start_enc = get_time_in_nanosecond(); for (size_t i = 0; i < limit; i++) { // launch test_kcipher2_kernel_single uint32_t *host_i32 = (uint32_t *)(h_output + i * 32); uint32_t k1 = host_i32[0]; uint32_t k2 = host_i32[1]; uint32_t k3 = host_i32[2]; uint32_t k4 = host_i32[3]; if (i > 0 && i % 10000 == 0) { uint64_t end_enc = get_time_in_nanosecond(); printf("10000 ns Time: %f ms\n", (end_enc - start_enc) / 1000000.0); printf("Speed for 10000 ns: %f enc per second\n", 10000 / ((end_enc - start_enc) / 1000000000.0)); start_enc = get_time_in_nanosecond(); printf("Processing %zu: T3 = %lu\n", i, t_start + i); fflush(stdout); // printf("Current key: %08x %08x %08x %08x\n", k1, k2, k3, k4); } if ((i % 100000) == 0) { // create checkpoint // modify json with current time config["start_timestamp"] = t_start + i; std::ofstream o(checkpoint); o << std::setw(4) << config << std::endl; } encrypt_and_search<<>>(d_output + (offset + i) * 32, d_output_enc, k1, k2, k3, k4, d_found, d_masks, d_matches, matches_size, enc_count); cudaDeviceSynchronize(); // copy match int found; cudaMemcpy(&found, d_found, sizeof(int), cudaMemcpyDeviceToHost); if (found) { // only copy the large block if we found a match cudaMemcpy(h_output_enc, d_output_enc, enc_output_size, cudaMemcpyDeviceToHost); // reset found flag for next match cudaMemcpy(d_found, &zero, sizeof(int), cudaMemcpyHostToDevice); // reset d_output_enc cudaMemset(d_output_enc, 0, enc_output_size); uint64_t *host_i64 = (uint64_t *)(h_output_enc); for (size_t j = 0; j < enc_count; j++) { for (size_t k = 0; k < matches_size; k++) { if ((*host_i64 & masks[k]) == matches[k]) { printf("Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j); // open file and write to it FILE *out = fopen("output.txt", "a"); if (out != NULL) { fprintf(out, "Found Match %zu target %zu T3 = %lu T4 = %lu offset=%lu\n", k, j, t_start + i, (t_start + i) + offset + j, offset + j); fclose(out); } } } host_i64 += 4; } } } end = get_time_in_nanosecond(); printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f enc per second\n", enc_count / ((end - start) / 1000000000.0)); // Cleanup. cudaFree(d_input); cudaFree(d_output); free(h_input); free(h_output); return 0; } //this will leak fd, but we don't care uint8_t *load_random(const char *filename, size_t *size) { //mmap the file int fd; struct stat sb; uint8_t *addr; fd = open(filename, O_RDONLY); if (fd == -1) { perror("open"); return 0; } //get file size using stat if (fstat(fd, &sb) == -1) { close(fd); perror("fstat"); return 0; } *size = sb.st_size; addr = (uint8_t *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) { close(fd); perror("mmap"); return 0; } return addr; } uint8_t *hex_to_bytes(const char *search_hex) { if (strlen(search_hex)!=32) { printf("Search hex must be 32 bytes hex characters\n"); return 0; } uint8_t * search = (uint8_t *)malloc(16); //convert search_hex (32 hex character, no space) to search for (int i = 0; i < 16; i++) { char tmp[3]; tmp[0] = search_hex[i * 2]; tmp[1] = search_hex[i * 2 + 1]; tmp[2] = 0; search[i] = strtol(tmp, NULL, 16); } printf("Searching: "); //hexdump to verify for (int i = 0; i < 16; i++) { printf("%02x", search[i]); } printf("\n"); //for every 4 byte do: __builtin_bswap32 uint32_t *search_i32 = (uint32_t *)search; for (int i = 0; i < 4; i++) { search_i32[i] = __builtin_bswap32(search_i32[i]); } return search; } int search_random(uint8_t *addr, size_t size, const char *search_hex) { if (!addr || !search_hex) { return -1; } uint8_t *search = hex_to_bytes(search_hex); if (!search) { return -1; } //now search the file using memmem void * pos = memmem(addr, size, search, 16); int found_pos = 1; if (pos ) { printf("Found at index %ld\n", ((uint8_t *)pos - addr)/16); found_pos = ((uint8_t *)pos - addr)/16; } else { printf("Not found\n"); } return found_pos; } //hash computation in GPU, chacha8 in CPU int test_chacha8_speed(size_t num) { printf("Test random + Chacha8 : %lu\n", num); size_t input_size = num * 19 * sizeof(uint8_t); printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0); // Allocate host memory. uint8_t *h_input = (uint8_t *)malloc(input_size); uint64_t t_start = TEST_TIMESTAMP + 2000; // TEST: 0 is for chacha, +1000 for chacha_nonce // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); uint64_t start_fill = get_time_in_nanosecond(); fill_input(h_input, t_start, num); uint64_t end_fill = get_time_in_nanosecond(); printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0); // Allocate device memory. uint8_t *d_input, *d_output; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } uint64_t start = get_time_in_nanosecond(); cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting timestamp -> random calculation...\n"); fflush(stdout); multihash_kernel<<>>(d_input, d_output, num); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); uint64_t end = get_time_in_nanosecond(); printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); // Cleanup. cudaFree(d_input); cudaFree(d_output); uint8_t input[64]; uint8_t output[64]; memset(input, 0, 64); start = get_time_in_nanosecond(); for (size_t i = 0; i < num; i++) { uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE; uint8_t *iv = h_output + i * SHA256_DIGEST_SIZE; chacha8_ctx ctx; chacha8_keysetup(&ctx, seed, iv); chacha8_get_keystream_oneblock(&ctx, output); } end = get_time_in_nanosecond(); printf("Enc Total Time: %f ms\n", (end - start) / 1000000.0); //speed printf("Speed: %f enc per second\n", num / ((end - start) / 1000000000.0)); return 0; } void hexdump(const char *title, const uint8_t *data, size_t size) { printf("%s: ", title); const uint8_t *p = data; for (size_t i = 0; i < size; i++) { printf("%02x ", p[i]); } printf("\n"); } int bruteforce_chacha(const char *filename) { std::string json_file = filename; std::string checkpoint = json_file + "checkpoint.json"; uint64_t t3_ts; uint64_t t3_t1_offset; uint64_t t1_t2_start_offset; uint64_t t1_t2_end_offset; uint64_t encrypted, plaintext, value; //parse JSON std::ifstream f(filename); if (!f.is_open()) { fprintf(stderr, "Error: Could not open config file %s\n", filename); return 1; } json config; try { f >> config; t3_ts = config["t3_ts"].get(); //start of kcipher2 t3_t1_offset = config["t3_t1_offset"].get(); //how far from T3 do we want to start our timestamp t1_t2_start_offset = config["t1_t2_start_offset"].get(); //start offset to test for t1-t2 t1_t2_end_offset = config["t1_t2_end_offset"].get(); //end offset to test for t1-t2 encrypted = std::stoull(config["encrypted"].get(), 0, 16); printf("Encrypted: %016lx\n", encrypted); plaintext = std::stoull(config["plaintext"].get(), 0, 16); printf("Plaintext: %016lx\n", plaintext); value = encrypted ^ plaintext; printf("Value: %016lx\n", value); } catch (json::parse_error &e) { fprintf(stderr, "Error parsing JSON: %s\n", e.what()); return 1; } size_t num = t3_t1_offset + t1_t2_end_offset; size_t input_size = num * 19 * sizeof(uint8_t); printf("Input size: %.2f MB\n", input_size / 1024.0 / 1024.0); // Allocate host memory. uint8_t *h_input = (uint8_t *)malloc(input_size); uint64_t t_start = t3_ts - t3_t1_offset; printf("t_start: %lu\n", t_start); uint64_t t_end = t_start + num; printf("t_end: %lu\n", t_end); // output size_t output_size = num * SHA256_DIGEST_SIZE * sizeof(uint8_t); printf("Output size: %.2f MB\n", output_size / 1024.0 / 1024.0); uint8_t *h_output = (uint8_t *)malloc(output_size); uint64_t start_fill = get_time_in_nanosecond(); fill_input(h_input, t_start, num); uint64_t end_fill = get_time_in_nanosecond(); printf("Fill Time: %f ms\n", (end_fill - start_fill) / 1000000.0); // Allocate device memory. uint8_t *d_input, *d_output; if (cudaMalloc(&d_input, input_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_input\n"); return 1; } if (cudaMalloc(&d_output, output_size) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_output\n"); return 1; } uint64_t start = get_time_in_nanosecond(); cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); int blockSize = 256; int gridSize = (num + blockSize - 1) / blockSize; // Ensure full coverage printf("Using blockSize = %d, gridSize = %d\n", blockSize, gridSize); printf("Starting seed calculation...\n"); fflush(stdout); multihash_kernel_noswap<<>>(d_input, d_output, num); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); return 1; } cudaDeviceSynchronize(); uint64_t end1 = get_time_in_nanosecond(); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); uint64_t end = get_time_in_nanosecond(); printf("Total Time: %f ms\n", (end - start) / 1000000.0); // print speed per second printf("Speed: %f hashes per second\n", num / ((end1 - start) / 1000000000.0)); // Cleanup. cudaFree(d_input); size_t num_loop = t3_t1_offset; blockSize = 256; gridSize = (num_loop + blockSize - 1) / blockSize; // Ensure full coverage unsigned long long *d_found; if (cudaMalloc(&d_found, sizeof(unsigned long long)) != cudaSuccess) { fprintf(stderr, "Error: cudaMalloc failed for d_found\n"); return 1; } start = get_time_in_nanosecond(); for (size_t offset = t1_t2_start_offset; offset < t1_t2_end_offset; offset++) { size_t n = offset - t1_t2_start_offset; if ((n % 500)==0) { end1 = get_time_in_nanosecond(); printf("Time: %f ms\n", (end1 - start) / 1000000.0); double offset_per_second = n / ((end1 - start) / 1000000000.0); printf("Speed: %f offsets per second\n", offset_per_second); printf("Processing offset %zu elapsed %f minutes\n", offset, (end1 - start) / 1000000000.0 / 60.0); //remaining double remaining = (t1_t2_end_offset - offset) / offset_per_second; printf("Remaining: %zu minutes\n", (size_t)(remaining / 60.0)); } #if 0 printf("ts = %zu\n", t_start); printf("Tp Offset: %zu ts = %zu\n", offset, t_start + offset); uint8_t output[64]; uint64_t *val = (uint64_t *)output; for (size_t i = 0; i < num_loop; i++ ) { uint8_t *seed = h_output + i * SHA256_DIGEST_SIZE; //hexdump("Seed", seed, 32); uint8_t *iv = h_output + (i + offset) * SHA256_DIGEST_SIZE; //hexdump("IV", iv, 16); chacha8_ctx ctx; chacha8_keysetup(&ctx, seed, iv); chacha8_get_keystream_oneblock(&ctx, output); //hexdump output hexdump("Output", output, 64); if (*val == value) { printf("Found at time %zu offset = %zu\n", i, offset); //save to file output.txt FILE *out = fopen("output.txt", "a"); if (out != NULL) { fprintf(out, "Found at time %zu offset = %zu\n", i, offset); fclose(out); } return 0; } //break; } //break; #else //test using GPU chacha8_encrypt_and_match<<>>(d_output, offset, d_found, value, num_loop); // //check for errors // cudaError_t err = cudaGetLastError(); // if (err != cudaSuccess) // { // fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); // return 1; // } //sync if (n % 1000 == 0) { cudaDeviceSynchronize(); unsigned long long found = 0; cudaMemcpy(&found, d_found, sizeof(unsigned long long), cudaMemcpyDeviceToHost); if (found) { // unsigned long long encoded_offset_and_index = (unsigned long long)offset << 32 | idx; size_t t_offset, t_index; t_offset = found >> 32; t_index = found & 0xffffffff; printf("Found at offset %zu index %zu\n", t_offset, t_index + t_start); //write to file FILE *out = fopen("output.txt", "a"); if (out != NULL) { fprintf(out, "Found at offset %zu index %zu\n", t_offset, t_index + t_start); fclose(out); } return 1; } } #endif } return 0; } int main(int argc, char *argv[]) { size_t num = 128 * 1000; int smCount; cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, gpuIndex); printf("SM count: %d\n", smCount); if (argc > 1) { if (strcmp(argv[1], "random") == 0) //test generate random from timestamp { if (argc > 2) { num = atoll(argv[2]); } return test_generate_random_only(num); } if (strcmp(argv[1], "random-gpu") == 0) //test generate random from timestamp, using GPU { if (argc > 2) { num = atoll(argv[2]); } return test_generate_random_only_in_gpu(num); } if (strcmp(argv[1], "saverandom") == 0) { uint64_t start_time = get_time_in_nanosecond(); if (argc > 1) { start_time = atoll(argv[2]); char tmp[16]; snprintf(tmp, sizeof(tmp), "%lu", start_time); if (strlen(tmp)<19) { start_time = start_time * 1000000000; } } char filename[256]; snprintf(filename, 256, "random_%lu.bin", start_time); return save_random(start_time, filename); } if (strcmp(argv[1], "search") == 0) { if (argc > 3) { const char *filename = argv[2]; const char *search_hex = argv[3]; size_t size; uint8_t *addr = load_random(filename, &size); if (addr) { int res = search_random(addr, size, search_hex); if (argc > 4) { const char *search_hex2 = argv[4]; int res2 = search_random(addr, size, search_hex2); printf("Diff: %d\n", res2-res); } } } else { printf("usage: search random.bin 16_byte_hex_sequece_no_space\n"); } } if (strcmp(argv[1], "enc") == 0) //test encryption only { if (argc > 2) { num = atoll(argv[2]); } return test_encryption_only(num); } if (strcmp(argv[1], "chacha8") == 0) //test chacha8 only { if (argc > 2) { num = atoll(argv[2]); } return test_chacha8_speed(num); } if (strcmp(argv[1], "runchacha") == 0 || strcmp(argv[1], "runchacha8") == 0) //run chacha8 bruteforce { if (argc > 2) { if (argc > 3) { gpuIndex = atoi(argv[3]); } int deviceCount; cudaGetDeviceCount(&deviceCount); if (gpuIndex < 0 || gpuIndex >= deviceCount) { printf("Invalid GPU index %d\n", gpuIndex); return EXIT_FAILURE; } cudaSetDevice(gpuIndex); printf("Using GPU %d\n", gpuIndex); return bruteforce_chacha(argv[2]); } else { printf("Usage: runchacha config.json\n"); } } //main bruteforce loop if (strcmp(argv[1], "run") == 0 || strcmp(argv[1], "run2")==0 || strcmp(argv[1], "run3") == 0) { // read config from JSON file if (argc > 2) { if (argc > 3) { gpuIndex = atoi(argv[3]); } int deviceCount; cudaGetDeviceCount(&deviceCount); if (gpuIndex < 0 || gpuIndex >= deviceCount) { printf("Invalid GPU index %d\n", gpuIndex); return EXIT_FAILURE; } cudaSetDevice(gpuIndex); printf("Using GPU %d\n", gpuIndex); if (strcmp(argv[1], "run") == 0) { return do_bruteforce(argv[2]); //this is the slow method } else if (strcmp(argv[1], "run2") == 0) { return do_bruteforce_new(argv[2]); //this is the faster method } else if (strcmp(argv[1], "run3") == 0) { return do_bruteforce_offset(argv[2]); //this will read values from offset.txt instead of using ranges } } printf("Please specify the JSON config file\n"); } } return 0; }