Any help would be much appreciated. I think it is a mem allocation issue/something 2 do with the 2d arrays maybe? and the blockIdx/threadIdx stuff isn’t optimized yet.
//wrapper
BYTE h_buffer[4][4];
BYTE h_plain[4][4];
int h_foundIdx = -1;
BYTE h_baseKey[KEY_SIZE] = {0};
BYTE *d_buffer = 0;
BYTE *d_plain = 0;
int *d_foundIdx = 0;
BYTE *d_baseKey = 0;
for(int i=0; i < 4; i++)
{
for(int j=0; j < 4; j++)
{
h_buffer[j][i] = cryptCtx->inputData[i*4 + j];
}
}
for(int i=0; i < 4; i++)
{
for(int j=0; j < 4; j++)
{
h_plain[j][i] = cryptCtx->extData[i*4 + j];
}
}
CUDA_SAFE_CALL(cudaMalloc((void**) &d_buffer, 4*4*sizeof(unsigned char)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_plain, 4*4*sizeof(unsigned char)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_foundIdx, sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_baseKey, KEY_SIZE*sizeof(unsigned char)));
CUDA_SAFE_CALL(cudaMemcpy(d_buffer, h_buffer, 4*4*sizeof(unsigned char), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_plain, h_plain, 4*4*sizeof(unsigned char), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_foundIdx, &h_foundIdx, sizeof(int), cudaMemcpyHostToDevice));
dim3 blockSize(192,1,1);//(384,1,1);
dim3 gridSize(1,1);//(32,1);
int threadCount = blockSize.x * gridSize.x;
long updateCount = 0;
unsigned int timer;
cutCreateTimer(&timer);
cutResetTimer(timer);
while(1)
{
cutStartTimer(timer);
CUDA_SAFE_CALL(cudaMemcpy(d_baseKey, h_baseKey, KEY_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice));
aesBruteForceKernel<<<gridSize, blockSize>>>(
d_buffer,
d_plain,
cryptCtx->numRounds,
d_foundIdx,
d_baseKey
);
CUT_CHECK_ERROR( "Kernel execution failed" );
CUDA_SAFE_CALL(cudaMemcpy(&h_foundIdx, d_foundIdx, sizeof(int), cudaMemcpyDeviceToHost));
if(h_foundIdx != -1) // mean we found the result
{
fixKeyWithCarry(h_baseKey, h_foundIdx);
break;
}
else
{
int carry = threadCount;
fixKeyWithCarry(h_baseKey, carry);
}
cutStopTimer(timer);
}
//kernel - parts that matter
__device__ void keyExpansion(unsigned char key[32], unsigned char roundKey[240], int numRounds);
__device__ void addRoundKey(unsigned char state[16], unsigned char roundKey[240], int round);
__device__ void invShiftRows(unsigned char state[16]);
__device__ void invSubBytes(unsigned char state[16]);
__device__ void invMixColumns(unsigned char state[16]);
__global__ void aesBruteForceKernel(
unsigned char *g_buffer,
unsigned char *g_plain,
int numRounds,
int *g_foundIdx,
unsigned char *g_baseKey
)
{
__shared__ unsigned char s_plain[16];
__shared__ unsigned char s_stateVal[384][16];
unsigned char t_roundKey[240], t_key[32];
int t_tidx, t_round, t_idx, t_tempEqual, t_carry;
t_tidx = blockIdx.x * blockDim.x + threadIdx.x;
if(threadIdx.x == 0)
{
for(t_idx=0; t_idx < 16; ++t_idx)
{
s_plain[t_idx] = g_plain[t_idx];
}
}
__syncthreads();
for (t_idx=0; t_idx < 32; t_idx++)
{
t_key[t_idx] = g_baseKey[t_idx];
__syncthreads();
}
t_carry = t_tidx;
t_carry = (int)t_key[0] + t_carry;
for (t_idx=0; t_idx < 32; t_idx++)
{
t_key[t_idx] = (char)t_carry;
t_carry = (int)t_key[t_idx+1] + (t_carry >> 8); // div 256;
}
__syncthreads();
for(t_idx=0; t_idx < 16; ++t_idx)
{
s_stateVal[threadIdx.x][t_idx] = g_buffer[t_idx];
}
keyExpansion(t_key, t_roundKey, numRounds);
addRoundKey(s_stateVal[threadIdx.x], t_roundKey, numRounds);
for(t_round=numRounds-1; t_round>0; t_round--)
{
invShiftRows(s_stateVal[threadIdx.x]);
invSubBytes(s_stateVal[threadIdx.x]);
addRoundKey(s_stateVal[threadIdx.x], t_roundKey, t_round);
invMixColumns(s_stateVal[threadIdx.x]);
}
invShiftRows(s_stateVal[threadIdx.x]);
invSubBytes(s_stateVal[threadIdx.x]);
addRoundKey(s_stateVal[threadIdx.x], t_roundKey, 0);
//Part that isn't being tripped
t_tempEqual = 1;
for(t_idx=0; t_idx < 16; ++t_idx)
{
if(s_stateVal[threadIdx.x][t_idx] != s_plain[t_idx])
{
t_tempEqual = 0;
}
}
if(t_tempEqual == 1)
{
*g_foundIdx = t_tidx;
}
__syncthreads();
}
I can give full/commentented versions if needed. XP, VS2005, c++. the code works in emuDebug mode, but never throws the flag “g_foundIdx” in debug mode (8800GT and Tesla, different machines). It passes the correct key and continues sudo infinate loop in while loop in wrapper. anybody have any ideas what the difference in modes would do. CUT_CHECK_ERROR never reports an issue. Any comments about optimizations are welcome, but my main concern is to get it working.
Thanx in advance for any comments.