emuDebug works, debug/release doesn't AES BF attack 4 school project

Any help would be much appreciated. I think it is a mem allocation issue/something 2 do with the 2d arrays maybe? and the blockIdx/threadIdx stuff isn’t optimized yet.

//wrapper

BYTE h_buffer[4][4];

BYTE h_plain[4][4];

int h_foundIdx = -1;

BYTE h_baseKey[KEY_SIZE] = {0};

BYTE *d_buffer = 0;

BYTE *d_plain = 0;

int *d_foundIdx = 0;

BYTE *d_baseKey = 0;

for(int i=0; i < 4; i++)

{

	for(int j=0; j < 4; j++)

	{

  h_buffer[j][i] = cryptCtx->inputData[i*4 + j];

	}

}

for(int i=0; i < 4; i++)

{

	for(int j=0; j < 4; j++)

	{

  h_plain[j][i] = cryptCtx->extData[i*4 + j];

	}

}

CUDA_SAFE_CALL(cudaMalloc((void**) &d_buffer, 4*4*sizeof(unsigned char)));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_plain, 4*4*sizeof(unsigned char)));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_foundIdx, sizeof(int)));

CUDA_SAFE_CALL(cudaMalloc((void**) &d_baseKey, KEY_SIZE*sizeof(unsigned char)));

CUDA_SAFE_CALL(cudaMemcpy(d_buffer, h_buffer, 4*4*sizeof(unsigned char), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_plain, h_plain, 4*4*sizeof(unsigned char), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(d_foundIdx, &h_foundIdx, sizeof(int), cudaMemcpyHostToDevice));

dim3 blockSize(192,1,1);//(384,1,1);

dim3 gridSize(1,1);//(32,1);

int threadCount = blockSize.x * gridSize.x;

long updateCount = 0;

unsigned int timer;

cutCreateTimer(&timer);

cutResetTimer(timer);

while(1)

	{

  cutStartTimer(timer);

  CUDA_SAFE_CALL(cudaMemcpy(d_baseKey, h_baseKey, KEY_SIZE*sizeof(unsigned char), cudaMemcpyHostToDevice));

  aesBruteForceKernel<<<gridSize, blockSize>>>(

  	d_buffer, 

  	d_plain, 

  	cryptCtx->numRounds,

  	d_foundIdx,

  	d_baseKey

  );

  CUT_CHECK_ERROR( "Kernel execution failed" ); 

  CUDA_SAFE_CALL(cudaMemcpy(&h_foundIdx, d_foundIdx, sizeof(int), cudaMemcpyDeviceToHost));

  

  if(h_foundIdx != -1) // mean we found the result

  {

  	fixKeyWithCarry(h_baseKey, h_foundIdx);

  	break;

  }

  else

  {

  	int carry = threadCount;

  	fixKeyWithCarry(h_baseKey, carry);	

  }

  

  cutStopTimer(timer);

	}

//kernel - parts that matter

__device__ void keyExpansion(unsigned char key[32], unsigned char roundKey[240], int numRounds);

__device__ void addRoundKey(unsigned char state[16], unsigned char roundKey[240], int round);

__device__ void invShiftRows(unsigned char state[16]);

__device__ void invSubBytes(unsigned char state[16]);

__device__ void invMixColumns(unsigned char state[16]);

__global__ void aesBruteForceKernel(

	unsigned char *g_buffer, 

	unsigned char *g_plain,

	int numRounds,

	int *g_foundIdx,

	unsigned char *g_baseKey

	)

{

	__shared__ unsigned char s_plain[16];

	__shared__ unsigned char s_stateVal[384][16];

	

	unsigned char t_roundKey[240], t_key[32];

	int t_tidx, t_round, t_idx, t_tempEqual, t_carry;

	t_tidx = blockIdx.x * blockDim.x + threadIdx.x;

	if(threadIdx.x == 0) 

	{

  for(t_idx=0; t_idx < 16; ++t_idx)

  {

  	s_plain[t_idx] = g_plain[t_idx];

  }

	}

	__syncthreads();

	for (t_idx=0; t_idx < 32; t_idx++) 

	{

  t_key[t_idx] = g_baseKey[t_idx];

  __syncthreads();

	}

	t_carry = t_tidx;

	t_carry = (int)t_key[0] + t_carry;

	for (t_idx=0; t_idx < 32; t_idx++)

	{

  t_key[t_idx] = (char)t_carry;

  t_carry = (int)t_key[t_idx+1] + (t_carry >> 8); // div 256;

	}

	__syncthreads();

	for(t_idx=0; t_idx < 16; ++t_idx)

	{

  s_stateVal[threadIdx.x][t_idx] = g_buffer[t_idx];

	}

	keyExpansion(t_key, t_roundKey, numRounds);

	addRoundKey(s_stateVal[threadIdx.x], t_roundKey, numRounds);

	for(t_round=numRounds-1; t_round>0; t_round--)

	{

  invShiftRows(s_stateVal[threadIdx.x]);

  invSubBytes(s_stateVal[threadIdx.x]);

  addRoundKey(s_stateVal[threadIdx.x], t_roundKey, t_round);

  invMixColumns(s_stateVal[threadIdx.x]);

	}

	invShiftRows(s_stateVal[threadIdx.x]);

	invSubBytes(s_stateVal[threadIdx.x]);

	addRoundKey(s_stateVal[threadIdx.x], t_roundKey, 0);

	//Part that isn't being tripped

	t_tempEqual = 1;

	for(t_idx=0; t_idx < 16; ++t_idx)

	{

  if(s_stateVal[threadIdx.x][t_idx] != s_plain[t_idx])

  {

  	t_tempEqual = 0;

  }

	}

	if(t_tempEqual == 1)

	{

  *g_foundIdx = t_tidx;

	}

	__syncthreads();

}

I can give full/commentented versions if needed. XP, VS2005, c++. the code works in emuDebug mode, but never throws the flag “g_foundIdx” in debug mode (8800GT and Tesla, different machines). It passes the correct key and continues sudo infinate loop in while loop in wrapper. anybody have any ideas what the difference in modes would do. CUT_CHECK_ERROR never reports an issue. Any comments about optimizations are welcome, but my main concern is to get it working.

Thanx in advance for any comments.

Sorry my problem was so broad. After 6 hours of testing today, i think the problem is related to beta 2.0’s handling of unsigned characters. The key is supposed to cover all possible combinations ( brute force), but i found that the keys were only only going to characters 0x7F. They never hit the values 0x80 through 0xFF (using the most significant bit). I made sure to used unsigned characters every where. I have read that others seem to be having similar problems, and was thinking about rolling back to CUDA 1.1. Any suggestions?

Would you mind trying it in 1.1 and seeing if it works correctly?

I’ll roll it back now, edit when I’m done.

1.1 works better than expected!!!

for anyone who cares, 2.6 mil key’s /sec tried in brute force over 45,000/s with AMD 6000+ X2 3.0 (cpu code is not multithreaded). Can be quickly adjusted to perform encryption/decryption, not just BF attacks. This was a brute force, known plain text attack on AES, 256 bit key, 12 rounds (not 14, like standard, but is not hardcoded). Thanx for anyone who read and tried to see the problem.