Hello all -
I am trying to run the Data Encryption Standard algorithm on CUDA 2.1 on ubuntu 8.10 machine.
The program works for total threads = approx 4,00,000 (which i get by no of threads per block * no of blocks). But after this value the program stops giving me the output and starts giving garbage values.
For finding out what the problem could be I made a simple multiplication program on CUDA using ‘unsigned long long’. It is as follows :
#include<stdio.h>
#define N 3800
#define T 512
global void multiply(unsigned long long *res,unsigned long long resD) {
unsigned long long id=blockIdx.xT+threadIdx.x;
res[id]=2*id;
if(id<10)
resD[id]=res[id];
}
int main(){
unsigned long long *res,res1[10],resD;
cudaMalloc((void **)&res, (((N-1)(T-1))+(T-1))*sizeof(unsigned long long));
cudaMalloc((void **)&resD, (10)*sizeof(unsigned long long));
dim3 dimGrid(N,1);
dim3 dimBlock(T,1);
multiply<<<dimGrid, dimBlock>>>(res,resD);
cudaMemcpy(res1,resD,10*sizeof(unsigned long long),cudaMemcpyDeviceToHost);
for(unsigned long long i=0;i<10;i++){
printf("%lld\n",res1[i]);
}
return 0;
}
This program gives the correct output for N = 3850 and T = 512 , but for N= 3900 it starts printing the garbage values.
For figuring out what could be the possible reason for this I replaced all the ‘unsigned long long’ with ‘int’ and the program started running for N = 65535 and T = 512.
Can this be a problem with the ‘unsigned long long’ ?
If not what could be the possible reason for this?
Please help…