Hello everyone,
I started learning CUDA a few weeks ago and I'm trying to develop my own application for mathematical calculations. I already have a CPU version which works well, and I'm trying to implement a GPU version.
I have a specific kernel which does a calculation, and I need to run it n * (n - 1) / 2 times, where n is the size of my problem.
My GPU program works well when n <= 30, but when I increase the size of the problem (for example n = 35 or n = 50), it sometimes gives wrong results. I tried to debug it and noticed that the arrays read from global memory sometimes have inconsistent values. I also tried to catch errors with cudaThreadSynchronize() + cudaGetLastError(), but nothing was detected: the kernel executes but gives wrong results.
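For reference, here is roughly the kind of check I run after a launch (just a sketch, the helper name is mine; cudaDeviceSynchronize() is the current replacement for cudaThreadSynchronize()):

// Sketch of the error check I run after a kernel launch.
#include <cstdio>
#include <cuda_runtime.h>

void CheckCudaErrors(const char* where)
{
    cudaError_t err = cudaGetLastError();          // launch / configuration errors
    if (err != cudaSuccess)
        printf("%s (launch): %s\n", where, cudaGetErrorString(err));
    err = cudaDeviceSynchronize();                 // errors raised while the kernel runs
    if (err != cudaSuccess)
        printf("%s (execution): %s\n", where, cudaGetErrorString(err));
}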
Here is the number of registers used per thread:
ptxas info : Compiling entry function ‘_Z12CalculationPKiS0_S0_iiPi’
ptxas info : Used 16 registers, 44+40 bytes smem, 4 bytes cmem[1]
I thought I might be using too many registers, but it is strange that it works for n = 30 and not for n = 35.
Here is the code of my program:
// Allocate a device matrix of same size as A.
int* AllocateDeviceMatrix (const int* A, int n)
{
    int* Adevice;
    int size = n * n * sizeof(int);
    cudaMalloc((void**)&Adevice, size);
    return Adevice;
}
// Allocate a device vector of same size as V.
int* AllocateDeviceVector (const int* V, int n)
{
    int* Vdevice;
    int size = n * sizeof(int);
    cudaMalloc((void**)&Vdevice, size);
    return Vdevice;
}
// Copy a host matrix to a device matrix.
void CopyToDeviceMatrix (int* Adevice, const int* Ahost, int n)
{
    int size = n * n * sizeof(int);
    cudaMemcpy(Adevice, Ahost, size,
               cudaMemcpyHostToDevice);
}
// Copy a device matrix to a host matrix.
void CopyFromDeviceMatrix (int* Ahost, const int* Adevice, int n)
{
    int size = n * n * sizeof(int);
    cudaMemcpy(Ahost, Adevice, size,
               cudaMemcpyDeviceToHost);
}
// Copy a host vector to a device vector.
void CopyToDeviceVector (int* Vdevice, const int* Vhost, int n)
{
    int size = n * sizeof(int);
    cudaMemcpy(Vdevice, Vhost, size,
               cudaMemcpyHostToDevice);
}
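CopyFromDeviceVector, which I call in ProcessingGPU below, is not shown above; it presumably just mirrors the other helpers with the copy direction reversed:

// Copy a device vector to a host vector (same pattern as the helpers above).
void CopyFromDeviceVector (int* Vhost, const int* Vdevice, int n)
{
    int size = n * sizeof(int);
    cudaMemcpy(Vhost, Vdevice, size,
               cudaMemcpyDeviceToHost);
}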
------------------------------------------------------------
void ProcessingGPU(const int* A, const int* B, int n)
{
    int* S = (int*)malloc(sizeof(int) * n);
    create(S, n);
    int best_value = pre_calculation(A, B, S, n);   // CPU part

    int* Ad = AllocateDeviceMatrix(A, n);
    CopyToDeviceMatrix(Ad, A, n);
    int* Bd = AllocateDeviceMatrix(B, n);
    CopyToDeviceMatrix(Bd, B, n);
    int* Sd = AllocateDeviceVector(S, n);
    CopyToDeviceVector(Sd, S, n);

    int* new_values = (int*)malloc(sizeof(int) * n * (n - 1) / 2);
    int* new_values_d = AllocateDeviceVector(new_values, n * (n - 1) / 2);

    Calculation<<<ceil((n * (n - 1) * 1.0 / 2) / BLOCK_SIZE), BLOCK_SIZE>>>(Ad, Bd, Sd, n, best_value, new_values_d);

    CopyFromDeviceVector(new_values, new_values_d, n * (n - 1) / 2);
    for (int i = 0; i < n * (n - 1) / 2; i++) {
        if (new_values[i] < best_value)
            best_value = new_values[i];
    }
    printf("best_value = %d\n", best_value);

    cudaFree(Ad);
    cudaFree(Bd);
    cudaFree(Sd);
    cudaFree(new_values_d);
    free(S);
    free(new_values);
}
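For what it's worth, an integer-only equivalent of the grid-size computation in the launch above would be something like this (the helper name is just illustrative; it gives the same value as the ceil() expression):

// Integer-only round-up for the grid size: one thread per (i, j) pair.
int GridSizeForPairs(int n, int block_size)
{
    int total_pairs = n * (n - 1) / 2;
    return (total_pairs + block_size - 1) / block_size;
}

The launch would then be Calculation<<<GridSizeForPairs(n, BLOCK_SIZE), BLOCK_SIZE>>>(Ad, Bd, Sd, n, best_value, new_values_d);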
---------------------------------------------------------------------
#define ID(i,j,ld) (((i)*(ld))+(j))
#define BLOCK_SIZE 256

__device__ int internal_difference(int n, const int* a, const int* b,
                                   const int* p, int i, int j)
{
    int sum;
    int k;
    sum = (a[ID(i,i,n)] - a[ID(j,j,n)]) * (b[ID(p[j],p[j],n)] - b[ID(p[i],p[i],n)]) +
          (a[ID(i,j,n)] - a[ID(j,i,n)]) * (b[ID(p[j],p[i],n)] - b[ID(p[i],p[j],n)]);
    for (k = 0; k < n; k++)
        if (k != i && k != j)
            sum = sum + (a[ID(k,i,n)] - a[ID(k,j,n)]) * (b[ID(p[k],p[j],n)] - b[ID(p[k],p[i],n)]) +
                        (a[ID(i,k,n)] - a[ID(j,k,n)]) * (b[ID(p[j],p[k],n)] - b[ID(p[i],p[k],n)]);
    return sum;
}
__global__ void Calculation(const int* A, const int* B, const int* S, int n, const int value, int* new_values)
{
    // Global linear thread index: one thread per (first, second) pair.
    volatile int temp = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int N;
    N = n;
    if (temp < N * (N - 1) / 2) {
        // Map the linear index temp to the pair (first, second) with first < second.
        int first;
        int second;
        first  = (N - 1) - floor((sqrtf(8 * ((N * (N - 1) / 2) - temp - 1) + 1) - 1) / 2) - 1;
        second = temp - first * (N - 1) + first * (first + 1) / 2 + 1;
        new_values[temp] = value + internal_difference(N, A, B, S, first, second);
    }
}
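In case it helps, here is a small host-only sketch I use to check the temp -> (first, second) mapping against a plain double loop (the names and the test size are just for this check, and it uses the same sqrtf/floor formula as the kernel):

// Host-only check of the linear-index -> (first, second) mapping used in the kernel.
#include <cstdio>
#include <cmath>

int main()
{
    int n = 50;                                  // problem size to test
    int temp = 0;
    for (int i = 0; i < n - 1; i++)
        for (int j = i + 1; j < n; j++, temp++)
        {
            int first  = (n - 1) - (int)floorf((sqrtf(8 * ((n * (n - 1) / 2) - temp - 1) + 1) - 1) / 2) - 1;
            int second = temp - first * (n - 1) + first * (first + 1) / 2 + 1;
            if (first != i || second != j)
                printf("mismatch at temp=%d: got (%d,%d), expected (%d,%d)\n",
                       temp, first, second, i, j);
        }
    return 0;
}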
Thanks for helping me. This problem is really frustrating for me, because the potential speed-up on the GPU seems very significant.