Hi everybody,
I am trying to do the 2_norm of each column of a Matrix and find which one is the minimum.
It looks like this:
__global__ void obtener_s_gpu (float *HT, int m, int n, float *c, int L, float *ss, int *ids, float *normaBloque)
{
int i,j;
int posib, div, resto;
float mult;
int thread;
int nTotalThreads = blockDim.x; // Total number of threads per block (512)
int numBlocks = gridDim.x; // Total number of block threads
int idx=blockIdx.x*blockDim.x+threadIdx.x;
// Declare arrays to be in shared memory.
__shared__ float norma[3];
posib=(int)pow((float)L,m);
if (idx<posib)
{
div = idx;
//building s
for (i=0; i<m; i++)
{
resto=div%L;
ss[i*posib+idx]=(-L+1.0)/2.0+resto;
div=div/L;
}
norma[threadIdx.x]=0.0;
for (i=m-1; i>-1; i--)
{
mult=0.0;
for (j=m-1; j>i-1; j--)
{
mult=mult+HT[j*n+i]*ss[j*posib+idx]; //mul=mul+R[i*m+j]*ss[j]
}
mult=mult-c[i];
norma[threadIdx.x] = norma[threadIdx.x] + mult*mult;
}
norma[threadIdx.x]=sqrt(norma[threadIdx.x]);
}else{
norma[threadIdx.x]=INF;
}
__syncthreads();
thread=idx;
//Encontrar la menor norma
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadIdx.x < halfPoint)
{
// Get the shared value stored by another thread
float temp = norma[threadIdx.x + halfPoint];
if (temp < norma[threadIdx.x])
{
norma[threadIdx.x] = temp;
thread=thread+halfPoint;
}
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if(threadIdx.x==0)
{
normaBloque[numBlocks] = norma[0];
ids[numBlocks] = thread;
}
__syncthreads();
}
Everything seems to work fine, but when I add the line " ids[blockIdx.x] = thread; " something happens.
Exit without that line:
6 threads en total. Se necesitan 2 bloques
posibles vectores de s: 4
Matriz ss:
-0.500000 0.500000 -0.500000 0.500000
-0.500000 -0.500000 0.500000 0.500000
ids_0: 1094378115
normaBloque_0: 6.114271
ids_1: 1063137839
normaBloque_1: 8.192823
Adding the line:
6 threads en total. Se necesitan 2 bloques
posibles vectores de s: 4
Matriz ss:
0.000000 0.000000 0.000000 0.000000
0.000000 0.000000 0.000000 0.000000
ids_0: 0
normaBloque_0: 0.000000
ids_1: 0
normaBloque_1: 0.000000
How can it happen??, any idea??.
I declared ids like normaBloque, and the last one looks fine, so I don’t think that’s the problem.
cudaMalloc ((void **) &normaBloque_d, gridSize*sizeof(float));
cudaMalloc ((void **) &ids_d, gridSize*sizeof(int));
I give it to the kernel and then copy it to host with :
cudaMemcpy(normaBloque, normaBloque_d, gridSize*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(ids, ids_d, gridSize*sizeof(int), cudaMemcpyDeviceToHost);
I am lost…
Edited: Here is the full Kernel code, maybe now it’s clearer… or maybe not. First I build the vector, then calculate the 2_norm, and then find the minimum.
Everything works but the ids vector. Also, when I execute the screen flashes a little. Why am I doing wrong???.