Hi,

I’m learning how to program GPU with CUDA by myself, I read the CUDA By Example book and I was trying to start doing a program, the problem is that it seems that my program is doing nothing when I launch the kernel but dunno what am I missing. The code looks like the following:

```
__constant__ int dev_width;
__constant__ float dev_pivot;
__global__ void pivotReduction(float *matrix, float *identity)
{
int tid;
tid = threadIdx.x + blockIdx.x * blockDim.x;
if(tid < dev_width)
{
matrix[threadIdx.x] = 2;
identity[threadIdx.x] = identity[threadIdx.x]/dev_pivot;
}
}
int main(){
float matrixTest[9] = {1,2,5,2,3,7,6,7,1};
pprMatrix matrix;
matrix.x = 3;
matrix.y = 3;
matrix.data = matrixTest;
pprMatrix *identity;
identity = createIdentity(3, 3);
pprInverse(&matrix, identity);
printMatrix(&matrix);
printMatrix(identity);
}
void pprInverse(pprMatrix *matrix, pprMatrix *identity)
{
int m;
int n;
int width;
int indexI;
//int indexJ;
//int indexK;
float pivot;
m = matrix->x;
n = matrix->y;
width = m*n;
float *dev_matrix, *dev_identity;
cudaMalloc((void**)&dev_matrix, sizeof(float));
cudaMalloc((void**)&dev_identity, sizeof(float));
cudaMemcpyToSymbol(*(&dev_width), &(width), sizeof(int), 0, cudaMemcpyHostToDevice);
cudaMemcpy(dev_matrix, matrix->data, sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy(dev_identity, identity->data, sizeof(float), cudaMemcpyHostToDevice);
for (indexI = 0; indexI < m; indexI++)
{
pivot = ((float*)matrix->data)[indexI*matrix->y+indexI];
cudaMemcpyToSymbol(*(&dev_pivot), &pivot, sizeof(float), 0, cudaMemcpyHostToDevice);
pivotReduction<<<(16+width)/16,16>>>(dev_matrix, dev_identity);
}
cudaMemcpy(dev_matrix, matrix->data, sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(dev_identity, identity->data, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_matrix);
cudaFree(dev_identity);
}
```

What are my errors in this little code. Thanks