Hey,
Below is the code which I have written for a simple matrix adition:
#include<stdio.h>
#include<cuda.h>
const int N = 4;
const int blocksize = 2;
global void matrixAdd(float *a, float b, float c, int N)
{
int i = threadIdx.x;
//int j = blockIdx.yblockDim.y + threadIdx.y;
//int index = i +jN;
if(i<<N )
c[i]=a[i]+b[i];
}
int main()
{
float a, b, c;
a=(float )malloc(NNsizeof(float));
b=(float )malloc(NNsizeof(float));
c=(float )malloc(NNsizeof(float));
for ( int i = 0; i < N*N; ++i ) {
a[i] = 7.0; b[i] = 3.5; }
float *ad, *bd, *cd;
const int size = N*N*sizeof(float);
cudaMalloc( (void**)&ad, size );
cudaMalloc( (void**)&bd, size );
cudaMalloc( (void**)&cd, size );
cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice);
cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice);
//dim3 dimBlock = (blocksize, blocksize);
//dim3 dimGrid = (N/dimBlock.x, N/dimBlock.y);
matrixAdd<<<1,16>>>(ad,bd,cd,N);
cudaMemcpy(c,cd,size,cudaMemcpyDeviceToHost);
for ( int i = 0; i < N*N; ++i ) {
printf("%f",c[i]); }
cudaFree(ad);
cudaFree(bd);
cudaFree(cd);
free(a);
free(B);
free©;
return 0;
}
When I compile with nvcc, I get a warning which says that the lines dim3 dimBlock = (blocksize, blocksize), dim3 dimGrid = (N/dimBlock.x, N/dimBlock.y); have no effect. Even when I try to directly pass the number of threads to the kernet, the output is not desirable.
Please help.