Hello,
I would like to write code that subtracts two vectors element by element, i.e. vectTest[576] - vectNormal[576].
The problem is that when I execute the code, the result is wrong. This is my code.
If somebody has a solution, please help!
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
#include <assert.h>
#include <cuda.h>
// includes, kernels
//#include <subvect_kernel.cu>
// Side length of the data; 24*24 = 576 matches N below, and main() appears to
// size the host vectors as SI*SI floats — TODO confirm against the unmangled source.
#define SI 24
// NOTE(review): NBRE is not referenced anywhere in the visible code — confirm it is still needed.
#define NBRE 10
float *a_d; // device buffer: receives a copy of vectTest (first operand)
float *b_h; // host buffer: receives the result copied back from res_d
float *res_d; // device buffer: the kernel writes the element-wise difference here
float *b_d; // device buffer: receives a copy of vectNormal (second operand)
int N = 576; // number of elements in each vector
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
// Forward declaration of the subtraction kernel; it must match the definition below exactly.
__global__ void incrementArrayOnDevice(const float *a, const float *c, float *res, int N);
/////////////////array declarations/////////////////////////////////////
float *vectTest; //vector of test
float *vectNormal; //vector of normalization

// Element-wise vector subtraction: res[i] = a[i] - c[i] for every i in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid must
// supply at least N threads in total; surplus threads in the last block are
// masked off by the bounds check. No shared memory is used.
//
// a, c : device pointers to the N input floats (read only)
// res  : device pointer to the N output floats
// N    : number of elements to process
__global__ void incrementArrayOnDevice(const float *a, const float *c, float *res, int N)
{
    // Flat global index of this thread across the whole grid.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the tail: the grid is rounded up to a whole number of blocks.
    if (idx < N)
        res[idx] = a[idx] - c[idx];
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Aborts the program with a readable message when a CUDA runtime call fails.
// Without this, a failed allocation, copy or launch goes unnoticed and the
// host just prints whatever happens to be in the result buffer.
static void checkCudaCall(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two host vectors of N floats, subtracts them on the GPU
// (vectTest - vectNormal) and prints every element of the result.
// Returns EXIT_SUCCESS on success; exits with EXIT_FAILURE on any
// allocation or CUDA error.
int
main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // One byte count is used for every host/device buffer so that allocations,
    // copies and loops cannot drift apart.
    const size_t size = (size_t)N * sizeof(float);

    vectTest = (float*)malloc(size);
    vectNormal = (float*)malloc(size);
    b_h = (float*)malloc(size);
    if (vectTest == NULL || vectNormal == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free(vectTest);
        free(vectNormal);
        free(b_h);
        return EXIT_FAILURE;
    }

    // Test data: vectTest[i] = i, vectNormal[i] = i/2, so the expected result is i/2.
    for (int i = 0; i < N; i++) {
        vectTest[i] = (float)i;
        vectNormal[i] = (float)i / 2;
    }

    // allocate arrays on device
    checkCudaCall(cudaMalloc((void **) &a_d, size), "cudaMalloc(a_d)");
    checkCudaCall(cudaMalloc((void **) &b_d, size), "cudaMalloc(b_d)");
    checkCudaCall(cudaMalloc((void **) &res_d, size), "cudaMalloc(res_d)");

    // copy data from host to device
    checkCudaCall(cudaMemcpy(a_d, vectTest, size, cudaMemcpyHostToDevice), "copy vectTest to device");
    checkCudaCall(cudaMemcpy(b_d, vectNormal, size, cudaMemcpyHostToDevice), "copy vectNormal to device");

    // Execution configuration: block size is a multiple of the 32-thread warp,
    // and the grid is rounded up so every element gets a thread.
    const int blockSize = 256;
    const int nBlocks = (N + blockSize - 1) / blockSize;

    incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, b_d, res_d, N);
    // A launch does not return a status; configuration or launch failures are
    // only visible through cudaGetLastError().
    checkCudaCall(cudaGetLastError(), "kernel launch");

    // Retrieve result from device and store in b_h. This copy blocks until the
    // kernel has finished, so errors raised while the kernel ran surface here.
    checkCudaCall(cudaMemcpy(b_h, res_d, size, cudaMemcpyDeviceToHost), "copy result to host");

    // display result
    for (int i = 0; i < N; i++)
        printf("b[%d]=%f\n", i, b_h[i]);

    // Release every buffer that was allocated above.
    checkCudaCall(cudaFree(a_d), "cudaFree(a_d)");
    checkCudaCall(cudaFree(b_d), "cudaFree(b_d)");
    checkCudaCall(cudaFree(res_d), "cudaFree(res_d)");
    free(vectTest);
    free(vectNormal);
    free(b_h);

    return EXIT_SUCCESS;
}
The result is just b_h[i] = 0 for every i …