cuda always setting valeu returning without changing whatever i do

global void degson(char *old1,char *new1,float *deg,int N)
{

int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) deg[0] =10;// (float)((0>(old1[idx]-new1[idx]))?(-(old1[idx]-new1[idx])): ((old1[idx]-new1[idx])))/(float)255;
//deg=(float)100 (*deg) / (float)(N);
//cuPrintf("—>%f\n",deg);

} here is the global i used

here is where i set cuda i m using it in a function for chaging pictures to find as percent but cuda cant return true value gpud is always stay with begining value.

char *eski,*yeni;
float *benim;

void SaveFrame(AVFrame *pFrame, int width, int height, int iFrame,int zz,int size,char *mm,char *nn,float *hihi) {

SaveFrame(pFrameRGB, pCodecCtx->width, pCodecCtx->height, i,pFrame->pict_type,sayac,eski,yeni,benim);

float *gpud=(float *)malloc(sizeof(float));
//float *deneme=(float *)malloc(sizeof(float));
*gpud=0;

/////////////////////////////////
cudaMalloc((void **) &eski, pCodecCtx->widthpCodecCtx->height3);
cudaMalloc((void **) &yeni, pCodecCtx->widthpCodecCtx->height3);
cudaMalloc((void **)&benim, sizeof(float));
///////////////////////////////
///////////////////gpu//////////////////////////////////////

cudaMemcpy(mm, old, widthheight3, cudaMemcpyHostToDevice);
cudaMemcpy(nn, pFrame->data[0],widthheight3, cudaMemcpyHostToDevice);

cudaMemcpy(hihi, gpud,sizeof(float), cudaMemcpyHostToDevice);
int block_size = 5;
int n_blocks = (widthheight3)/block_size + ((widthheight3)%block_size == 0 ? 0:1);
degson <<< n_blocks, block_size >>> (mm,nn,hihi,widthheight3);

if(cudaSuccess!=cudaMemcpy(gpud, hihi, sizeof(float), cudaMemcpyDeviceToHost)) puts(“patladı”);
printf("---->%f\n",gpud); getchar();
memcpy(old,pFrame->data[0],height
width*3);
/////////////////////////////////////////////////

gpud always 0 i wonder how i get true value thanks.

First your block size should always be a multiple of 32 and should usually be 128 or larger.

Second what is the width and height?

Third have you tried stepping through with cuda-gdb?

thanks for answer a i solved problem but performance didnt increase
because of for

now in global

global void degson(unsigned char *old1,unsigned char *new1,float *deg,int N)
{

int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) deg[idx]=(float)(((0>(old1[idx]-new1[idx]))?(new1[idx]-old1[idx]):(old1[idx]-new1[idx]))/255.0);
}

i have result in deg matrix np but i need total of deg. im using this for in host part

for(int vv=0;vv<sizeof(old);vv++) deg2 += gpud[vv]; like that

this is decreasing performace

when i changed global to

global void degson(unsigned char *old1,unsigned char *new1,float *deg,int N)
{

int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) *deg+=(float)(((0>(old1[idx]-new1[idx]))?(new1[idx]-old1[idx]):(old1[idx]-new1[idx]))/255.0);
}

i get wrong result because all threads go process same time

i wonder if there is way to be had total deg matrix calcuted by gpu or not

i want to make this process in gpu "for(int vv=0;vv<sizeof(old);vv++) deg2 += gpud[vv]; "
Thanks…