How do threads work? (threads, CUDA)

I saw two programs in CUDA: one for testing the threads, the other for a matrix calculation.

Here is the code of the first one:

#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>

#define BLOCK_DIM 512

const int size_x = 512;
const int size_y = 1;

__global__ static void ThreadDemo1(int* ret)
{
    int xindex = blockDim.x * blockIdx.x + threadIdx.x;
    int yindex = blockDim.y * blockIdx.y + threadIdx.y;

    // if (xindex < size_x && yindex < size_y)
    // {
        int index = xindex + size_x * yindex;
        ret[index] = xindex;                    // first half of ret: each thread's x index
        ret[index + size_x * size_y] = yindex;  // second half of ret: each thread's y index
    // }
}

int main()
{
    int* ret = 0;
    int host_ret[size_x * size_y * 2] = {0};    // host buffer large enough for both halves
    int i = 0;

    cudaMalloc((void**) &ret, sizeof(int) * (size_x * size_y * 2));

    dim3 grid(size_x / BLOCK_DIM, 1);
    dim3 block(BLOCK_DIM, 1, 1);
    ThreadDemo1<<<grid, block>>>(ret);

    cudaMemcpy(host_ret, ret, sizeof(int) * (size_x * size_y * 2), cudaMemcpyDeviceToHost);

    // print (xindex, yindex) for every element
    for (i = 0; i < size_x * size_y; i++)
    {
        printf("(%d,%d)", host_ret[i], host_ret[i + size_x * size_y]);
    }

    cudaFree(ret);
    return 0;
}
And here is the second one:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__ void mykernel(float * A1, float * A2, float * R)
{
    int p = threadIdx.x;
    R[p] = A1[p] + A2[p];
}

int main()
{
    float A1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    float A2[] = {10, 20, 30, 40, 50, 60, 70, 80, 90};
    float R[9];
    int taille_mem = sizeof(float) * 9;

    // allocate memory on the graphics card
    float * a1_device;
    float * a2_device;
    float * r_device;
    cudaMalloc((void**) &a1_device, taille_mem);
    cudaMalloc((void**) &a2_device, taille_mem);
    cudaMalloc((void**) &r_device, taille_mem);

    // copy the data to the card
    cudaMemcpy(a1_device, A1, taille_mem, cudaMemcpyHostToDevice);
    cudaMemcpy(a2_device, A2, taille_mem, cudaMemcpyHostToDevice);

    // 9 additions, no loop!
    mykernel<<<1, 9>>>(a1_device, a2_device, r_device);

    // get the result back
    cudaMemcpy(R, r_device, taille_mem, cudaMemcpyDeviceToHost);

    // print to screen
    for (int i = 0; i < 9; i++)
        printf("%f\n", R[i]);

    cudaFree(a1_device);
    cudaFree(a2_device);
    cudaFree(r_device);
    return 0;
}

From these two programs, I understand that the thread index increases automatically (can I say that?), so we don't need to write a loop.
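Here is a minimal sketch of how I picture it: the body of a serial loop becomes the kernel body, and the loop counter becomes the computed thread index. The kernel name add_sketch and the launch configuration are just made up for illustration; this is my understanding, not code taken from the programs above.

// Sketch: the serial loop
//   for (int i = 0; i < n; i++) R[i] = A1[i] + A2[i];
// is replaced by launching n threads, each handling one value of i.
__global__ void add_sketch(const float *A1, const float *A2, float *R, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // each thread computes its own i
    if (i < n)                                      // guard in case there are more threads than elements
        R[i] = A1[i] + A2[i];
}

// Launch example: enough blocks of 256 threads to cover n elements.
// add_sketch<<<(n + 255) / 256, 256>>>(a1_device, a2_device, r_device, n);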

But in the following program, why doesn't it work? Why doesn't the variable "sum" end up holding the sum of pixel[i]*poids[i]?

__global__ void thread_kernel(int *pixel, int *poids, int *sum, int *asum, int *ret)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int n = 8;

    *sum = 0;
    *asum = 0;

    if (i < n)
    {
        *sum = *sum + pixel[i] * poids[i];
        *asum = *asum + poids[i];
        ret[i] = i;
    }
}
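What I expect sum and asum to contain is what this serial version would compute. This is only a sketch of my intent written as ordinary host code (the expected_sum and expected_asum names are made up); it is not part of the program above.

// What I expect the kernel to produce, computed serially on the host:
int expected_sum  = 0;
int expected_asum = 0;
for (int i = 0; i < 8; i++)
{
    expected_sum  += pixel[i] * poids[i];  // weighted sum of the pixels
    expected_asum += poids[i];             // sum of the weights
}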

I wonder how the threads actually work…