I saw two programs en CUDA, one for testing the threads, the other for calculating the matrix.
Here are the code.
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#define BLOCK_DIM 512
const int size_x=512;
const int size_y=1;
__global__ static void ThreadDemo1 (int* ret)
{
int xindex=blockDim.x*blockIdx.x+threadIdx.x;
int yindex=blockDim.y*blockIdx.y+threadIdx.y;
// if(xindex<size_x && yindex<size_y)
// {
int index=xindex+size_x*yindex;
ret[index]=xindex;
ret[index+size_x*size_y]=yindex;
// }
}
int main()
{
int* ret=0;
int host_ret={0};
int i=0;
cudaMalloc((void**) &ret, sizeof(int)*(size_x*size_y*2));
dim3 grid(size_x / BLOCK_DIM, 1);
dim3 block(BLOCK_DIM,1,1);
ThreadDemo1<<<grid,block>>>(ret);
cudaMemcpy(&host_ret, ret, sizeof(int)*(size_x*size_y*2), cudaMemcpyDeviceToHost);
for(i=0; i<size_x*size_y; i++)
{
printf("(%d,%d)",host_ret[i],host_ret);
}
cudaFree(ret);
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
__global__ void mykernel(float * A1, float * A2, float * R)
{
int p = threadIdx.x;
R[p] = A1[p] + A2[p];
}
int main()
{
float A1[]={1,2,3,4,5,6,7,8,9};
float A2[]={10,20,30,40,50,60,70,80,90};
float R[9];
int taille_mem=sizeof(float) * 9;
// on alloue de la memoire sur la carte graphique
float * a1_device;
float * a2_device;
float * r_device;
cudaMalloc ( (void**) &a1_device, taille_mem);
cudaMalloc ( (void**) &a2_device, taille_mem);
cudaMalloc ( (void**) &r_device, taille_mem);
// on copie les donnees sur la carte
cudaMemcpy( a1_device,A1,taille_mem,cudaMemcpyHostToDevice);
cudaMemcpy( a2_device,A2,taille_mem,cudaMemcpyHostToDevice);
//9 additions, aucune boucle !
mykernel<<<1,9>>>(a1_device,a2_device,r_device);
// on recupere le resultat
cudaMemcpy(R,r_device,taille_mem,cudaMemcpyDeviceToHost);
//sortie à l'ecran
for(int i=0; i<9; i++)
printf("%f\n",R[i]);
cudaFree(a1_device);
cudaFree(a2_device);
cudaFree(r_device);
}
From these two programs, I know that the index of the threads will increase automatically(can I say that?) and we don’t need to use the loop.
But in the following program, why don’t they function? Why cannot the variable “sum” caculate the sum of the pixel*poids?
__global__ void thread_kernel(int *pixel,int *poids,int *sum,int *asum,int *ret)
{
int i=blockIdx.x*blockDim.x+threadIdx.x;
int n=8;
*sum=0;
*asum=0;
if(i<n)
{
*sum=*sum+pixel[i]*poids[i];
*asum=*asum+poids[i];
ret[i]=i;
}
}
I wonder how the threads function…