CUDA Threads

andreaaz · March 26, 2008, 4:38pm

Excuse me for my English (I am Italian):

I have the following program, with which I would like to print the number of threads that run the program:

GPU:

/**
 * Diagnostic kernel: every launched thread records launch information in numeroT.
 *
 * Slots written in numeroT:
 *   [0], [1], [2]        the ints read through primoParametro, secondoParametro
 *                        and risultatoGPU
 *   [4], [5]             threadIdx.x and threadIdx.y of the writing thread
 *   [7], [8]             blockDim.x and blockIdx.x of the writing thread
 *   [10 + threadIdx.x]   the marker value 999
 *
 * Slots 4, 5, 7 and 8 are stored by every thread of every block without any
 * guard, so after the launch they hold the values of whichever thread stored
 * last; they do not identify one particular thread. The marker store, on the
 * other hand, targets a different element for each threadIdx.x, so elements
 * 10 .. 10 + blockDim.x - 1 all end up equal to 999.
 *
 * numeroT must therefore hold at least 10 + blockDim.x ints.
 */
__global__ void testGPU(int *primoParametro, int *secondoParametro, int *risultatoGPU, int *numeroT) {
    unsigned int tidx = threadIdx.x;
    unsigned int tidy = threadIdx.y;

    // No barrier is used: the kernel touches no shared memory and no thread
    // reads a value written by another thread.

    // Echo the three input values back to the host.
    numeroT[0] = *primoParametro;
    numeroT[1] = *secondoParametro;
    numeroT[2] = *risultatoGPU;

    // Same address for all threads: the last store wins.
    numeroT[4] = tidx;
    numeroT[5] = tidy;
    numeroT[7] = blockDim.x;
    numeroT[8] = blockIdx.x;

    // One element per thread index within the block.
    numeroT[10 + tidx] = 999;
}

CPU:

/**
 * Host-side driver for the testGPU kernel.
 *
 * Allocates three single-int inputs and a 512-int report buffer on the host
 * and on the device, copies the host values to the device, launches the
 * kernel with 128 blocks of 256 threads, copies the result and the report
 * buffer back, prints them, and releases every allocation.
 *
 * Takes no parameters and returns nothing; progress and results are written
 * to stdout. CUDA calls are wrapped in the CUDA_SAFE_CALL / CUT_CHECK_ERROR
 * macros, which are defined outside this function.
 */
void testGPU() {
    // Length of the report buffer; the same size is used on host and device.
    const int numeroTLen = 512;
    const size_t numeroTBytes = sizeof(int) * numeroTLen;

    printf("CPU…Alloco la memoria.\n");

    int *primoParametro = (int *)malloc(sizeof(int));
    int *secondoParametro = (int *)malloc(sizeof(int));
    int *risultato = (int *)malloc(sizeof(int));
    int *numeroT = (int *)malloc(numeroTBytes);

    if (primoParametro == NULL || secondoParametro == NULL ||
        risultato == NULL || numeroT == NULL) {
        fprintf(stderr, "CPU: allocazione della memoria fallita\n");
        // free(NULL) is a no-op, so releasing all four is safe.
        free(primoParametro);
        free(secondoParametro);
        free(risultato);
        free(numeroT);
        return;
    }

    // Zero the whole buffer: all of it is copied to the device below.
    for (int i = 0; i < numeroTLen; i++) {
        numeroT[i] = 0;
    }

    // Show the initial (all-zero) contents before the kernel runs.
    for (int i = 0; i < 255; i++) {
        printf("\nThreadh n.%d = %d", i, numeroT[i]);
    }

    *primoParametro = 8;
    *secondoParametro = 88;
    *risultato = 17;

    printf("1) Initializzo DEVICE...\n");
    CUT_DEVICE_INIT();

    printf("1)Preparo la memoria della GPU\n");

    int *primoParametroGPU;
    int *secondoParametroGPU;
    int *risultatoGPU;
    int *numeroTGPU;

    // Each device buffer is sized for the ints it stores, matching the
    // byte counts used by the copies below.
    CUDA_SAFE_CALL( cudaMalloc((void **)&primoParametroGPU, sizeof(int)));
    CUDA_SAFE_CALL( cudaMalloc((void **)&secondoParametroGPU, sizeof(int)));
    CUDA_SAFE_CALL( cudaMalloc((void **)&risultatoGPU, sizeof(int)));
    CUDA_SAFE_CALL( cudaMalloc((void **)&numeroTGPU, numeroTBytes));

    printf("2)Copio i dati nella memoria appena creata\n");

    CUDA_SAFE_CALL( cudaMemcpy(primoParametroGPU, primoParametro, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(secondoParametroGPU, secondoParametro, sizeof(int), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(numeroTGPU, numeroT, numeroTBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL( cudaMemcpy(risultatoGPU, risultato, sizeof(int), cudaMemcpyHostToDevice));

    printf("Inizializzazione effettuata\n");
    printf("3)Eseguo GPU kernel…\n");

    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // 128 blocks of 256 threads; the kernel writes report slots up to
    // index 10 + 255, which fits in the 512-int buffer.
    testGPU<<<128, 256>>>(primoParametroGPU, secondoParametroGPU, risultatoGPU, numeroTGPU);
    CUT_CHECK_ERROR("testGPU() esecuzione Fallita\n");

    // Wait for the kernel to finish before reading its results.
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    printf("4)Leggo i dati provenienti dalla GPU\n");

    CUDA_SAFE_CALL( cudaMemcpy(risultato, risultatoGPU, sizeof(int), cudaMemcpyDeviceToHost) );
    CUDA_SAFE_CALL( cudaMemcpy(numeroT, numeroTGPU, numeroTBytes, cudaMemcpyDeviceToHost) );

    printf("Risultato GPU = %d\n", *risultato);

    for (int i = 0; i < 275; i++) {
        printf("\nThreadh n.%d = %d", i, numeroT[i]);
    }

    printf("Shutting down…\n");

    // Release every device allocation made above.
    CUDA_SAFE_CALL(cudaFree(primoParametroGPU));
    CUDA_SAFE_CALL(cudaFree(secondoParametroGPU));
    CUDA_SAFE_CALL(cudaFree(risultatoGPU));
    CUDA_SAFE_CALL(cudaFree(numeroTGPU));

    // Release the host buffers.
    free(primoParametro);
    free(secondoParametro);
    free(risultato);
    free(numeroT);
}

and I get the following output:

…

Initializzo DEVICE…

1)Preparo la memoria della GPU

2)Copio i dati nella memoria appena creata

Inizializzazione effettuata

3)Eseguo GPU kernel…

4)Leggo i dati provenienti dalla GPU

Risultato GPU = 17

Threadh n.0 = 8

Threadh n.1 = 88

Threadh n.2 = 17

Threadh n.3 = 0

Threadh n.4 = 191

Threadh n.5 = 0

Threadh n.6 = 0

Threadh n.7 = 256

Threadh n.8 = 127

Threadh n.9 = 0

Threadh n.10 = 999

Threadh n.11 = 999

Threadh n.12 = 999

Threadh n.13 = 999

…

Threadh n.265 = 999

Why are Threadh n.10…265 all equal to 999? Shouldn't only one of them be equal to 999?

DenisR · March 26, 2008, 7:04pm

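You are launching your kernel with 256 threads per block (1D), and every one of those threads executes the whole kernel with its own value of tidx.

numeroT[10 + tidx] = 999; therefore means that elements 10 + 0 through 10 + 255 will be filled with 999, so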

Threadh n.10 = 999
Threadh n.11 = 999
Threadh n.12 = 999
Threadh n.13 = 999
…
Threadh n.265 = 999

is correct.

andreaaz · March 26, 2008, 8:56pm

ok…

So, once the number of threads is defined, each single instruction is executed by every thread?

instruction 1: numeroA[10 + tidx]:

instruction 2: numeroB[10 + tidx]:

First, instruction 1 for all threads:

td0: numeroA[10 + 0]

td1: numeroA[10 + 1]

…

Next, instruction 2 for all threads:

td0: numeroB[10 + 0]

td1: numeroB[10 + 1]

…

In what order? It isn't defined… right?

Thanks

DenisR · March 27, 2008, 8:42am

Yes, the kernel function is executed by all threads of the block. The only thing that is defined is that they run in warps of 32 threads in ‘parallel’ (actually 4 × 8 threads in parallel).

Topic		Replies	Views
threads in a loop threads go missing CUDA Programming and Performance	13	8522	September 9, 2008
Urgent help with threads please! CUDA Programming and Performance	21	11055	March 6, 2008
Complete Novice Question Question on the basic implementation of a kernel CUDA Programming and Performance	6	4454	October 27, 2009
Understanding Threads in CUDA help me find the exact number of threads for my code CUDA Programming and Performance	4	2444	July 13, 2009
how thread function thread,cuda CUDA Programming and Performance	0	1171	September 11, 2009
Different thread values in the same cycle CUDA Programming and Performance	3	698	October 23, 2014
Can some one check this for me please..... Newbie needs help learning CUDA Programming and Performance	2	2679	April 10, 2008
Help me! CUDA Programming and Performance	5	2066	February 9, 2010
Simple question on passing to the kernel CUDA Programming and Performance	15	3628	January 15, 2012
How many parallel threads? CUDA Programming and Performance	19	10621	October 1, 2021

CUDA Threads

Related topics