please run this there is a run time error

Hi all can any body please run this code, and see if there is a error while the execution…

my result —

bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/test/fence$ make

nvcc -run -I/home/bibrak/NVIDIA_CUDA_SDK/common/inc -I/usr/local/cuda/include -L/home/bibrak/NVIDIA_CUDA_SDK/lib -lcutil -L/usr/local/cuda/lib -lcudart -lcuda test.cu

total mem: 128.284 MB, free: 1024.395 MB, used : 3199.889 MB

total mem: 255.312 MB, free: 114.434 MB, used : 140.879 MB

cudaSafeCall() Runtime API error in file <test.cu>, line 192 : unspecified launch failure.

make: *** [all] Error 255

[codebox]/*

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

device unsigned int count = 0;

shared bool isLastBlockDone;

device shared float partialSum ;

const int N = 1024;

const int p = 2;

device float calPartialSum(const float *array,int i);

device float calculateTotalSum(const float *result);

global void sum(const float* array,float* result)

      {

	

	int i = threadIdx.x;

	if(i == 0 ){

	partialSum = 0;

	}

	__syncthreads();

               // Each block sums a subset of the input array

	

            float partialSum_thread = calPartialSum(array, i);

	partialSum += partialSum_thread;

	__threadfence_block(); // so that effect of one thread is vissible 

				// to all

if (threadIdx.x == 0) {

                     // Thread 0 of each block stores the partial sum

                     // to global memory

                     result[blockIdx.x] = partialSum;

                     // Thread 0 makes sure its result is visible to

                     // all other threads

                     __threadfence();

                     // Thread 0 of each block signals that it is done

                    // unsigned int value = atomicInc(&count, gridDim.x);

                    	 unsigned int value = count++;

		// Thread 0 of each block determines if its block is

                     // the last block to be done

                     isLastBlockDone = (value == (gridDim.x - 1));

               }

               // Synchronize to make sure that each thread reads

               // the correct value of isLastBlockDone

               __syncthreads();

if (isLastBlockDone) {

                     // The last block sums the partial sums

                     // stored in result[0 .. gridDim.x-1]

                     float totalSum = calculateTotalSum(result);

                     if (threadIdx.x == 0) {

                         // Thread 0 of last block stores total sum

                         // to global memory and resets count so that

                         // next kernel call works properly

                         result[0] = totalSum;

                         count = 0;

                     }

                 }

     }

// this funtion will calculate partial sum of the array

// the number of elements will be p

device float calPartialSum(const float *array,int i){

int start = i*p;

int end = start + p;

float pSum = 0;

for(int q = start; q<end ; q++){



pSum += array[q];

}

return pSum;

}

device float calculateTotalSum(const float *result){

float total = 0;

for (int q = 0; q<gridDim.x; q++){

	total += result[q];

}

return total;

}

int main(){

float *array;

float *result;

float *D_result;

float *D_array;

dim3 threads(256);

dim3 grid( (N/p) / threads.x );

//[debug]

unsigned int free_mem,total_mem, used_mem;

cuMemGetInfo( &free_mem, &total_mem );

used_mem = total_mem-free_mem;

printf(“total mem: %0.3f MB, free: %0.3f MB, used : %0.3f MB\n”,

((double)total_mem)/1024.0/1024.0,

((double)free_mem )/1024.0/1024.0,

((double)used_mem )/1024.0/1024.0 );

//[end debug]

array = (float*) malloc(N*sizeof(float));

size_t size_array = N * sizeof(float);

cutilSafeCall(cudaMalloc((void**)&D_array, size_array));

for(int w=0 ;w<N;w++){

array[w] = 1;

}

cutilSafeCall(cudaMemcpy(D_array,array,size_array,cudaMemcpy

HostToDevice));

size_t size_result = grid.x * sizeof(float);

result = (float*) malloc(size_result);

cutilSafeCall(cudaMalloc((void**)&D_result, size_result));

//cutilSafeCall(cudaMemcpy(D_result,array,size,cudaMemcpyHostT

oDevice));

sum<<<grid,threads>>>(D_array,D_result);

//[debug]

cuMemGetInfo( &free_mem, &total_mem );

used_mem = total_mem-free_mem;

printf(“total mem: %0.3f MB, free: %0.3f MB, used : %0.3f MB\n”,

((double)total_mem)/1024.0/1024.0,

((double)free_mem )/1024.0/1024.0,

((double)used_mem )/1024.0/1024.0 );

//[end debug]

//cutilSafeCall(cudaMemcpy(C.elements,d_C.elements,sizeC,cudaM

emcpyDeviceToHost));

cutilSafeCall(cudaMemcpy(result,D_result,size_result,cudaMem

cpyDeviceToHost));

//print result and free the memory

printf("The total sum = %f \n",result[0]);

free(array);

free(result);

cudaFree(D_result);

cudaFree(D_array);

return 0;

}

[/codebox]

I don’t mind trying to help, but when trying to build, I get:
make
nvcc -run -I/opt/cuda/sdk/C/common/inc/ -L/opt/cuda/sdk/C/lib -lcutil -L/opt/cuda/lib -lcudart -lcuda main.cu
main.cu(119): error: identifier “size” is undefined

main.cu(134): error: identifier “C” is undefined

main.cu(134): error: identifier “d_C” is undefined

main.cu(134): error: identifier “sizeC” is undefined

I would try commenting out stuff until I could determine where the problem occurs. Then move on to why.

i have attached the file(code)
test.cu (4.52 KB)

GTX295

total mem: 0.000 MB, free: 38.806 MB, used : 4057.194 MB

total mem: 895.750 MB, free: 849.434 MB, used : 46.316 MB

The total sum = 4.000000

TeslaC1060

total mem: 0.000 MB, free: 37.744 MB, used : 4058.256 MB

total mem: 4095.750 MB, free: 4049.434 MB, used : 46.316 MB

The total sum = 6.000000

what’s your kerel “sum” do? sum over all elememts in an array?

if so, I cannot understand your kernel code.

nvcc -run -I/opt/cuda/sdk/C/common/inc/ -L/opt/cuda/sdk/C/lib -lcutil -L/opt/cuda/lib -lcudart -lcuda test.cu
total mem: 0.000 MB, free: 0.000 MB, used : 0.000 MB
total mem: 511.688 MB, free: 320.820 MB, used : 190.867 MB
The total sum = 8.000000

No launch failure for me

thanks for the help …

i have implemented the addition of elements of array given in programing guide page 117.

i cannot understand why its giving me launch failure …

As I dislike repeating myself, I refer back to my first post.