Hi all can any body please run this code, and see if there is a error while the execution…
my result —
bibrak@biebo-laptop:/media/Academics/Academic/Research/HPC/CUDA/Iam new to CUDA/test/fence$ make
nvcc -run -I/home/bibrak/NVIDIA_CUDA_SDK/common/inc -I/usr/local/cuda/include -L/home/bibrak/NVIDIA_CUDA_SDK/lib -lcutil -L/usr/local/cuda/lib -lcudart -lcuda test.cu
total mem: 128.284 MB, free: 1024.395 MB, used : 3199.889 MB
total mem: 255.312 MB, free: 114.434 MB, used : 140.879 MB
cudaSafeCall() Runtime API error in file <test.cu>, line 192 : unspecified launch failure.
make: *** [all] Error 255
[codebox]/*
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil_inline.h>
device unsigned int count = 0;
shared bool isLastBlockDone;
device shared float partialSum ;
const int N = 1024;
const int p = 2;
device float calPartialSum(const float *array,int i);
device float calculateTotalSum(const float *result);
global void sum(const float* array,float* result)
{
int i = threadIdx.x;
if(i == 0 ){
partialSum = 0;
}
__syncthreads();
// Each block sums a subset of the input array
float partialSum_thread = calPartialSum(array, i);
partialSum += partialSum_thread;
__threadfence_block(); // so that effect of one thread is vissible
// to all
if (threadIdx.x == 0) {
// Thread 0 of each block stores the partial sum
// to global memory
result[blockIdx.x] = partialSum;
// Thread 0 makes sure its result is visible to
// all other threads
__threadfence();
// Thread 0 of each block signals that it is done
// unsigned int value = atomicInc(&count, gridDim.x);
unsigned int value = count++;
// Thread 0 of each block determines if its block is
// the last block to be done
isLastBlockDone = (value == (gridDim.x - 1));
}
// Synchronize to make sure that each thread reads
// the correct value of isLastBlockDone
__syncthreads();
if (isLastBlockDone) {
// The last block sums the partial sums
// stored in result[0 .. gridDim.x-1]
float totalSum = calculateTotalSum(result);
if (threadIdx.x == 0) {
// Thread 0 of last block stores total sum
// to global memory and resets count so that
// next kernel call works properly
result[0] = totalSum;
count = 0;
}
}
}
// this funtion will calculate partial sum of the array
// the number of elements will be p
device float calPartialSum(const float *array,int i){
int start = i*p;
int end = start + p;
float pSum = 0;
for(int q = start; q<end ; q++){
pSum += array[q];
}
return pSum;
}
device float calculateTotalSum(const float *result){
float total = 0;
for (int q = 0; q<gridDim.x; q++){
total += result[q];
}
return total;
}
int main(){
float *array;
float *result;
float *D_result;
float *D_array;
dim3 threads(256);
dim3 grid( (N/p) / threads.x );
//[debug]
unsigned int free_mem,total_mem, used_mem;
cuMemGetInfo( &free_mem, &total_mem );
used_mem = total_mem-free_mem;
printf(“total mem: %0.3f MB, free: %0.3f MB, used : %0.3f MB\n”,
((double)total_mem)/1024.0/1024.0,
((double)free_mem )/1024.0/1024.0,
((double)used_mem )/1024.0/1024.0 );
//[end debug]
array = (float*) malloc(N*sizeof(float));
size_t size_array = N * sizeof(float);
cutilSafeCall(cudaMalloc((void**)&D_array, size_array));
for(int w=0 ;w<N;w++){
array[w] = 1;
}
cutilSafeCall(cudaMemcpy(D_array,array,size_array,cudaMemcpy
HostToDevice));
size_t size_result = grid.x * sizeof(float);
result = (float*) malloc(size_result);
cutilSafeCall(cudaMalloc((void**)&D_result, size_result));
//cutilSafeCall(cudaMemcpy(D_result,array,size,cudaMemcpyHostT
oDevice));
sum<<<grid,threads>>>(D_array,D_result);
//[debug]
cuMemGetInfo( &free_mem, &total_mem );
used_mem = total_mem-free_mem;
printf(“total mem: %0.3f MB, free: %0.3f MB, used : %0.3f MB\n”,
((double)total_mem)/1024.0/1024.0,
((double)free_mem )/1024.0/1024.0,
((double)used_mem )/1024.0/1024.0 );
//[end debug]
//cutilSafeCall(cudaMemcpy(C.elements,d_C.elements,sizeC,cudaM
emcpyDeviceToHost));
cutilSafeCall(cudaMemcpy(result,D_result,size_result,cudaMem
cpyDeviceToHost));
//print result and free the memory
printf("The total sum = %f \n",result[0]);
free(array);
free(result);
cudaFree(D_result);
cudaFree(D_array);
return 0;
}
[/codebox]