Hi,
i’m trying to write a program to count how many times a number is contained in certain part of an array and consequentially increment the relative index of the number in another array.
The code:
#include <stdio.h>
#include <cutil_inline.h>
#include <cuda_runtime_api.h>
#include "parallelcount.h"
#include "cuPrintf.cu"
__global__ void VecAdd(int* d_vecE, int* B, int offs, int endset)
{
int i = threadIdx.x;
i += offs;
if (i < endset)
{
B[d_vecE[i]]++;
cuPrintf("i: %d, Value is: %d\n",i, B[d_vecE[i]]);
}
}
extern "C"
int functionCUDA(int* NghCounts, sparsegraph *sg, int *lab, int ind0, int ind2MainThr )
{
int* d_vecE;
int* d_vecNC;
int i = 0;
int iterations = 0;
int j1 = 0;
int iend1 = 0;
int threadsPerBlock = 256;
int blocksPerGrid = 0;
size_t sizeE = sg->elen;
size_t sizeNC = (sg->nv) * sizeof(int);
cudaPrintfInit();
// Allocate vectors in device memory
cutilSafeCall( cudaMalloc((void**)&d_vecE, sizeE) );
cutilSafeCall( cudaMalloc((void**)&d_vecNC, sizeNC) );
// Copy vectors from host memory to device memory
cutilSafeCall( cudaMemcpy(d_vecE, sg->e, sizeE, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_vecNC, NghCounts, sizeNC, cudaMemcpyHostToDevice) );
/*printf("N vertex: %d\n",sg->nv);
printf("Ind0: %d\n",ind0);
printf("ind2MainThr: %d\n",ind2MainThr);
printf("elen: %d\n",sg->elen);
*/
for (i = ind0; i < ind2MainThr; i++) {
j1 = sg->v[lab[i]];
iend1 = j1+sg->d[lab[i]];
printf("%d) j1: %d\n",i-ind0+1,j1);
printf("%d) iend1: %d\n",i-ind0+1,iend1);
iterations = sg->d[lab[i]];
printf("%d) iterations: %d\n",i-ind0+1,iterations);
blocksPerGrid = (iterations + threadsPerBlock - 1) / threadsPerBlock;
printf("%d) blocksPerGrid: %d\n",i-ind0+1,blocksPerGrid);
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_vecE, d_vecNC, j1, iend1);
cudaPrintfDisplay(stdout, true);
printf("%d\n",i-ind0+1);
cutilCheckMsg("kernel launch failure");
}
#ifdef _DEBUG
cutilSafeCall( cudaThreadSynchronize() );
#endif
// Copy result from device memory to host memory
cutilSafeCall( cudaMemcpy(NghCounts, d_vecNC, sizeNC, cudaMemcpyDeviceToHost) );
// Free device memory
if (d_vecE)
cudaFree(d_vecE);
if (d_vecNC)
cudaFree(d_vecNC);
cudaPrintfEnd();
return 0;
}
sg->e is the array containing the numbers, NghCounts is the array containing the count of the numbers in the sg->e.
j1 is the offset from i have to start searching in sg->e, iend1 is the offset i have to stop searching in sg->e, so if j1==5 and end1==7 i have to search in sg->e[5],sg->e[6],sg->e[7]. Again if sg->e[5]== 10, sg->e[6]==20, sg->e[7]==2, i have to do: NghCounts[2]++, NghCounts[10]++, NghCounts[20]++.
So what is the problem?
The problem is that running this code, i get “cutilCheckMsg() CUTIL CUDA error : kernel launch failure : unspecified launch failure.” on the line after the kernel call.
If i remove the line : “i += offs;” in the kernel function, i get non error but obviously is not a solution because i have to check only a part of the array and deleting that line i suppose to check all the array, right?
So what i’m doing wrong?
Thanks