Hi,
I’m trying to pass an integer array to a CUDA global routine, which then passes the array as a parameter to a CUDA device routine. (This is actually part of a numerical integration routine I’m working on, but I’ve pared down that code to this simpler one which still produces the same error.)
I can’t figure out why the integer array is not reproduced correctly in the output of the global routine “routine1” and the device routine “routine2”. Note I’m using the function cuPrintf() to output the arrays within these device functions. Also I’m not clear why the integer array is changed when it is copied back from the device memory into the host array h_conn2. I’m running the code on a Tesla C1060, compute capability 1.3.
Any suggestions on how to improve/fix this code so that all three outputs match the original integer array would be greatly appreciated. Thanks!
Dan
1 #include <math.h>
2 #include <time.h>
3 #include <stdlib.h>
4 #include “cuPrintf.cuh”
5 #include “cuPrintf.cu”
6
7 #include <cuda_runtime.h>
8
9 global void routine1(int *connect, int unit_size){
10 device void routine2(int connect1, int unit_sz);
11
12 int i = blockDim.xblockIdx.x + threadIdx.x;
13 if (i==0){
14 for (int lo=0;lo<unit_size;lo++) cuPrintf(“routine1 %i %i\n”,lo,connec t[lo]);
15 }
16 routine2(connect,unit_size);
17
18 }
19
20 device void routine2(int connect1, int unit_sz){
21
22 int i = blockDim.xblockIdx.x + threadIdx.x;
23 if (i==0){
24 for (int lo=0;lo<unit_sz;lo++) cuPrintf(“routine2 %i %i\n”,lo,connect1 [lo]);
25 }
26 }
27
28 int main(void){
29
30 cudaPrintfInit();
31 int unit_size = 4;
32
33 int *h_conn = (int *)malloc(unit_size);
34 int *h_conn2 = (int *)malloc(unit_size);
35 for (int lo=0;lo<unit_size;lo++) h_conn[lo] = lo;
36 for (int lo=0;lo<unit_size;lo++)
printf(“host initial conn[%i] = %i\n”,lo,h_conn[lo]);
38
39 cudaError_t err = cudaSuccess;
40
41 int *d_conn = NULL;
42 err = cudaMalloc((void **)&d_conn,unit_size);
43
44 err = cudaMemcpy(d_conn,h_conn,unit_size,cudaMemcpyHostToDevice);
45
46 int threadsPerBlock = 4;
47 int blocksPerGrid = (unit_size + threadsPerBlock - 1)/ threadsPerBlock;
48 printf(“CUDA kernel launch with %d blocks of %d threads\n”,blocksPerGrid ,threadsPerBlock);
49 routine1<<<blocksPerGrid , threadsPerBlock>>>(d_conn, unit_size);
50
51 cudaPrintfDisplay(stdout,true);
52 cudaPrintfEnd();
53
54 err = cudaGetLastError();
55
56 err = cudaMemcpy(h_conn2,d_conn,unit_size,cudaMemcpyDeviceToHost);
57
58 for (int lo=0;lo<unit_size;lo++)
59 printf(“host final conn[%i] = %i\n”,lo,h_conn2[lo]);
60
61 err = cudaFree(d_conn);
62 free(h_conn);
63 free(h_conn2);
64
65 err = cudaDeviceReset();
66 return 0;
67
68 }
Output:
host initial conn[0] = 0
host initial conn[1] = 1
host initial conn[2] = 2
host initial conn[3] = 3
CUDA kernel launch with 1 blocks of 4 threads
[0, 0]: routine1 0 0
[0, 0]: routine1 1 1009519056
[0, 0]: routine1 2 1017692322
[0, 0]: routine1 3 1023114360
[0, 0]: routine2 0 0
[0, 0]: routine2 1 1009519056
[0, 0]: routine2 2 1017692322
[0, 0]: routine2 3 1023114360
host final conn[0] = 0
host final conn[1] = 0
host final conn[2] = 0
host final conn[3] = 0