Confusing Execution Output

Hi,

I’m trying a simple GPU-based array copy and assignment with the following code:

[codebox]#include <iostream>
#include <cuda.h>

using namespace std;

__global__ void doNothing(int *A, int *B)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    A[i] = B[i] = i;
}

int main(int argc, char **argv)
{
    int n = (argc == 2 ? atoi(argv[1]) : 25);
    int a[n], b[n];

    int block_size, block_no;
    block_size = 64;
    block_no = (int)((n - 1) / block_size) + 1;

    dim3 dimBlock(block_size, 1, 1);
    dim3 dimGrid(block_no, 1, 1);

    for (int i = 0; i < n; i++)
        a[i] = rand() % 35, b[i] = rand() % 25;

    int *a_d, *b_d;
    cudaMalloc((void **)&a_d, n);
    cudaMalloc((void **)&b_d, n);

    cudaMemcpy(a_d, a, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(b_d, b, n * sizeof(int), cudaMemcpyHostToDevice);

    doNothing<<<block_no, block_size>>>(a_d, b_d);
    cudaThreadSynchronize();

    cudaMemcpy(a, a_d, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(b, b_d, n * sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < n; i++)
        cout << "a[" << i << "]= " << a[i] << " ---- b[" << i << "]= " << b[i] << endl;

    cudaFree(a_d);
    cudaFree(b_d);

    return 0;
}[/codebox]

Now, with just 25 elements, I get the output:

[codebox]$ ./cudaarray2
a[0]= 0 ---- b[0]= 0
a[1]= 1 ---- b[1]= 1
a[2]= 2 ---- b[2]= 2
a[3]= 3 ---- b[3]= 3
a[4]= 4 ---- b[4]= 4
a[5]= 5 ---- b[5]= 5
a[6]= 6 ---- b[6]= 6
a[7]= 7 ---- b[7]= 7
a[8]= 8 ---- b[8]= 8
a[9]= 9 ---- b[9]= 9
a[10]= 10 ---- b[10]= 10
a[11]= 11 ---- b[11]= 11
a[12]= 12 ---- b[12]= 12
a[13]= 13 ---- b[13]= 13
a[14]= 14 ---- b[14]= 14
a[15]= 15 ---- b[15]= 15
a[16]= 16 ---- b[16]= 16
a[17]= 17 ---- b[17]= 17
a[18]= 18 ---- b[18]= 18
a[19]= 19 ---- b[19]= 19
a[20]= 20 ---- b[20]= 20
a[21]= 21 ---- b[21]= 21
a[22]= 22 ---- b[22]= 22
a[23]= 23 ---- b[23]= 23
a[24]= 24 ---- b[24]= 24[/codebox]

while with 65 elements, I get the following output:

[codebox]$ ./cudaarray2 65
a[0]= 0 ---- b[0]= 64
a[1]= 1 ---- b[1]= 65
a[2]= 2 ---- b[2]= 66
a[3]= 3 ---- b[3]= 67
a[4]= 4 ---- b[4]= 68
a[5]= 5 ---- b[5]= 69
a[6]= 6 ---- b[6]= 70
a[7]= 7 ---- b[7]= 71
a[8]= 8 ---- b[8]= 72
a[9]= 9 ---- b[9]= 73
a[10]= 10 ---- b[10]= 74
a[11]= 11 ---- b[11]= 75
a[12]= 12 ---- b[12]= 76
a[13]= 13 ---- b[13]= 77
a[14]= 14 ---- b[14]= 78
a[15]= 15 ---- b[15]= 79
a[16]= 16 ---- b[16]= 80
a[17]= 17 ---- b[17]= 81
a[18]= 18 ---- b[18]= 82
a[19]= 19 ---- b[19]= 83
a[20]= 20 ---- b[20]= 84
a[21]= 21 ---- b[21]= 85
a[22]= 22 ---- b[22]= 86
a[23]= 23 ---- b[23]= 87
a[24]= 24 ---- b[24]= 88
a[25]= 25 ---- b[25]= 89
a[26]= 26 ---- b[26]= 90
a[27]= 27 ---- b[27]= 91
a[28]= 28 ---- b[28]= 92
a[29]= 29 ---- b[29]= 93
a[30]= 30 ---- b[30]= 94
a[31]= 31 ---- b[31]= 95
a[32]= 32 ---- b[32]= 96
a[33]= 33 ---- b[33]= 97
a[34]= 34 ---- b[34]= 98
a[35]= 35 ---- b[35]= 99
a[36]= 36 ---- b[36]= 100
a[37]= 37 ---- b[37]= 101
a[38]= 38 ---- b[38]= 102
a[39]= 39 ---- b[39]= 103
a[40]= 40 ---- b[40]= 104
a[41]= 41 ---- b[41]= 105
a[42]= 42 ---- b[42]= 106
a[43]= 43 ---- b[43]= 107
a[44]= 44 ---- b[44]= 108
a[45]= 45 ---- b[45]= 109
a[46]= 46 ---- b[46]= 110
a[47]= 47 ---- b[47]= 111
a[48]= 48 ---- b[48]= 112
a[49]= 49 ---- b[49]= 113
a[50]= 50 ---- b[50]= 114
a[51]= 51 ---- b[51]= 115
a[52]= 52 ---- b[52]= 116
a[53]= 53 ---- b[53]= 117
a[54]= 54 ---- b[54]= 118
a[55]= 55 ---- b[55]= 119
a[56]= 56 ---- b[56]= 120
a[57]= 57 ---- b[57]= 121
a[58]= 58 ---- b[58]= 122
a[59]= 59 ---- b[59]= 123
a[60]= 60 ---- b[60]= 124
a[61]= 61 ---- b[61]= 125
a[62]= 62 ---- b[62]= 126
a[63]= 63 ---- b[63]= 127
a[64]= 64 ---- b[64]= 64[/codebox]

Now, I know I’m hitting some limit, but how is b[0] getting set to the wrong value of 64? The problem is that I cannot figure out which limit I’m hitting that causes it to wrap around. The same issue crops up if I increase the block_size variable in the code to anything over 64.

My second question: a[i] and b[i] are set on the same line of code. Does CUDA parallelize individual lines of code across threads too, or only kernel launches?

For clarity, my system config is:

MacBook Pro / 8600M GT, Compute Capability 1.1

[codebox]$ deviceQuery
There is 1 device supporting CUDA

Device 0: "GeForce 8600M GT"
Major revision number: 1
Minor revision number: 1
Total amount of global memory: 268238848 bytes
Number of multiprocessors: 4
Number of cores: 32
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 0.75 GHz
Concurrent copy and execution: Yes[/codebox]

  • Sahil

You probably want to do some bounds checking on your array:

__global__ void doNothing(int *A, int *B, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        A[i] = B[i] = i;
    }
}
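With the extra parameter, the launch would then need to pass the element count along as well, something like this (using the same n, a_d, and b_d names as in your host code):

doNothing<<<block_no, block_size>>>(a_d, b_d, n);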

You also probably want to allocate n*sizeof(int) bytes for your device arrays.
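That is, something along these lines (same pointer names as in your code):

cudaMalloc((void **)&a_d, n * sizeof(int));
cudaMalloc((void **)&b_d, n * sizeof(int));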

I didn’t notice any other errors.

Oh blimey! Thanks for the pointer, MisterAnderson. The malloc sure was faulty… I guess I’d better refresh my knowledge before I go further.

  • Sahil