Hi,
I’m trying a simple GPU based array copy and assignment. With the following code :
[codebox]#include
#include <cuda.h>
using namespace std;
global void doNothing(int *A,int *B )
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
A[i] = B[i] = i;
}
int main(int argc,char **argv)
{
int n=(argc==2?atoi(argv[1]):25);
int a[n],b[n];
int block_size, block_no;
block_size=64;
block_no = (int)((n-1)/block_size)+1;
dim3 dimBlock(block_size,1,1);
dim3 dimGrid(block_no,1,1);
for(int i=0;i<n;i++)
a[i]=rand()%35,b[i]=rand()%25;
int *a_d,*b_d;
cudaMalloc((void **)&a_d,n);
cudaMalloc((void **)&b_d,n);
cudaMemcpy(a_d,a,n*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(b_d,b,n*sizeof(int),cudaMemcpyHostToDevice);
doNothing<<<block_no,block_size>>>(a_d,b_d);
cudaThreadSynchronize();
cudaMemcpy(a,a_d,n*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(b,b_d,n*sizeof(int),cudaMemcpyDeviceToHost);
for(int i=0;i<n;i++)
cout<<"a["<<i<<"]= "<<a[i]<<" ---- b["<<i<<"]= "<<b[i]<<endl;
cudaFree(a_d);
cudaFree(b_d);
return 0;
}[/codebox]
Now, with just 25 elements, i get the output :
[codebox]$ ./cudaarray2
a[0]= 0 ---- b[0]= 0
a[1]= 1 ---- b[1]= 1
a[2]= 2 ---- b[2]= 2
a[3]= 3 ---- b[3]= 3
a[4]= 4 ---- b[4]= 4
a[5]= 5 ---- b[5]= 5
a[6]= 6 ---- b[6]= 6
a[7]= 7 ---- b[7]= 7
a[8]= 8 ---- b[8]= 8
a[9]= 9 ---- b[9]= 9
a[10]= 10 ---- b[10]= 10
a[11]= 11 ---- b[11]= 11
a[12]= 12 ---- b[12]= 12
a[13]= 13 ---- b[13]= 13
a[14]= 14 ---- b[14]= 14
a[15]= 15 ---- b[15]= 15
a[16]= 16 ---- b[16]= 16
a[17]= 17 ---- b[17]= 17
a[18]= 18 ---- b[18]= 18
a[19]= 19 ---- b[19]= 19
a[20]= 20 ---- b[20]= 20
a[21]= 21 ---- b[21]= 21
a[22]= 22 ---- b[22]= 22
a[23]= 23 ---- b[23]= 23
a[24]= 24 ---- b[24]= 24[/codebox]
while with 65 elements, I get the following output:
[codebox]$ ./cudaarray2 65
a[0]= 0 ---- b[0]= 64
a[1]= 1 ---- b[1]= 65
a[2]= 2 ---- b[2]= 66
a[3]= 3 ---- b[3]= 67
a[4]= 4 ---- b[4]= 68
a[5]= 5 ---- b[5]= 69
a[6]= 6 ---- b[6]= 70
a[7]= 7 ---- b[7]= 71
a[8]= 8 ---- b[8]= 72
a[9]= 9 ---- b[9]= 73
a[10]= 10 ---- b[10]= 74
a[11]= 11 ---- b[11]= 75
a[12]= 12 ---- b[12]= 76
a[13]= 13 ---- b[13]= 77
a[14]= 14 ---- b[14]= 78
a[15]= 15 ---- b[15]= 79
a[16]= 16 ---- b[16]= 80
a[17]= 17 ---- b[17]= 81
a[18]= 18 ---- b[18]= 82
a[19]= 19 ---- b[19]= 83
a[20]= 20 ---- b[20]= 84
a[21]= 21 ---- b[21]= 85
a[22]= 22 ---- b[22]= 86
a[23]= 23 ---- b[23]= 87
a[24]= 24 ---- b[24]= 88
a[25]= 25 ---- b[25]= 89
a[26]= 26 ---- b[26]= 90
a[27]= 27 ---- b[27]= 91
a[28]= 28 ---- b[28]= 92
a[29]= 29 ---- b[29]= 93
a[30]= 30 ---- b[30]= 94
a[31]= 31 ---- b[31]= 95
a[32]= 32 ---- b[32]= 96
a[33]= 33 ---- b[33]= 97
a[34]= 34 ---- b[34]= 98
a[35]= 35 ---- b[35]= 99
a[36]= 36 ---- b[36]= 100
a[37]= 37 ---- b[37]= 101
a[38]= 38 ---- b[38]= 102
a[39]= 39 ---- b[39]= 103
a[40]= 40 ---- b[40]= 104
a[41]= 41 ---- b[41]= 105
a[42]= 42 ---- b[42]= 106
a[43]= 43 ---- b[43]= 107
a[44]= 44 ---- b[44]= 108
a[45]= 45 ---- b[45]= 109
a[46]= 46 ---- b[46]= 110
a[47]= 47 ---- b[47]= 111
a[48]= 48 ---- b[48]= 112
a[49]= 49 ---- b[49]= 113
a[50]= 50 ---- b[50]= 114
a[51]= 51 ---- b[51]= 115
a[52]= 52 ---- b[52]= 116
a[53]= 53 ---- b[53]= 117
a[54]= 54 ---- b[54]= 118
a[55]= 55 ---- b[55]= 119
a[56]= 56 ---- b[56]= 120
a[57]= 57 ---- b[57]= 121
a[58]= 58 ---- b[58]= 122
a[59]= 59 ---- b[59]= 123
a[60]= 60 ---- b[60]= 124
a[61]= 61 ---- b[61]= 125
a[62]= 62 ---- b[62]= 126
a[63]= 63 ---- b[63]= 127
a[64]= 64 ---- b[64]= 64[/codebox]
Now, I know I’m hitting some limit, but how is b[0] getting set to a wrong value of 64? The problem is I cannot figure what limit I’m hitting which is causing it to wrap around. The same issue crops up if I increase the block_size variable in the code to anything over 64.
The second question is that a[i] and b[i] are set on the same line of code, does CUDA parallelize lines of code across threads too? or only kernels?
For clarity, my system config is :
Mac Book Pro/ 8600M GT, Compute Capability 1.1
[codebox]$ deviceQuery
There is 1 device supporting CUDA
Device 0: “GeForce 8600M GT”
Major revision number: 1
Minor revision number: 1
Total amount of global memory: 268238848 bytes
Number of multiprocessors: 4
Number of cores: 32
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 8192
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 0.75 GHz
Concurrent copy and execution: Yes[/codebox]
- Sahil