Hi everyone,
I just started learning CUDA, and I ran into something that confuses me.
Maybe my questions are really simple. (noob detected :D)
I have this code, with a data structure like this:
#include <stdio.h>

// data structure
typedef struct {
    int column[22];
    int total;
} row;

typedef row table[100];

// count the sum of each row
void count(table *p) {
    int i, j;
    int temp;
    for (i = 0; i < 100; i++) {
        temp = 0;
        for (j = 0; j < 22; j++) {
            temp = temp + (*p)[i].column[j];
        }
        (*p)[i].total = temp;
    }
}

// main program
int main() {
    int i, j;
    table p;
    // initialize table
    for (i = 0; i < 100; i++) {
        for (j = 0; j < 22; j++) {
            p[i].column[j] = j;
        }
        p[i].total = 0;
    }
    count(&p);
    return 0;
}
I’d like to calculate the sum of each row and store it in the total field.
My questions are simple:
- How can I allocate a data structure like that in device memory?
- How can I copy the data from host memory to device memory?
- How can I invoke the kernel so that the count procedure runs on the GPU?
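Here is roughly what I think those three steps should look like (just a rough sketch, not tested; the names h_table, d_table, and sumRows are placeholders I made up):

    // 1. Allocate space for the table in device memory
    row *d_table;
    cudaMalloc((void**)&d_table, 100 * sizeof(row));

    // 2. Copy the initialized host table to the device
    cudaMemcpy(d_table, h_table, 100 * sizeof(row), cudaMemcpyHostToDevice);

    // 3. Launch the kernel, one thread per row, then copy the results back
    sumRows<<<blocksPerGrid, threadsPerBlock>>>(d_table, 100);
    cudaMemcpy(h_table, d_table, 100 * sizeof(row), cudaMemcpyDeviceToHost);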
Thanks :)
I’m sorry… I have found out the solution, but I’m not really sure about it.
Please let me know if there’s something wrong with this, thanks again :)
#include <stdio.h>

typedef struct {
    int column[22];
    int total;
} row;

// Device code: each thread sums one row of the table
__global__ void VecAdd(row* A, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // global thread index = row index
    int j, temp;
    if (i < N) {  // guard: extra threads in the last block do nothing
        temp = 0;
        for (j = 0; j < 22; j++) {
            temp = temp + A[i].column[j];
        }
        A[i].total = temp;
    }
}

int main() {
    int i, j;
    int N = 100;
    size_t size = N * sizeof(row);

    // Allocate input vector h_P in host memory
    row* h_P = (row*)malloc(size);

    // Initialize table data
    for (i = 0; i < N; i++) {
        for (j = 0; j < 22; j++) {
            h_P[i].column[j] = j;
        }
        h_P[i].total = 0;
    }

    // Allocate vector d_P in device memory
    row* d_P;
    cudaMalloc((void**)&d_P, size);

    // Copy vector from host memory to device memory
    cudaMemcpy(d_P, h_P, size, cudaMemcpyHostToDevice);

    // Invoke kernel (I'm still confused about this part)
    int threadsPerBlock = 256;  // why 256?
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;  // why (N + threadsPerBlock - 1) / threadsPerBlock?
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_P, N);

    // Copy result from device memory to host memory
    cudaMemcpy(h_P, d_P, size, cudaMemcpyDeviceToHost);

    // Output
    for (i = 0; i < N; i++) {
        for (j = 0; j < 22; j++) {
            printf("%d ", h_P[i].column[j]);
        }
        printf("Total=%d\n", h_P[i].total);
    }

    // Free device memory
    cudaFree(d_P);

    // Free host memory
    free(h_P);

    return 0;
}
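About the two comments in the launch configuration, this is my current understanding, please correct me if I'm wrong: 256 is nothing special, it's just a commonly used block size that is a multiple of the warp size (32) and well within the hardware limit on threads per block. The (N + threadsPerBlock - 1) / threadsPerBlock expression is integer ceiling division, so there are always enough blocks to cover all N rows even when N is not a multiple of the block size. In my case N = 100 and threadsPerBlock = 256, so blocksPerGrid = (100 + 255) / 256 = 1, and that single block of 256 threads covers all 100 rows while the if (i < N) guard keeps the extra 156 threads from touching memory.

I also read that it is a good idea to check for errors after the kernel launch. Something like this, maybe (just a sketch of what I think the checking looks like, placed right after the VecAdd launch):

    cudaError_t err;

    // Check that the kernel launch itself was accepted
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
    }

    // Wait for the kernel to finish and catch any execution errors
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("Kernel execution failed: %s\n", cudaGetErrorString(err));
    }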