#include <stdlib.h>
#include <stdio.h>
/*
__device__ __forceinline__ int get_global_index(int *a, int base)
{
for(int i=0;i<3;i++){
a[i+base]=base;
}
return base;
}*/
/*
 * Each thread fills one 3-element segment of `array` and records that
 * segment's starting offset in `result`.
 *
 * For a thread with global index t (computed from the x dimension only):
 *   result[t]                  = 3*t
 *   array[3*t .. 3*t + 2]      = 3*t
 *
 * Parameters:
 *   array  - device buffer; must hold at least 3 ints per launched thread.
 *   result - device buffer; must hold at least 1 int per launched thread.
 *   row    - not read by the kernel body.
 *
 * There is no bounds guard, so the launch configuration must not create
 * more threads than the buffers were sized for.
 */
__global__ void kernel1(int *array, int *result, int row)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int base = index * 3;  /* first element of this thread's segment */

    result[index] = base;
    for (int i = 0; i < 3; i++) {
        array[i + base] = base;
    }
    // get_global_index(&array[0], base);
}
/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns 1 when err is cudaSuccess, 0 otherwise.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return 0;
    }
    return 1;
}

/*
 * Reads a 3x3 integer matrix from stdin, runs kernel1 over it with one
 * thread per group of three elements, then prints the matrix and the
 * per-thread result values copied back from the device.
 *
 * Returns 0 on success and 1 if input, allocation, or any CUDA call fails.
 */
int main(void)
{
    /* All locals are declared before the first goto so that no jump to
     * `cleanup` bypasses an initialisation. */
    const int dim = 3;
    const int cells = dim * dim;
    /* The matrix has dim*dim elements; kernel1 writes 3 ints per thread,
     * so both the host and device matrix buffers must hold all 9 ints. */
    const size_t matrixBytes = cells * sizeof(int);
    const size_t resultBytes = dim * sizeof(int);
    int *A = NULL, *G_A = NULL, *R = NULL, *G_R = NULL;
    int aa;
    int status = 1;
    cudaError_t err;

    A = (int *)malloc(matrixBytes);
    R = (int *)malloc(resultBytes);
    if (A == NULL || R == NULL) {
        fprintf(stderr, "host allocation failed\n");
        goto cleanup;
    }

    if (!cuda_ok(cudaMalloc((void **)&G_A, matrixBytes), "cudaMalloc G_A") ||
        !cuda_ok(cudaMalloc((void **)&G_R, resultBytes), "cudaMalloc G_R")) {
        goto cleanup;
    }

    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            if (scanf("%d", &aa) != 1) {
                fprintf(stderr, "expected %d integers on stdin\n", cells);
                goto cleanup;
            }
            A[i + j * dim] = aa;
        }
    }

    if (!cuda_ok(cudaMemcpy(G_A, A, matrixBytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy A -> G_A")) {
        goto cleanup;
    }

    kernel1<<<1, dim>>>(G_A, G_R, dim);
    /* A launch does not return a status: configuration errors surface via
     * cudaGetLastError, execution faults at the next synchronising call. */
    if (!cuda_ok(cudaGetLastError(), "kernel1 launch") ||
        !cuda_ok(cudaDeviceSynchronize(), "kernel1 execution")) {
        goto cleanup;
    }

    err = cudaMemcpy(R, G_R, resultBytes, cudaMemcpyDeviceToHost);
    printf("CUDA Memcpy G_R : %s\n", cudaGetErrorString(err));
    if (err != cudaSuccess) {
        goto cleanup;
    }

    err = cudaMemcpy(A, G_A, matrixBytes, cudaMemcpyDeviceToHost);
    printf("CUDA Memcpy G_A : %s\n", cudaGetErrorString(err));
    if (err != cudaSuccess) {
        goto cleanup;
    }

    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            printf("%d\t", A[i + j * dim]);
        }
        printf("\n");
    }
    for (int i = 0; i < dim; i++) {
        printf("The answer is %d\n", R[i]);
    }
    status = 0;

cleanup:
    /* free(NULL) and cudaFree(NULL) are both no-ops, so partial setup is safe. */
    free(R);
    free(A);
    cudaFree(G_A);
    cudaFree(G_R);
    return status;
}