I am a new beginner in China,and I am coding a example of stencil_1d in a NVIDIA <<CUDA C/C++ Basics>>PDF,
the input array size is 16 and every item is 1.
//#ifndef __CUDACC__
//#define __CUDACC__
//#endif
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<malloc.h>
//以下定义可以使用__syncthreads();
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include <device_launch_parameters.h>
#define BLOCKSIZE 16
#define DATASIZE BLOCKSIZE * sizeof(int)
#define RADIUS 3
__global__ void stencil_1d(int *in, int *out)
{
__shared__ int temp[BLOCKSIZE + 2 * RADIUS]; //临时数组-22个元素
int gindex = threadIdx.x + blockDim.x*blockIdx.x;//全局线程索引
int lindex = threadIdx.x + RADIUS;//临时数组中的结果索引
//向共享内存(中的临时数组)读取数据
temp[lindex] = in[gindex];
if (gindex < RADIUS)
{
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCKSIZE] = in[gindex + BLOCKSIZE];
}
__syncthreads();
//cudaThreadSynchronize();//cuda新的同步函数??
//使用漏字板
int sum = 0;
for (int offset = -RADIUS; offset <= RADIUS; offset++)
{
sum += temp[lindex + offset];
}
out[gindex] = sum;
//printf("Test%6d-%4d\n", gindex, out[gindex]);//----------There is a problem,when I added this sentence the result is right,but if not, the result is wrong,the rest of the code is same as before,----------
}
int main()
{
int *gpu_data_in, *gpu_data_out;
int *cpu_data;
cpu_data = (int *)malloc(DATASIZE);
for (int i = 0; i < BLOCKSIZE; i++)//初始化cpu数组
{
cpu_data[i] = 1;
}
cudaMalloc((int **)&gpu_data_in, DATASIZE);
cudaMalloc((int **)&gpu_data_out, DATASIZE);
cudaMemcpy(gpu_data_in, cpu_data, DATASIZE, cudaMemcpyHostToDevice);//内存数据复制到设备
stencil_1d << <1, BLOCKSIZE >> > (gpu_data_in, gpu_data_out);
cudaMemcpy(cpu_data, gpu_data_out, DATASIZE, cudaMemcpyDeviceToHost);//设备数据复制到内存
for (int i = 0; i < BLOCKSIZE; i++)
{
printf("%d\t", cpu_data[i]);
if ((i + 1) % 8 == 0)
printf("\n");
}
cudaFree(gpu_data_in);
cudaFree(gpu_data_out);
return 0;
}
This line has a problem.
printf(“Test%6d-%4d\n”, gindex, out[gindex]);//when I added this sentence the result is right,but if not, the result is wrong,the rest of the code is same as before,
for example,when the BLOCKSIZE is 16,the result is
1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1
and this is wrong
(without a printf code)
and
Test 0-4
Test 1-5
Test 2-6
Test 3-7
Test 4-7
Test 5-7
Test 6-7
Test 7-7
Test 8-7
Test 9-7
Test 10-7
Test 11-7
Test 12-7
Test 13-6
Test 14-5
Test 15-4
4 5 6 7 7 7 7 7
7 7 7 7 7 6 5 4
(with a printf code)the result is right
but I couldn’t understand why? How can I fix it and without a printf code?