Hello,
I have been working on a CUDA project although I’ve been experiencing errors along the way so I decided to reduce the program down to its simplest form to test the functionality. There is still an incorrect output for this very simple program, which has created the idea that maybe the issue lies in the TX2 board itself or problems with CUDA running on the board.
Sample programs do work on the board such as deviceQuery,vectorAdd,bandwidthTest and so on. Maybe there is actually an error within this program that I am not seeing, but there are no problems compiling and no issues with cuda-memcheck. Below is a sample code ready to be compiled. Let me know if you receive similar or different output.
//#include <cufft.h>
//#include <cuda_runtime.h>
//#include <cuda.h>
//#include <cuda_device_runtime_api.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#define height (2048ULL)
#define width (2448ULL)
#define size (height*width)
__global__ void datatransfer(float *f)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>0 && x<width && y>0 && y< height) {
f[width*y+x] = 1.0f;
}
}
int main()
{
float *array = new float;
float *dev_array;
for (int i=0;i<size;i++)
{
array[i] = i;
}
std::cout<<" "<<std::endl;
std::cout<<"Cuda Array test: " <<std::endl;
for (int i=0;i<5;i++) {
std::cout<<array[10*i] <<" "<< array[20*i] <<" "<< array[30*i] <<" "<< array[40*i] <<" "<< array[50*i] <<std::endl;
}
if (cudaMalloc((void **)&dev_array,size* sizeof(float)) != cudaSuccess)
{
fprintf(stderr,"Cuda Error: failed to allocate dev_array");
}
cudaMemcpy(dev_array,array,size* sizeof(float),cudaMemcpyHostToDevice);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
}
dim3 threadsPerBlock(32,32);
dim3 numBlocks((width+threadsPerBlock.x-1)/threadsPerBlock.x,(height+threadsPerBlock.y-1)/threadsPerBlock.y);
datatransfer<<<numBlocks,threadsPerBlock>>>(dev_array);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute kernel\n");
}
cudaMemcpy(array,dev_array,size*sizeof(float),cudaMemcpyDeviceToHost);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to copy to f2_array\n");
}
std::cout<<" "<<std::endl;
std::cout<<"Cuda New Array test: " <<std::endl;
for (int i=0;i<5;i++) {
std::cout<<array[10*i] <<" "<< array[20*i] <<" "<< array[30*i] <<" "<< array[40*i] <<" "<< array[50*i] <<std::endl;
}
if (cudaFree(dev_array) != cudaSuccess)
{
fprintf(stderr,"Failed to free dev_array\n");
}
return 0;
}
nvcc f_to_f2.cu -I/usr/local/cuda/include -I/usr/include -L/usr/local/cuda/lib64 -lcudart -o f_to_f2
cuda-memcheck ./f_to_f2
========= CUDA-MEMCHECK
Cuda Array test:
0 0 0 0 0
10 20 30 40 50
20 40 60 80 100
30 60 90 120 150
40 80 120 160 200
Cuda Float2 Array test:
0 0 0 0 0
10 20 30 40 50
20 40 60 80 100
30 60 90 120 150
40 80 120 160 200
========= ERROR SUMMARY: 0 errors