I’m getting odd results out of this code, it just copies unsigned long long data, but sometimes it gives wrong results.
If it finds something wrong, it prints 2 files, the input to:
input_data
and the output (incorrect) to:
output_data
Just wanted to make sure I’m not doing something dumb. Also it would be good to know if anyone can reproduce this problem.
Some info:
I am using cuda 4.0 drivers, on a Tesla S1070.
I’ve used all 4 gpus on the machine with the same result.
I’ve tried compiling with nvcc 3.2 and nvcc 4.0.
The machine is 64 bit linux and gcc is version 4.4.3.
I compiled with:
nvcc -o curr -arch=sm_13 -O2 curr.cu
Most times it will give the right answer, but sometimes it doesn’t.
When it doesn’t the differences are always few and each is a pair of consecutive numbers.
I’m attaching the code as an attachment as well.
Thanks
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
template <typename T>
void print_vector(const char * filename, const T * vec, const char * code, int n)
{
FILE * file = fopen(filename,"w");
for(int i = 0;i<n;++i)
fprintf(file,code,vec[i]);
fclose(file);
}
template <typename T>
__global__
void copy_function( T * A, T * B, int n)
{
int i = blockDim.x*blockIdx.x+threadIdx.x;
if(i<n)
{
B[i] = A[i];
}
}
int main(int argc, char * argv[])
{
int n_iterations = 100;
int device = 0;
cudaSetDevice(device);
typedef unsigned long long uint64;
typedef unsigned int uint32;
int N = 60000;
int BLOCKSIZE = 128;
int NBLOCKS = (N+BLOCKSIZE-1)/BLOCKSIZE;
uint64 * host_ptr = (uint64*)malloc(sizeof(uint64)*N);
uint64 * host_ptr_2 = (uint64*)malloc(sizeof(uint64)*N);
for(int i = 0; i<N; i++)
host_ptr[i] = (uint64)(((uint64)rand()<<32)^((uint64)rand()));
uint64 * device_ptr;
uint64 * device_ptr_2;
cudaMalloc(&device_ptr,sizeof(uint64)*N);
cudaMalloc(&device_ptr_2,sizeof(uint64)*N);
cudaMemcpy(device_ptr,host_ptr,sizeof(uint64)*N,cudaMemcpyHostToDevice);
printf("testing %d times on device %d\n",n_iterations,device);
bool correct = true;
for(int i = 0; i<n_iterations; i++) {
copy_function<uint64><<<NBLOCKS,BLOCKSIZE>>>(device_ptr,device_ptr_2,N);
cudaThreadSynchronize();
memset((void*)host_ptr_2,0,sizeof(uint64)*N);
cudaMemcpy(host_ptr_2,device_ptr_2,sizeof(uint64)*N,cudaMemcpyDeviceToHost);
for(int k=0; k<N; k++) {
correct = host_ptr[k]==host_ptr_2[k];
if(!correct)
break;
}
if(!correct)
{
printf("%d: Output incorrect on device %d\n",i,device);
print_vector<uint64>("input_data",host_ptr,"%llu\n",N);
print_vector<uint64>("output_data",host_ptr_2,"%llu\n",N);
break;
}
}
if(correct)
printf("Output correct %d times on device %d\n",n_iterations,device);
cudaFree(device_ptr_2);
cudaFree(device_ptr);
free(host_ptr_2);
free(host_ptr);
}
curr.cu (2.17 KB)