Unusual Results when copying unsigned long long

I’m getting odd results out of this code, it just copies unsigned long long data, but sometimes it gives wrong results.

If it finds something wrong, it prints 2 files, the input to:

input_data

and the output (incorrect) to:

output_data

Just wanted to make sure I’m not doing something dumb. Also it would be good to know if anyone can reproduce this problem.

Some info:

I am using cuda 4.0 drivers, on a Tesla S1070.

I’ve used all 4 gpus on the machine with the same result.

I’ve tried compiling with nvcc 3.2 and nvcc 4.0.

The machine is 64 bit linux and gcc is version 4.4.3.

I compiled with:

nvcc -o curr -arch=sm_13 -O2 curr.cu

Most times it will give the right answer, but sometimes it doesn’t.

When it doesn’t the differences are always few and each is a pair of consecutive numbers.

I’m attaching the code as an attachment as well.

Thanks

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <cuda.h>

template <typename T>

void print_vector(const char * filename, const T * vec, const char * code, int n)

{

    FILE * file = fopen(filename,"w");

    for(int i = 0;i<n;++i)

        fprintf(file,code,vec[i]);

    fclose(file);

}

template <typename T>

__global__

void copy_function( T * A, T * B, int n) 

{

    int i = blockDim.x*blockIdx.x+threadIdx.x;

    if(i<n)

    {

        B[i] = A[i];

    }

}

int main(int argc, char * argv[])

{

    int n_iterations = 100;

    int device = 0;

    cudaSetDevice(device);

typedef unsigned long long uint64;

    typedef unsigned int uint32;

int N = 60000;

    int BLOCKSIZE = 128;

    int NBLOCKS = (N+BLOCKSIZE-1)/BLOCKSIZE;

uint64 * host_ptr = (uint64*)malloc(sizeof(uint64)*N);

    uint64 * host_ptr_2 = (uint64*)malloc(sizeof(uint64)*N);

for(int i = 0; i<N; i++)

        host_ptr[i] = (uint64)(((uint64)rand()<<32)^((uint64)rand()));

uint64 * device_ptr;  

    uint64 * device_ptr_2;

cudaMalloc(&device_ptr,sizeof(uint64)*N);

    cudaMalloc(&device_ptr_2,sizeof(uint64)*N);

    cudaMemcpy(device_ptr,host_ptr,sizeof(uint64)*N,cudaMemcpyHostToDevice);

printf("testing %d times on device %d\n",n_iterations,device);

    bool correct = true;

    for(int i = 0; i<n_iterations; i++) {

        copy_function<uint64><<<NBLOCKS,BLOCKSIZE>>>(device_ptr,device_ptr_2,N);

        cudaThreadSynchronize();

        memset((void*)host_ptr_2,0,sizeof(uint64)*N);

        cudaMemcpy(host_ptr_2,device_ptr_2,sizeof(uint64)*N,cudaMemcpyDeviceToHost);

for(int k=0; k<N; k++) {

            correct = host_ptr[k]==host_ptr_2[k];

            if(!correct)

                break;

        }

if(!correct)

        {

            printf("%d: Output incorrect on device %d\n",i,device);

            print_vector<uint64>("input_data",host_ptr,"%llu\n",N);

            print_vector<uint64>("output_data",host_ptr_2,"%llu\n",N);

            break;

        }

    }

    if(correct)

        printf("Output correct %d times on device %d\n",n_iterations,device);

    cudaFree(device_ptr_2);

    cudaFree(device_ptr);

    free(host_ptr_2);

    free(host_ptr);

}

curr.cu (2.17 KB)