Wierd crash in modified VectorAdd example.

I have been experimenting with passing kernel arguments as global variables by initializing the variable using cudaMemcpyToSymbol. It worked at first. However, if in the class Arr, if i Just add an dummy data member after the float* myptr member, it crashes. What is even more perplexing is that moving the dummy data member below the Myptr data member resolves the problem.

I am on windows 7 64 bit , but the code is set to compile a 32 bit executable. I use a gtx 470 with driver 270.61, and cuda toolkit 3.2.

The problem file is attached below.

Debdatta Basu.
Tester.cu (4.23 KB)

Can’t view the file.

I have no experience with 64-bit system, but I’m pretty sure it’s a problem with your pointer size. You can download the 64-bit 4.0 toolkit and compile it in 64-bit. Hopefully that will solve the problems.

I have attached it below:

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 */

/* Vector addition: C = A + B.

 *

 * This sample is a very basic sample that implements element by element

 * vector addition. It is the same as the sample illustrating Chapter 3

 * of the programming guide with some additions like error checking.

 *

 */

// Includes

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cutil_inline.h>

// Variables

float* h_A;

float* h_B;

float* h_C;

float* d_A;

float* d_B;

float* d_C;

bool noprompt = false;

// Functions

void Cleanup(void);

void RandomInit(float*, int);

void ParseArguments(int, char**);

// Device code

class Arr

{

public:

	__host__ void init(float* p)

	{

		cudaMemcpyToSymbol(MyPtr, (void*)&p, sizeof(float*));

	}

	//The bug is here. Remove the int* Dummy, or place it after the Myptr to get a correct result.

	int* Dummy;

	float* MyPtr;

	

};

__device__ Arr A;

__device__ Arr B;

__device__ Arr C;

__global__ void VecAdd(int N)

{

	

	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

		C.MyPtr[i] = A.MyPtr[i] + B.MyPtr[i];

}

// Host code

int main(int argc, char** argv)

{

    printf("Vector addition\n");

    int N = 50000;

    size_t size = N * sizeof(float);

    ParseArguments(argc, argv);

// Allocate input vectors h_A and h_B in host memory

    h_A = (float*)malloc(size);

    if (h_A == 0) Cleanup();

    h_B = (float*)malloc(size);

    if (h_B == 0) Cleanup();

    h_C = (float*)malloc(size);

    if (h_C == 0) Cleanup();

// Initialize input vectors

    RandomInit(h_A, N);

    RandomInit(h_B, N);

// Allocate vectors in device memory

    cutilSafeCall( cudaMalloc((void**)&d_A, size) );

    cutilSafeCall( cudaMalloc((void**)&d_B, size) );

    cutilSafeCall( cudaMalloc((void**)&d_C, size) );

// Copy vectors from host memory to device memory

    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );

	//cudaMemcpyToSymbol(A.MyPtr, (void*)&d_A, sizeof(float*));

	//cudaMemcpyToSymbol(B.MyPtr, (void*)&d_B, sizeof(float*));

	//cudaMemcpyToSymbol(C.MyPtr, (void*)&d_C, sizeof(float*));

	A.init(d_A);

	B.init(d_B);

	C.init(d_C);

// Invoke kernel

    int threadsPerBlock = 256;

    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

VecAdd<<<blocksPerGrid, threadsPerBlock>>>(N);

    cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

    cutilSafeCall( cudaThreadSynchronize() );

#endif

// Copy result from device memory to host memory

    // h_C contains the result in host memory

    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

// Verify result

    int i;

    for (i = 0; i < N; ++i) {

        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-5)

            break;

    }

    printf("%s \n", (i == N) ? "PASSED" : "FAILED");

Cleanup();

}

void Cleanup(void)

{

    // Free device memory

    if (d_A)

        cudaFree(d_A);

    if (d_B)

        cudaFree(d_B);

    if (d_C)

        cudaFree(d_C);

// Free host memory

    if (h_A)

        free(h_A);

    if (h_B)

        free(h_B);

    if (h_C)

        free(h_C);

cutilSafeCall( cudaThreadExit() );

if (!noprompt) {

        printf("\nPress ENTER to exit...\n");

        fflush( stdout);

        fflush( stderr);

        getchar();

    }

exit(0);

}

// Allocates an array with random float entries.

void RandomInit(float* data, int n)

{

    for (int i = 0; i < n; ++i)

        data[i] = rand() / (float)RAND_MAX;

}

// Parse program arguments

void ParseArguments(int argc, char** argv)

{

    for (int i = 0; i < argc; ++i)

        if (strcmp(argv[i], "--noprompt") == 0 ||

			strcmp(argv[i], "-noprompt") == 0) 

		{

            noprompt = true;

            break;

        }

}

The pointer copying code doesn’t used hard coded pointer sizes. So that shouldn’t be a problem. I think this might be an alignment issue though Im not sure.

I got it to work! Simply change the Arr class in the above code to the following:

class Arr

{

public:

	__host__ void init(float* p)

	{

		int offset = 0;

		ALIGN_UP(offset, __alignof(int*));

		offset+= sizeof(int*);

		ALIGN_UP(offset, __alignof(float*));

		cudaMemcpyToSymbol(Dummy, &p, sizeof(float*), offset);

	}

	//The bug is removed.

	

	int* Dummy;

	float* MyPtr;

} ;

All I have done is reference the Myptr in a roundabout way by specifying an offset from Dummy. It appears that the cudaMemcpyToSymbol call is having problems obtaining addresses to data members of the class other than the first member. Is this a bug?

-Debdatta Basu.

Bump!

Sorry for that, but did anyone try it out? I want to get to the bottom of this issue.

Debdatta Basu.

How do you use MyPtr directly in host code when it’s supposed to exist only in device memory? Are you sure you are getting correct result?
I suppose in the second case you got it to run only because it’s no longer writing to address 0x00000000, but 0x00000004.

Either you do not put them as symbols or you get their symbol addresses before setting values in them.

I am getting the correct result. The manual says that cudamemcpytosymbol can take symbol names. Hence, I can use MyPtr directly.

const char*

that’s what it means by symbol name.