Wierd crash in modified VectorAdd example.

Debdatta · May 18, 2011, 1:22am

I have been experimenting with passing kernel arguments as global variables by initializing the variable using cudaMemcpyToSymbol. It worked at first. However, if in the class Arr, if i Just add an dummy data member after the float* myptr member, it crashes. What is even more perplexing is that moving the dummy data member below the Myptr data member resolves the problem.

I am on windows 7 64 bit , but the code is set to compile a 32 bit executable. I use a gtx 470 with driver 270.61, and cuda toolkit 3.2.

The problem file is attached below.

Debdatta Basu.
Tester.cu (4.23 KB)

hyqneuron · May 18, 2011, 7:43am

Can’t view the file.

I have no experience with 64-bit system, but I’m pretty sure it’s a problem with your pointer size. You can download the 64-bit 4.0 toolkit and compile it in 64-bit. Hopefully that will solve the problems.

Debdatta · May 18, 2011, 10:08am

I have attached it below:

/*

 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.

 *

 * Please refer to the NVIDIA end user license agreement (EULA) associated

 * with this source code for terms and conditions that govern your use of

 * this software. Any use, reproduction, disclosure, or distribution of

 * this software and related documentation outside the terms of the EULA

 * is strictly prohibited.

 *

 */

/* Vector addition: C = A + B.

 *

 * This sample is a very basic sample that implements element by element

 * vector addition. It is the same as the sample illustrating Chapter 3

 * of the programming guide with some additions like error checking.

 *

 */

// Includes

#include <stdio.h>

#include <cuda_runtime_api.h>

#include <cutil_inline.h>

// Variables

float* h_A;

float* h_B;

float* h_C;

float* d_A;

float* d_B;

float* d_C;

bool noprompt = false;

// Functions

void Cleanup(void);

void RandomInit(float*, int);

void ParseArguments(int, char**);

// Device code

class Arr

{

public:

	__host__ void init(float* p)

	{

		cudaMemcpyToSymbol(MyPtr, (void*)&p, sizeof(float*));

	}

	//The bug is here. Remove the int* Dummy, or place it after the Myptr to get a correct result.

	int* Dummy;

	float* MyPtr;

	

};

__device__ Arr A;

__device__ Arr B;

__device__ Arr C;

__global__ void VecAdd(int N)

{

	

	int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

		C.MyPtr[i] = A.MyPtr[i] + B.MyPtr[i];

}

// Host code

int main(int argc, char** argv)

{

    printf("Vector addition\n");

    int N = 50000;

    size_t size = N * sizeof(float);

    ParseArguments(argc, argv);

// Allocate input vectors h_A and h_B in host memory

    h_A = (float*)malloc(size);

    if (h_A == 0) Cleanup();

    h_B = (float*)malloc(size);

    if (h_B == 0) Cleanup();

    h_C = (float*)malloc(size);

    if (h_C == 0) Cleanup();

// Initialize input vectors

    RandomInit(h_A, N);

    RandomInit(h_B, N);

// Allocate vectors in device memory

    cutilSafeCall( cudaMalloc((void**)&d_A, size) );

    cutilSafeCall( cudaMalloc((void**)&d_B, size) );

    cutilSafeCall( cudaMalloc((void**)&d_C, size) );

// Copy vectors from host memory to device memory

    cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice) );

	//cudaMemcpyToSymbol(A.MyPtr, (void*)&d_A, sizeof(float*));

	//cudaMemcpyToSymbol(B.MyPtr, (void*)&d_B, sizeof(float*));

	//cudaMemcpyToSymbol(C.MyPtr, (void*)&d_C, sizeof(float*));

	A.init(d_A);

	B.init(d_B);

	C.init(d_C);

// Invoke kernel

    int threadsPerBlock = 256;

    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

VecAdd<<<blocksPerGrid, threadsPerBlock>>>(N);

    cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

    cutilSafeCall( cudaThreadSynchronize() );

#endif

// Copy result from device memory to host memory

    // h_C contains the result in host memory

    cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );

// Verify result

    int i;

    for (i = 0; i < N; ++i) {

        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-5)

            break;

    }

    printf("%s \n", (i == N) ? "PASSED" : "FAILED");

Cleanup();

}

void Cleanup(void)

{

    // Free device memory

    if (d_A)

        cudaFree(d_A);

    if (d_B)

        cudaFree(d_B);

    if (d_C)

        cudaFree(d_C);

// Free host memory

    if (h_A)

        free(h_A);

    if (h_B)

        free(h_B);

    if (h_C)

        free(h_C);

cutilSafeCall( cudaThreadExit() );

if (!noprompt) {

        printf("\nPress ENTER to exit...\n");

        fflush( stdout);

        fflush( stderr);

        getchar();

    }

exit(0);

}

// Allocates an array with random float entries.

void RandomInit(float* data, int n)

{

    for (int i = 0; i < n; ++i)

        data[i] = rand() / (float)RAND_MAX;

}

// Parse program arguments

void ParseArguments(int argc, char** argv)

{

    for (int i = 0; i < argc; ++i)

        if (strcmp(argv[i], "--noprompt") == 0 ||

			strcmp(argv[i], "-noprompt") == 0) 

		{

            noprompt = true;

            break;

        }

}

The pointer copying code doesn’t used hard coded pointer sizes. So that shouldn’t be a problem. I think this might be an alignment issue though Im not sure.

Debdatta · May 18, 2011, 1:02pm

I got it to work! Simply change the Arr class in the above code to the following:

class Arr

{

public:

	__host__ void init(float* p)

	{

		int offset = 0;

		ALIGN_UP(offset, __alignof(int*));

		offset+= sizeof(int*);

		ALIGN_UP(offset, __alignof(float*));

		cudaMemcpyToSymbol(Dummy, &p, sizeof(float*), offset);

	}

	//The bug is removed.

	

	int* Dummy;

	float* MyPtr;

} ;

All I have done is reference the Myptr in a roundabout way by specifying an offset from Dummy. It appears that the cudaMemcpyToSymbol call is having problems obtaining addresses to data members of the class other than the first member. Is this a bug?

-Debdatta Basu.

Debdatta · May 20, 2011, 9:21pm

Bump!

Sorry for that, but did anyone try it out? I want to get to the bottom of this issue.

Debdatta Basu.

hyqneuron · May 23, 2011, 12:53pm

How do you use MyPtr directly in host code when it’s supposed to exist only in device memory? Are you sure you are getting correct result?
I suppose in the second case you got it to run only because it’s no longer writing to address 0x00000000, but 0x00000004.

Either you do not put them as symbols or you get their symbol addresses before setting values in them.

Debdatta · May 24, 2011, 12:20pm

I am getting the correct result. The manual says that cudamemcpytosymbol can take symbol names. Hence, I can use MyPtr directly.

hyqneuron · May 25, 2011, 8:01am

const char*

that’s what it means by symbol name.

Topic		Replies	Views
Strange memory gremlins Getting pwned by pointers CUDA Programming and Performance	9	12277	July 1, 2009
Global arrays? CUDA Programming and Performance	24	10873	August 18, 2010
cudaErrorInvalidValue when using float4* and cudaMemcpyToSymbol CUDA Programming and Performance	2	1114	August 27, 2015
cudaMemcpyToSymbol can cause program crush ? CUDA Programming and Performance	0	743	July 1, 2010
Problem using cudamemcpytosymbol CUDA Programming and Performance	5	1266	October 27, 2013
Trying to use global device memory CUDA Programming and Performance	2	460	February 22, 2022
cudaMemcpyToSymbol returnes "invalid device symbol" CUDA Programming and Performance	12	36041	May 2, 2011
program crash when copying from device to host <br /> CUDA Programming and Performance	11	1979	March 31, 2009
cudaMemcpyToSymbol(.....) input parameter issue CUDA Programming and Performance	2	4485	October 20, 2009
invalid argument with cudaMemcpyToSymbol CUDA Programming and Performance	1	8315	November 28, 2011

Wierd crash in modified VectorAdd example.

Related topics