Problem with a kernel example and SDK please help - it's very important for me

Hello there. I’ve got some problem with the sample from SDK. I changed one of them(vectorAdd sample). I work at win xp 32b sp3, visual studio 2008 and cuda 3.0.

When I only one file named vectorAdd.cu with code it works fine(does add +1 to each digit. After that I’ve got alphabet from A to Z):

// Device code

__global__ void VecAdd(float* A,  int N)

{

   int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

	  

	  A[i]=A[i]+i;

	

}

// Host code

int main(int argc, char** argv)

{

	

	int len=34;

	size_t size = len * sizeof(float);

	

	// Allocate input vectors h_A and h_B in host memory

	h_A = (float*)malloc(size);

	if (h_A == 0) Cleanup();

	// Initialize input vectors

	RandomInit(h_A, len);

	// Allocate vectors in device memory

	cutilSafeCall( cudaMalloc((void**)&d_A, size) );

	// Copy vectors from host memory to device memory

	cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

	// Invoke kernel

	int threadsPerBlock = 256;

	int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, len);

	cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

	cutilSafeCall( cudaThreadSynchronize() );

#endif

	// Copy result from device memory to host memory

	// h_A contains the result in host memory

	cutilSafeCall( cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost) );

	

	for(int ii=0; ii < len; ++ii)

	{

	

		std::cout<<"Literka "<<(char)h_A[ii];

		std::cout << std::endl;

	

	}

	

	Cleanup();

}

void Cleanup(void)

{

	// Free device memory

	if (d_A)

		cudaFree(d_A);

	// Free host memory

	if (h_A)

		free(h_A);

		

	cutilSafeCall( cudaThreadExit() );

	

	if (!noprompt) {

		printf("\nPress ENTER to exit...\n");

		fflush( stdout);

		fflush( stderr);

		getchar();

	}

	exit(0);

}

// Allocates an array with random float entries.

void RandomInit(float* data, int n)

{

	for (int i = 0; i < n; ++i)

		data[i] = 65;// rand() / (float)RAND_MAX;

}

// Parse program arguments

void ParseArguments(int argc, char** argv)

{

	for (int i = 0; i < argc; ++i)

		if (strcmp(argv[i], "--noprompt") == 0 ||

			strcmp(argv[i], "-noprompt") == 0) 

		{

			noprompt = true;

			break;

		}

}

I was trying to make from that cppIntegrator like from example(cppIntegrator in SDK folder). I don’t get it why it doesn’t work. I don’t get alphabet from A-Z, just A letter.

It looks like that:

kamerl.cu - my kernel

#ifndef _KARMEL_H_

#define _KARMEL_H_

// Device code

__global__ void VecAdd(float* A, int N)

{

int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

		  A[i]=A[i]+i;

	

}

#endif

vectorAdd.cu - cppIntegrator(like in SDK example):

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <karmel.cu>

float* d_A;

// Host code

extern "C" void runKernel( const int argc, const char** argv, float* h_A, int len )

{

	

	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

		cutilDeviceInit(argc, (char**)argv);

	else

		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	size_t size = len * sizeof(float);

	// Allocate input vectors h_A and h_B in host memory

	h_A = (float*)malloc(size);

	printf("h_a %d", h_A);

	printf("\n");

	// Allocate vectors in device memory

	cutilSafeCall( cudaMalloc((void**)&d_A, size) );

	// Copy vectors from host memory to device memory

	cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

	// Invoke kernel

	int threadsPerBlock = 256;

	int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, len);

	cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

	cutilSafeCall( cudaThreadSynchronize() );

#endif

	// Copy result from device memory to host memory

	// h_A contains the result in host memory

	cutilSafeCall( cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost) );

	

	cutilSafeCall(cudaFree(d_A));

	free(h_A);

	

	cudaThreadExit();

}

main.cpp

#include <stdio.h>

#include <iostream>

#include <cutil_inline.h>

using namespace std;

extern "C" void runKernel( const int argc, const char** argv , float* h_A, int len );

int main(int argc, char** argv)

{

	

	int dlug=34;

	float tab[]={ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,

	 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65 };

	

	runKernel(argc, (const char**)argv, tab, dlug);

		for(int ii=0; ii < dlug; ++ii)

		{

	

			cout<<"Literka "<<(char)tab[ii]<<endl;

			

		}

	

	

	system("pause");

	return 0;

}

== edit ==

Problem has been solved :)

Hello there. I’ve got some problem with the sample from SDK. I changed one of them(vectorAdd sample). I work at win xp 32b sp3, visual studio 2008 and cuda 3.0.

When I only one file named vectorAdd.cu with code it works fine(does add +1 to each digit. After that I’ve got alphabet from A to Z):

// Device code

__global__ void VecAdd(float* A,  int N)

{

   int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

	  

	  A[i]=A[i]+i;

	

}

// Host code

int main(int argc, char** argv)

{

	

	int len=34;

	size_t size = len * sizeof(float);

	

	// Allocate input vectors h_A and h_B in host memory

	h_A = (float*)malloc(size);

	if (h_A == 0) Cleanup();

	// Initialize input vectors

	RandomInit(h_A, len);

	// Allocate vectors in device memory

	cutilSafeCall( cudaMalloc((void**)&d_A, size) );

	// Copy vectors from host memory to device memory

	cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

	// Invoke kernel

	int threadsPerBlock = 256;

	int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, len);

	cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

	cutilSafeCall( cudaThreadSynchronize() );

#endif

	// Copy result from device memory to host memory

	// h_A contains the result in host memory

	cutilSafeCall( cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost) );

	

	for(int ii=0; ii < len; ++ii)

	{

	

		std::cout<<"Literka "<<(char)h_A[ii];

		std::cout << std::endl;

	

	}

	

	Cleanup();

}

void Cleanup(void)

{

	// Free device memory

	if (d_A)

		cudaFree(d_A);

	// Free host memory

	if (h_A)

		free(h_A);

		

	cutilSafeCall( cudaThreadExit() );

	

	if (!noprompt) {

		printf("\nPress ENTER to exit...\n");

		fflush( stdout);

		fflush( stderr);

		getchar();

	}

	exit(0);

}

// Allocates an array with random float entries.

void RandomInit(float* data, int n)

{

	for (int i = 0; i < n; ++i)

		data[i] = 65;// rand() / (float)RAND_MAX;

}

// Parse program arguments

void ParseArguments(int argc, char** argv)

{

	for (int i = 0; i < argc; ++i)

		if (strcmp(argv[i], "--noprompt") == 0 ||

			strcmp(argv[i], "-noprompt") == 0) 

		{

			noprompt = true;

			break;

		}

}

I was trying to make from that cppIntegrator like from example(cppIntegrator in SDK folder). I don’t get it why it doesn’t work. I don’t get alphabet from A-Z, just A letter.

It looks like that:

kamerl.cu - my kernel

#ifndef _KARMEL_H_

#define _KARMEL_H_

// Device code

__global__ void VecAdd(float* A, int N)

{

int i = blockDim.x * blockIdx.x + threadIdx.x;

	if (i < N)

		  A[i]=A[i]+i;

	

}

#endif

vectorAdd.cu - cppIntegrator(like in SDK example):

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <karmel.cu>

float* d_A;

// Host code

extern "C" void runKernel( const int argc, const char** argv, float* h_A, int len )

{

	

	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

		cutilDeviceInit(argc, (char**)argv);

	else

		cudaSetDevice( cutGetMaxGflopsDeviceId() );

	size_t size = len * sizeof(float);

	// Allocate input vectors h_A and h_B in host memory

	h_A = (float*)malloc(size);

	printf("h_a %d", h_A);

	printf("\n");

	// Allocate vectors in device memory

	cutilSafeCall( cudaMalloc((void**)&d_A, size) );

	// Copy vectors from host memory to device memory

	cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );

	// Invoke kernel

	int threadsPerBlock = 256;

	int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;

	VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, len);

	cutilCheckMsg("kernel launch failure");

#ifdef _DEBUG

	cutilSafeCall( cudaThreadSynchronize() );

#endif

	// Copy result from device memory to host memory

	// h_A contains the result in host memory

	cutilSafeCall( cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost) );

	

	cutilSafeCall(cudaFree(d_A));

	free(h_A);

	

	cudaThreadExit();

}

main.cpp

#include <stdio.h>

#include <iostream>

#include <cutil_inline.h>

using namespace std;

extern "C" void runKernel( const int argc, const char** argv , float* h_A, int len );

int main(int argc, char** argv)

{

	

	int dlug=34;

	float tab[]={ 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,

	 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65 };

	

	runKernel(argc, (const char**)argv, tab, dlug);

		for(int ii=0; ii < dlug; ++ii)

		{

	

			cout<<"Literka "<<(char)tab[ii]<<endl;

			

		}

	

	

	system("pause");

	return 0;

}

== edit ==

Problem has been solved :)