Beginner question: thread synchronization with shared memory

Getting a strange error running my kernel in emulation mode (which I'm using so I can walk through my code): I get a memory error. I am allocating shared memory externally.

//Calculated the amount of shared memory required for each individual block

sharedMemSize = columnSize*threadsPerBlock*sizeof(float) + columnSize*threadsPerBlock*sizeof(int) + columnSize*threadsPerBlock*sizeof(int) + (threadsPerBlock*sizeof(float));

then calling the kernel

blockMultiplyColumnPerGrid<<<dimGrid, threadsPerBlock, sharedMemSize>>>

and allocating shared memory using the method you told me, yet I get a strange error when trying to access shared_resultOrder

size_t offset0 = 0;

size_t offset1 = offset0 + sizeof(float)*maxColLen*blockDim.x;

size_t offset2 = offset1 + sizeof(int)*maxColLen*blockDim.x;

size_t offset3 = offset2 + sizeof(int)*maxColLen*blockDim.x;

float * shared_values = (float *)&shared_data[offset0];

int * shared_rowIDs = (int*)&shared_data[offset1];

int * shared_resultOrder = (int*)&shared_data[offset2];

float * shared_buff = (float *)&shared_data[offset3];

any ideas?

Managed to narrow down the error to: extern __shared__ unsigned char * shared_data;

Error message is error C2371: ‘__cuda_emu::shared_data’ : redefinition; different basic types

found the error strangely enough i had shared_data declared in a different kernel yet they had naming conflicts!

there are a LOT of subtle problems with warp level programming.

a question about this code below run in emulation mode

// Use first warp of block to compute parallel reduction on the

	// partial sum in shared memory.

	if (threadIdx.x < 32) {

		#pragma unroll 

		for(int i=32; i<TPB; i+=32) buff[threadIdx.x] += buff[threadIdx.x+i]; 

	}

	if (threadIdx.x < 16) { buff[threadIdx.x] += buff[threadIdx.x+16]; }

	if (threadIdx.x < 8)  { buff[threadIdx.x] += buff[threadIdx.x+8]; }

	if (threadIdx.x < 4)  { buff[threadIdx.x] += buff[threadIdx.x+4]; }

	if (threadIdx.x < 2)  { buff[threadIdx.x] += buff[threadIdx.x+2]; }

	// Finalise and write out the results to global memory

	if (threadIdx.x == 0)  { 

		r[blockIdx.x] = b[blockIdx.x] - buff[0] - buff[1];

	}

will this code run incorrectly in emulation mode, since there is no concept of WARPS — threads are run sequentially, one after another?

You are right, it won’t work in emulation. There are actually warps in emulation, but the warp size is one, so the implicit synchronization relied upon by assuming a warp size of 32 breaks (see Tim was right :) ).

I wouldn’t recommend emulation to my worst enemies. Ocelot does a much better job with none of the brain damage associated with emulation. And emulation is deprecated and scheduled for removal in the next CUDA release anyway…

lolol :D good to know

So i got a problem where i don’t have enough space to pass my parameters so i m copying them to constant memory in this manner from my main

extern __constant__ int const_maxColLen;	

				extern __constant__ int const_columnSize;	

				extern __constant__ int const_offset;	

				

				CUDA_SAFE_CALL(cudaMemcpyToSymbol(const_maxColLen, tjdsMatrix->Blocks[i]->maxColLen, sizeof(int), 0, cudaMemcpyHostToDevice));

				CUDA_SAFE_CALL(cudaMemcpyToSymbol(const_columnSize, columnSize, sizeof(int), 0, cudaMemcpyHostToDevice));

				CUDA_SAFE_CALL(cudaMemcpyToSymbol(const_offset, toffset, sizeof(int), 0, cudaMemcpyHostToDevice));

now i read that i am able to read these values without actually needing to pass them as parameters yet i get

Error 1 error: identifier “const_maxColLen” is undefined

constant variables have to be declared at global scope. In the thread you scraped the reduction code from is an attachment which contains code to exercise that gemv kernel (and an iterative Jacobi solver too). It contains working examples of how to use constant memory variables and cudaMemcpyToSymbol you can look at.

You should change ur nick from avidday to lifesaver :P

In your code the constants are also defined as constants

const unsigned int L = 96;

	const unsigned int N = L*L;

	const unsigned int LDA = N;

	const float alpha = 0.95; // relaxation factor

	gpuAssert( cudaMemcpyToSymbol("N_", &N, sizeof(unsigned int), 0, cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpyToSymbol("LDA_", &LDA, sizeof(unsigned int), 0, cudaMemcpyHostToDevice) );

	gpuAssert( cudaMemcpyToSymbol("JORalpha_", &alpha, sizeof(float), 0, cudaMemcpyHostToDevice) );

i m doing the same exact thing except without the const and i get

First-chance exception at 0x000007fefd40aa7d in CUDATJDS.exe: Microsoft C++ exception: cudaError at memory location 0x0025f9f0…

any idea?

It would seem that you have either found a bug, or you really aren’t doing the exact same thing as the reference code I pointed you to. Can you post a concise, self-contained piece of code that illustrates the problem?

sure here is the code automatically generated by visual studio just trying to define a constant

/************************************************************

********

*  sample.cu

*  This is a example of the CUDA program.

************************************************************

*********/

#include <stdio.h>

#include <stdlib.h>

#include <cutil_inline.h>

#ifndef gpuAssert

#include <stdio.h>

// Error-checking wrapper for CUDA runtime calls: if the wrapped call returns
// a non-zero cudaError_t, print the human-readable error string together with
// the file and line where the failure occurred, then abort the process.
#define gpuAssert( condition ) { if( (condition) != 0 ) { fprintf( stderr, "\n FAILURE %s in %s, line %d\n", cudaGetErrorString(condition), __FILE__, __LINE__ ); exit( 1 ); } }

#endif

/************************************************************

************/

/* Init CUDA															*/

/************************************************************

************/

#if __DEVICE_EMULATION__

// Emulation build: there is no physical device to probe, so device
// initialisation trivially succeeds.
bool InitCUDA(void)
{
	return true;
}

#else

// Probe the system for CUDA devices and select the first one reporting
// compute capability >= 1.0. Returns false (with a message on stderr) when
// no device is present or none is usable; returns true after a device has
// been selected with cudaSetDevice.
bool InitCUDA(void)
{
	int deviceCount = 0;
	cudaGetDeviceCount(&deviceCount);

	if (deviceCount == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	// Scan for the first device whose properties can be queried and whose
	// major compute capability is at least 1.
	int dev = 0;
	for (; dev < deviceCount; ++dev) {
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
			continue;
		if (prop.major >= 1)
			break;
	}

	if (dev == deviceCount) {
		fprintf(stderr, "There is no device supporting CUDA.\n");
		return false;
	}

	cudaSetDevice(dev);
	printf("CUDA initialized.\n");
	return true;
}

#endif

/************************************************************

************/

/* Example															  */

/************************************************************

************/

__constant__ int const_maxColLen;

// Copies the first `num` characters of the literal "Hello CUDA!" into
// `result` (device memory). The sample launches this with a single thread,
// so each running thread performs the complete copy.
// NOTE(review): assumes num <= 12 (length of the literal including its NUL);
// larger values would read past the local array — confirm at the call site.
__global__ static void HelloCUDA(char* result, int num)
{
	const char message[] = "Hello CUDA!";

	for (int idx = 0; idx < num; ++idx)
		result[idx] = message[idx];
}

/************************************************************

************/

/* HelloCUDA															*/

/************************************************************

************/

// Driver for the constant-memory repro: initialise CUDA, upload a value to
// a __constant__ symbol, run the HelloCUDA kernel, and print the result
// copied back from the device.
int main(int argc, char* argv[])
{
	// Select a CUDA device; bail out early if none is available.
	if(!InitCUDA()) {
		return 0;
	}

	char	*device_result	= 0;
	char	host_result[12]	= {0};   // 11 payload chars + NUL from zero-init

	cutilSafeCall( cudaMalloc((void**) &device_result, sizeof(char) * 11));

	unsigned int timer = 0;
	cutilCheckError( cutCreateTimer( &timer));
	cutilCheckError( cutStartTimer( timer));

	// BUG FIX: the original call targeted the symbol "const_columnSize",
	// which is declared nowhere in this translation unit — only
	// const_maxColLen exists (the __constant__ declaration above) — so
	// cudaMemcpyToSymbol failed and gpuAssert aborted with a cudaError.
	int temp = 0;
	gpuAssert( cudaMemcpyToSymbol("const_maxColLen", &temp, sizeof(int), 0, cudaMemcpyHostToDevice));

	HelloCUDA<<<1, 1, 0>>>(device_result, 11);
	cutilCheckMsg("Kernel execution failed\n");
	cudaThreadSynchronize();

	cutilCheckError( cutStopTimer( timer));
	printf("Processing time: %f (ms)\n", cutGetTimerValue( timer));
	cutilCheckError( cutDeleteTimer( timer));

	cutilSafeCall( cudaMemcpy(host_result, device_result, sizeof(char) * 11, cudaMemcpyDeviceToHost));
	printf("%s\n", host_result);

	cutilSafeCall( cudaFree(device_result));
	return 0;
}

First-chance exception at 0x000007fefd40aa7d in Project1.exe: Microsoft C++ exception: cudaError at memory location 0x0012f880…

First-chance exception at 0x000007fefd40aa7d in Project1.exe: Microsoft C++ exception: cudaError at memory location 0x0012f880…

refreshed page and the reply got reposted

In that code there is no const_columnSize defined anywhere, if I am not missing something:

__constant__ int const_maxColLen;

gpuAssert( cudaMemcpyToSymbol("const_columnSize", &temp, sizeof(int), 0, cudaMemcpyHostToDevice));

This message board isn’t some sort of interactive debugging service. If you have technical questions, by all means ask them, but at least try and exhaust all the usual suspects before posting. It wastes less of everyone’s time.

sorry about that, didn't see it — gonna go get some sleep, been programming too long