Invalid Device Pointer

Hi everybody,

I’m using an S1070 where each GPU is associate to a pthread (I’m using Ubuntu SMP 64bit. CPU Intel i7).

I get a CUDA error “Invalide Device Pointer” while trying to use cudaMemcpy2D from dev to host and from host to dev.

Here the piece of code:

void write_ghosts(int gpu, int neighbor, int step, float *host_addr, size_t host_pitch, float *dev_addr, size_t dev_pitch, int width_byte, int height)

{

	//WRITE PHASE

	pthread_mutex_lock(&mutex[gpu]);

		while(rAcc[gpu] != step+1)

		{

			pthread_cond_wait(&rCond[gpu], &mutex[gpu]);

		}

	pthread_mutex_unlock(&mutex[gpu]);

	cudaMemcpy2D(host_addr, host_pitch, dev_addr, dev_pitch, width_byte, height, cudaMemcpyDeviceToHost); //HERE COMES AN ERROR

	pthread_mutex_lock(&mutex[gpu]);

		wAcc[gpu]++;

	pthread_mutex_unlock(&mutex[gpu]);

	pthread_cond_signal(&wCond[gpu]);

	//END WRITE PHASE

}

void read_ghosts(int gpu, int neighbor, int step, float *host_addr, size_t host_pitch, float *dev_addr, size_t dev_pitch, int width_byte, int height)

{

	//READ PHASE

	pthread_mutex_lock(&mutex[neighbor]);

	while(wAcc[neighbor] != step+1)

		{

			pthread_cond_wait(&wCond[neighbor], &mutex[neighbor]);

		}

	pthread_mutex_unlock(&mutex[neighbor]);

	cudaMemcpy2D(dev_addr, dev_pitch, host_addr, host_pitch, width_byte, height, cudaMemcpyHostToDevice);  //HERE COMES AN ERROR

	pthread_mutex_lock(&mutex[neighbor]);

		rAcc[neighbor]++;

	pthread_mutex_unlock(&mutex[neighbor]);

	pthread_cond_signal(&rCond[neighbor]);

	//END READ PHASE

}

void *workers_task(void *params)

{

	//Cast the cookie pointer to the right type

	struct Order *p = (struct Order *)params;

	

	cudaSetDevice(p->device);

	

	int halt = 0;

	while(1)

	{

		pthread_mutex_lock(&task[p->device]);

			while(!tokens[p->device*2 - p->direction])

			{

				pthread_cond_wait(&task_cond[p->device], &task[p->device]);

			}

			tokens[p->device*2 - p->direction]--;

			halt = stop[p->device];

		pthread_mutex_unlock(&task[p->device]);

		

		if (halt) break;

		

		switch(p->direction)

		{

			case NORTH:

			{

				write_ghosts(p->device*2 - 1, p->device*2 - 2, p->step, p->grid, p->n*sizeof(float), p->dgrid+(p->border_size)*(p->pitch/4), p->pitch, 

					p->n*sizeof(float), p->border_size);

				read_ghosts(p->device*2 - 1, p->device*2 - 2, p->step, p->grid-(p->border_size*p->n), p->n*sizeof(float), p->dgrid, p->pitch, p->n*sizeof(float), 

					p->border_size);

				break;			

			}

			case SOUTH:

			{

				write_ghosts(p->device*2, p->device*2 + 1, p->step, p->grid+(p->h-p->border_size)*p->n, p->n*sizeof(float), 

					p->dgrid+(p->h)*(p->pitch/4), p->pitch, p->n*sizeof(float), p->border_size);

				read_ghosts(p->device*2, p->device*2 + 1, p->step, p->grid+(p->h*p->n), p->n*sizeof(float), p->dgrid+((p->h)+(p->border_size))*(p->pitch/4), 

					p->pitch, p->n*sizeof(float), p->border_size);

			}

		}

		pthread_mutex_lock(&task[p->device]);

			written[p->device*2 - p->direction]++;

		pthread_mutex_unlock(&task[p->device]);

		pthread_cond_signal(&go[p->device]);

	}

	return(NULL);

}

“void *workers_task(void *)” is the function passed to the new threads. Do you have any idea about what could be the reason for an Invalid Device Pointer error when I’m sure that the base address (p->dgrid) is correct? I say that the base address s correct because before running the threads, I can transfer correctly at that very same address. The fact that I have to setDevice again is just because every time you run a new thread, the last one is associated to device 0. So I have to reset the right device once again.

I hope I’ve been clear describing the problem and that someone can help me finding out a good reason :P

Daniel

Each thread is creating its own context, and the resources allocated are only valid within that context.

Thanks! So as far as I can understand there’s no chance to have different threads managing communication in parallel?

You could use the thread migration API, but that would require you to use the driver API as well.