IS copying an array of character strings to device memory absolutely impossible?

ananth_Sadanand · June 19, 2010, 3:13pm

I've been battling this issue for over 3 days now.
I have an array of character strings defined as:

char *a[3];
a[0]=“foo1”;
a[1]=“foo1”;
a[2]=“foo2”;

I need to copy this to device. Can somebody tell me how?

Heres wat i tried so far:

cudaMalloc((void**)&a[0],5*sizeof(char));
cudaMemcpy(dev_array[0],a[0],5*sizeof(char),cudaMemcpyHostToDevice);
//Subsequently do this for a[1], a[2]…n so on.

This works, but as you see, ive had to explicitly send in each character string one at a time. If i have char *a[1000], this is obviously impossible to do. Also calling the kernel with this big a number of pointers is impossible. Is there a way to do this???

tmurray · June 19, 2010, 5:57pm

Why is this “obviously impossible” with 1000 strings?

cbuchner1 · June 19, 2010, 6:43pm

Work with pooled memory allocation. This allows you to copy the entire string pool at once.

Instead of storing pointers, I think it would be better to just store byte offsets into the string pool.

This makes “pointer” dereferencing a bit more inconvenient, but you could use macros or C++

operator overloads to do this with little extra effort.

ananth_Sadanand · June 19, 2010, 6:49pm

@ tmurray: It is possible…only if I make a kernel invocation which looks like my_kernel<<<dimGrid,dimBlock>>> (a[0],a[1],a[2],a[3]…a[1000]);

This is not sumthing id wanna do…

ananth_Sadanand · June 19, 2010, 6:51pm

I dont really understand what your saying. Im new to cuda. Can you elaborate?

Thankx in advance.

avidday · June 19, 2010, 7:16pm

You would never pass each string individually, just pass a and index into it inside the kernel…

cbuchner1 · June 19, 2010, 8:14pm

Essentially I am saying that you should roll your own memory management for strings.

Memory pools are a generic design pattern not specific to CUDA. You can even read about

it on wikipedia.

tmurray · June 19, 2010, 10:15pm

That’s not true at all. Pass a char** that lives on the device, life is easy.

ananth_Sadanand · June 20, 2010, 12:26am

i have tried this. With this approach, i kinda get stuck at the following juncture:

suppose i have:

[codebox]char* tnode_atts_char_dev[50];

char* tnode_vals_char_dev[50];

cudaMalloc((void**)?,50*(sizeof(char*)));

cudaMemcpy(?,?,?,cudaMemcpyHostToDevice);

cudaMemcpy(?,?,?,cudaMemcpyHostToDevice);[/codebox]

Whats are the arguments for cudaMalloc and cudaMemcpy???

I racked my brains out with this approach but i could’nt figure it out…Would be great if you tell me what i fill in as arguments…

Now im just playing with the whole concept of streams and seeing if that will help…

Thanx in advance.

avidday · June 20, 2010, 12:38pm

As you have written it, that code won’t work. cudaMemcpy (or regular memcpy for that matter) can’t do any kind of “deep” copying. You still have to do the iterative allocate and copy of each individual string into device memory, then copy the host array holding the addresses of the device strings from host memory into device memory.

ananth_Sadanand · June 20, 2010, 5:18pm

What do u mean by the last line-“, then copy the host array holding the addresses of the device strings from host memory into device memory.”…i can copy n allocate each individual(for loop)…but how do i launch my kernel…thats the question!!!

avidday · June 20, 2010, 7:56pm

Precisely what I said. You need to create an array of device pointers in host memory, each pointer being the address of the strings you copy to device memory in the loop. Then create an array of pointers in device memory and copy the host array of pointers to that device memory. Then just launch the kernel like this:

char ** device_strings;

// initialization and copying goes here

mykernel <<< >>> (device_strings);

ananth_Sadanand · June 23, 2010, 7:56am

[quote name=‘avidday’ post=‘1076527’ date=‘Jun 20 2010, 03:56 PM’]

Precisely what I said. You need to create an array of device pointers in host memory, each pointer being the address of the strings you copy to device memory in the loop. Then create an array of pointers in device memory and copy the host array of pointers to that device memory. Then just launch the kernel like this:

[codebox]char* a[2];

a[0]=“foo1”;

a[2]=‘foo2’[/codebox]

Cant you please take it from here and show me how the copy, allocation and kernel invocation takes place???

Thanx in advance.

avidday · June 24, 2010, 1:21pm

It sounds like you might need to brush up a bit on strings and pointers in C. Here is something to ponder while you are doing it:

#include <string.h>

#include <stdlib.h>

#include <stdio.h>

#include "cuda_runtime.h"

#ifndef gpuAssert
/*
 * gpuAssert(condition): abort the program if a CUDA runtime call fails.
 *
 * `condition` is an expression yielding a cudaError_t (typically the CUDA
 * call itself). It is evaluated exactly once and its result is stored, so a
 * failing call is not executed a second time while building the message.
 * On failure the error string, source file and line are written to stderr
 * and the process exits with status 1.
 *
 * The do { ... } while (0) wrapper makes the macro behave as a single
 * statement, so `gpuAssert(x);` is safe inside un-braced if/else bodies.
 */
#define gpuAssert( condition ) \
	do { \
		cudaError_t gpuAssert_status_ = (condition); \
		if( gpuAssert_status_ != cudaSuccess ) { \
			fprintf( stderr, "\n FAILURE %s in %s, line %d\n", cudaGetErrorString(gpuAssert_status_), __FILE__, __LINE__ ); \
			exit( 1 ); \
		} \
	} while (0)
#endif

#define nstrings (2)

#define strlenlim (9)

/*
 * stringfoo: in-place transform of the leading bytes of each string.
 *
 * instrings is an array of pointers to character buffers; both the array
 * and the buffers are dereferenced on the device. Thread t (for
 * t < nstrings) reinterprets the first sizeof(unsigned int) bytes of
 * instrings[t] as an unsigned int and subtracts 0xafa00 from it.
 *
 * Only threadIdx.x is used for indexing, so every block of a launch would
 * touch the same strings. The cast assumes each buffer is at least
 * sizeof(unsigned int) bytes long and suitably aligned for unsigned int.
 */
__global__ void stringfoo(char ** instrings)
{
	// Threads past the last string have no work to do.
	if (threadIdx.x >= nstrings)
		return;

	const unsigned int delta = 0xafa00;
	unsigned int *word = reinterpret_cast<unsigned int *>(instrings[threadIdx.x]);
	*word = *word - delta;
}

/*
 * Host driver: copies an array of C strings to the device, runs stringfoo
 * on them, copies the results back and prints each original string next to
 * its transformed copy.
 *
 * cudaMemcpy cannot deep-copy, so this is done in two levels:
 *   1. each string gets its own device buffer (_s[i], held in a HOST array);
 *   2. the host array of device pointers (_s) is copied into a device
 *      array (_a), which is what the kernel receives as char **.
 *
 * Returns 0 on success; any CUDA failure terminates via gpuAssert.
 */
int main()
{
	char * a[nstrings];     // host strings
	char * _s[nstrings];    // host array holding device string pointers
	char ** _a = NULL;      // device array holding the same pointers

	char s1[] = "foo1";
	char s2[] = "foo2";

	a[0] = s1;
	a[1] = s2;

	// Each device buffer is strlenlim bytes; keep one byte for the terminator.
	const size_t maxchars = size_t(strlenlim) - 1;

	for (int i = 0; i < nstrings; i++) {
		size_t slen = strlen(a[i]);
		size_t clen = (slen > maxchars) ? maxchars : slen;

		gpuAssert( cudaMalloc((void **)&_s[i], strlenlim) );
		// Zero the whole buffer so the copied text is always NUL-terminated
		// and the bytes read back later are never uninitialised.
		gpuAssert( cudaMemset(_s[i], 0, strlenlim) );
		gpuAssert( cudaMemcpy(_s[i], a[i], clen, cudaMemcpyHostToDevice) );
	}

	size_t alen = size_t(nstrings) * sizeof(char *);
	gpuAssert( cudaMalloc((void **)&_a, alen) );
	gpuAssert( cudaMemcpy(_a, _s, alen, cudaMemcpyHostToDevice) );

	stringfoo <<< 1, nstrings >>> (_a);
	gpuAssert( cudaGetLastError() );

	char sout[strlenlim];
	for (int i = 0; i < nstrings; i++) {
		// Blocking copy; a failure during kernel execution is reported here.
		gpuAssert( cudaMemcpy(sout, _s[i], size_t(strlenlim), cudaMemcpyDeviceToHost) );
		sout[strlenlim - 1] = '\0';  // defensive: never print an unterminated buffer
		fprintf(stdout, "%s %s\n", a[i], sout);
	}

	// Release every device allocation made above.
	for (int i = 0; i < nstrings; i++) {
		gpuAssert( cudaFree(_s[i]) );
	}
	gpuAssert( cudaFree(_a) );

	return 0;
}

/* vim:ts=4:sw=4:expandtab:number */

which does this:

avidday@cuda:~$ nvcc stringfoo.cu -o stringfoo

./stringfoo.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space

./stringfoo.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space

avidday@cuda:~$ ./stringfoo 

foo1 fud1

foo2 fud2

wiraungu · March 24, 2011, 6:56am

It sounds like you might need to brush up a bit on strings and pointers in C. Here is something to ponder while you are doing it:

#include <string.h>

#include <stdlib.h>

#include <stdio.h>

#include "cuda_runtime.h"

#ifndef gpuAssert
/*
 * gpuAssert(condition): abort the program if a CUDA runtime call fails.
 *
 * `condition` is an expression yielding a cudaError_t (typically the CUDA
 * call itself). It is evaluated exactly once and its result is stored, so a
 * failing call is not executed a second time while building the message.
 * On failure the error string, source file and line are written to stderr
 * and the process exits with status 1.
 *
 * The do { ... } while (0) wrapper makes the macro behave as a single
 * statement, so `gpuAssert(x);` is safe inside un-braced if/else bodies.
 */
#define gpuAssert( condition ) \
	do { \
		cudaError_t gpuAssert_status_ = (condition); \
		if( gpuAssert_status_ != cudaSuccess ) { \
			fprintf( stderr, "\n FAILURE %s in %s, line %d\n", cudaGetErrorString(gpuAssert_status_), __FILE__, __LINE__ ); \
			exit( 1 ); \
		} \
	} while (0)
#endif

#define nstrings (2)

#define strlenlim (9)

/*
 * stringfoo: in-place transform of the leading bytes of each string.
 *
 * instrings is an array of pointers to character buffers; both the array
 * and the buffers are dereferenced on the device. Thread t (for
 * t < nstrings) reinterprets the first sizeof(unsigned int) bytes of
 * instrings[t] as an unsigned int and subtracts 0xafa00 from it.
 *
 * Only threadIdx.x is used for indexing, so every block of a launch would
 * touch the same strings. The cast assumes each buffer is at least
 * sizeof(unsigned int) bytes long and suitably aligned for unsigned int.
 */
__global__ void stringfoo(char ** instrings)
{
	// Threads past the last string have no work to do.
	if (threadIdx.x >= nstrings)
		return;

	const unsigned int delta = 0xafa00;
	unsigned int *word = reinterpret_cast<unsigned int *>(instrings[threadIdx.x]);
	*word = *word - delta;
}

/*
 * Host driver: copies an array of C strings to the device, runs stringfoo
 * on them, copies the results back and prints each original string next to
 * its transformed copy.
 *
 * cudaMemcpy cannot deep-copy, so this is done in two levels:
 *   1. each string gets its own device buffer (_s[i], held in a HOST array);
 *   2. the host array of device pointers (_s) is copied into a device
 *      array (_a), which is what the kernel receives as char **.
 *
 * Returns 0 on success; any CUDA failure terminates via gpuAssert.
 */
int main()
{
	char * a[nstrings];     // host strings
	char * _s[nstrings];    // host array holding device string pointers
	char ** _a = NULL;      // device array holding the same pointers

	char s1[] = "foo1";
	char s2[] = "foo2";

	a[0] = s1;
	a[1] = s2;

	// Each device buffer is strlenlim bytes; keep one byte for the terminator.
	const size_t maxchars = size_t(strlenlim) - 1;

	for (int i = 0; i < nstrings; i++) {
		size_t slen = strlen(a[i]);
		size_t clen = (slen > maxchars) ? maxchars : slen;

		gpuAssert( cudaMalloc((void **)&_s[i], strlenlim) );
		// Zero the whole buffer so the copied text is always NUL-terminated
		// and the bytes read back later are never uninitialised.
		gpuAssert( cudaMemset(_s[i], 0, strlenlim) );
		gpuAssert( cudaMemcpy(_s[i], a[i], clen, cudaMemcpyHostToDevice) );
	}

	size_t alen = size_t(nstrings) * sizeof(char *);
	gpuAssert( cudaMalloc((void **)&_a, alen) );
	gpuAssert( cudaMemcpy(_a, _s, alen, cudaMemcpyHostToDevice) );

	stringfoo <<< 1, nstrings >>> (_a);
	gpuAssert( cudaGetLastError() );

	char sout[strlenlim];
	for (int i = 0; i < nstrings; i++) {
		// Blocking copy; a failure during kernel execution is reported here.
		gpuAssert( cudaMemcpy(sout, _s[i], size_t(strlenlim), cudaMemcpyDeviceToHost) );
		sout[strlenlim - 1] = '\0';  // defensive: never print an unterminated buffer
		fprintf(stdout, "%s %s\n", a[i], sout);
	}

	// Release every device allocation made above.
	for (int i = 0; i < nstrings; i++) {
		gpuAssert( cudaFree(_s[i]) );
	}
	gpuAssert( cudaFree(_a) );

	return 0;
}

/* vim:ts=4:sw=4:expandtab:number */

which does this:

avidday@cuda:~$ nvcc stringfoo.cu -o stringfoo

./stringfoo.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space

./stringfoo.cu(19): Warning: Cannot tell what pointer points to, assuming global memory space

avidday@cuda:~$ ./stringfoo 

foo1 fud1

foo2 fud2

hi avidday,

cuPrintf("[%s] \n", instrings[threadIdx.x]);

I copied the above codes and put the cuPrintf to display instrings[threadIdx.x] but it was not printed out. Any idea why?

Thanks, in advance.