Draw vertices from cudaMalloc() memory

I have created an array using the following:

float *vel_h, *vel_d;  // Pointers to host & device

	// Number of float elements in the buffer (the allocation below is sized
	// as mesh_height * mesh_width * 4 floats).
	const int vel_count = mesh_height * mesh_width * 4;

	int size = vel_count * sizeof(float);

	vel_h = (float *)malloc(size);  // Allocate array on host

	cudaMalloc((void **) &vel_d, size);  // Allocate array on device

	// NOTE(review): the results of malloc(), cudaMalloc() and cudaMemcpy()
	// are not checked here; add handling that fits the enclosing function.

	// Initialize the host array with pseudo-random values in [-0.01, 0.01].
	// Every allocated element is written: stopping at
	// mesh_width * mesh_height would leave three quarters of the buffer
	// uninitialized before it is copied to the device.
	for (int i = 0; i < vel_count; i++) {
		vel_h[i] = (rand() / (float)RAND_MAX * 2.0f - 1.0f) * 0.01f;
	}

	// Copy the fully initialized host array to the CUDA device.
	cudaMemcpy(vel_d, vel_h, size, cudaMemcpyHostToDevice);

Is it possible to tell a Direct3D/OpenGL device to render vertices stored in this device array without copying the data back to the host and then to the device again?

It would be great if I could give the IDirect3DDevice9 device the buffer pointer using the SetStreamSource() method, like below:

// Render the vertex buffer contents.
// Illustrative only: this is the desired usage, handing the cudaMalloc()'d
// pointer vel_d directly to the Direct3D device.
// NOTE(review): SetStreamSource() is documented as taking an
// IDirect3DVertexBuffer9*, so passing a raw CUDA device pointer presumably
// will not compile as written -- confirm against the D3D9 headers.

	// Bind vel_d as stream 0, starting at byte offset 0, with a stride of one CUSTOMVERTEX.
	device->SetStreamSource( 0, vel_d, 0, sizeof(CUSTOMVERTEX) );

	// Select the vertex layout described by D3DFVF_CUSTOMVERTEX.
	device->SetFVF( D3DFVF_CUSTOMVERTEX );

	// Draw num primitives as a point list, starting at vertex 0.
	device->DrawPrimitive( D3DPT_POINTLIST, 0, num);

Again, any help would be amazing!