ID3DX11Scan issues

Hi,

I have implemented a D3D11 application that uses ID3DX11Scan (d3dcsx.lib) to do a parallel prefix sum of a UAV buffer into another UAV buffer. This works very well when using an integrated Intel GPU. However, when I try using my GTX 680 with driver version 347.52 it does not seem to work any more. I am using DXGI_FORMAT_R32_SINT since unsigned ints did not work for me during implementation (using an integrated Intel GPU).

According to the forum thread linked below, only signed ints are supported (which I find a bit odd, since this is not mentioned in the Microsoft docs):

http://xboxforums.create.msdn.com/forums/p/79535/484267.aspx
https://msdn.microsoft.com/en-us/library/windows/desktop/ff476856(v=vs.85).aspx

Anyway, my code fails silently (it runs without returning any failing HRESULT). Creating the scan looks something like this:

// Number of elements to scan: the recommended dimension divided by BrickSize on each axis,
// rounded up, then multiplied together.
INT32 dx = (INT32)ceil(g_streamDesc.RecommendedDim.X / (DOUBLE)BrickSize);
INT32 dy = (INT32)ceil(g_streamDesc.RecommendedDim.Y / (DOUBLE)BrickSize);
INT32 dz = (INT32)ceil(g_streamDesc.RecommendedDim.Z / (DOUBLE)BrickSize);
INT32 count = (dx * dy * dz);

// Create the scan object on the immediate context; `count` is passed as the maximum
// element scan size and 1 as the maximum number of scans.
ID3DX11Scan* scan = nullptr;
hr = D3DX11CreateScan(sdAdapter->ImmediateContext.Get(), count, 1, &scan);
S3D_VERIFY(SUCCEEDED(hr));
hr = scan->SetScanDirection(D3DX11_SCAN_DIRECTION_FORWARD);
S3D_VERIFY(SUCCEEDED(hr));
// Register the scan under the key that the usage code looks up later.
// NOTE(review): how Insert handles the raw pointer's reference count is not visible here - confirm.
sdAdapter->Resources.Scans.Insert("PrefixSumScan", scan);

Then the usage is as follows:

// Element count, computed with the same formula that was used when the scan was created.
INT32 dx = (INT32)ceil(g_streamDesc.RecommendedDim.X / (DOUBLE)BrickSize);
INT32 dy = (INT32)ceil(g_streamDesc.RecommendedDim.Y / (DOUBLE)BrickSize);
INT32 dz = (INT32)ceil(g_streamDesc.RecommendedDim.Z / (DOUBLE)BrickSize);
INT32 count = (dx * dy * dz);

HRESULT hr{};
// Fetch the scan object stored under the "PrefixSumScan" key.
ID3DX11Scan* scan = sdAdapter->Resources.Scans["PrefixSumScan"].Get();
S3D_VERIFY(scan != nullptr);

// Note: Scan only seems to work with SINT (not UINT) data..
// Arguments: element type, opcode, number of elements, source UAV, destination UAV.
hr = scan->Scan(D3DX11_SCAN_DATA_TYPE_INT, D3DX11_SCAN_OPCODE_ADD, count, 
sdAdapter->Resources.UnorderedAccessViews["LookupSphereCountBufferUAView"].Get(), 
sdAdapter->Resources.UnorderedAccessViews["LookupSpherePrefixSumBufferUAView"].Get());
S3D_VERIFY(SUCCEEDED(hr));

Is the ID3DX11Scan utility properly supported and supposed to work? Have I implemented the feature properly, or is there something I am doing wrong? This code behaves as expected on my Intel integrated GPU (although the UINT format does not work there).

My buffers are created with the following code:

// LookupSphereCountBuffer: INT32 buffer with a shader resource view and an unordered access view.
{
   // Per-axis element counts: recommended dimension divided by BrickSize, rounded up.
   const INT32 cellsX = (INT32)ceil(g_streamDesc.RecommendedDim.X / (DOUBLE)BrickSize);
   const INT32 cellsY = (INT32)ceil(g_streamDesc.RecommendedDim.Y / (DOUBLE)BrickSize);
   const INT32 cellsZ = (INT32)ceil(g_streamDesc.RecommendedDim.Z / (DOUBLE)BrickSize);
   const INT32 elementCount = (cellsX * cellsY * cellsZ);

   // The buffer is bound both as a shader resource and for unordered access.
   D3D11_BUFFER_DESC bufferDesc{};
   bufferDesc.Usage = D3D11_USAGE_DEFAULT; // A resource that requires read and write access by the GPU.
   bufferDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
   bufferDesc.CPUAccessFlags = 0;
   bufferDesc.ByteWidth = (sizeof(INT32) * elementCount);

   // Create the buffer without initial data.
   ComPtr<ID3D11Buffer> countBuffer{};
   hr = sdAdapter->Device->CreateBuffer(&bufferDesc, nullptr, &countBuffer);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateBuffer - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.Buffers.Insert(SetDebugObjectName(countBuffer.Get(), "LookupSphereCountBuffer"), countBuffer);

   // Shader resource view over the whole buffer, typed as 32-bit signed integers.
   D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc{};
   srvDesc.Format = DXGI_FORMAT_R32_SINT;
   srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
   srvDesc.Buffer.ElementOffset = 0;
   srvDesc.Buffer.NumElements = elementCount;

   ComPtr<ID3D11ShaderResourceView> countSrv{};
   hr = sdAdapter->Device->CreateShaderResourceView(countBuffer.Get(), &srvDesc, &countSrv);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateShaderResourceView - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.ShaderResourceViews.Insert(SetDebugObjectName(countSrv.Get(), "LookupSphereCountBufferSRView"), countSrv);

   // Unordered access view over the same elements, also typed as 32-bit signed integers.
   D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc{};
   uavDesc.Format = DXGI_FORMAT_R32_SINT;
   uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
   uavDesc.Buffer.NumElements = elementCount;

   ComPtr<ID3D11UnorderedAccessView> countUav{};
   hr = sdAdapter->Device->CreateUnorderedAccessView(countBuffer.Get(), &uavDesc, &countUav);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateUnorderedAccessView - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.UnorderedAccessViews.Insert(SetDebugObjectName(countUav.Get(), "LookupSphereCountBufferUAView"), countUav);
}
// LookupSpherePrefixSumBuffer: INT32 buffer with a shader resource view and an unordered access view.
{
   // Per-axis element counts: recommended dimension divided by BrickSize, rounded up.
   const INT32 cellsX = (INT32)ceil(g_streamDesc.RecommendedDim.X / (DOUBLE)BrickSize);
   const INT32 cellsY = (INT32)ceil(g_streamDesc.RecommendedDim.Y / (DOUBLE)BrickSize);
   const INT32 cellsZ = (INT32)ceil(g_streamDesc.RecommendedDim.Z / (DOUBLE)BrickSize);
   const INT32 elementCount = (cellsX * cellsY * cellsZ);

   // The buffer is bound both as a shader resource and for unordered access.
   D3D11_BUFFER_DESC bufferDesc{};
   bufferDesc.Usage = D3D11_USAGE_DEFAULT; // A resource that requires read and write access by the GPU.
   bufferDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
   bufferDesc.CPUAccessFlags = 0;
   bufferDesc.ByteWidth = (sizeof(INT32) * elementCount);

   // Create the buffer without initial data.
   ComPtr<ID3D11Buffer> prefixSumBuffer{};
   hr = sdAdapter->Device->CreateBuffer(&bufferDesc, nullptr, &prefixSumBuffer);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateBuffer - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.Buffers.Insert(SetDebugObjectName(prefixSumBuffer.Get(), "LookupSpherePrefixSumBuffer"), prefixSumBuffer);

   // Shader resource view over the whole buffer, typed as 32-bit signed integers.
   D3D11_SHADER_RESOURCE_VIEW_DESC srvDesc{};
   srvDesc.Format = DXGI_FORMAT_R32_SINT;
   srvDesc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
   srvDesc.Buffer.ElementOffset = 0;
   srvDesc.Buffer.NumElements = elementCount;

   ComPtr<ID3D11ShaderResourceView> prefixSumSrv{};
   hr = sdAdapter->Device->CreateShaderResourceView(prefixSumBuffer.Get(), &srvDesc, &prefixSumSrv);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateShaderResourceView - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.ShaderResourceViews.Insert(SetDebugObjectName(prefixSumSrv.Get(), "LookupSpherePrefixSumBufferSRView"), prefixSumSrv);

   // Unordered access view over the same elements, also typed as 32-bit signed integers.
   D3D11_UNORDERED_ACCESS_VIEW_DESC uavDesc{};
   uavDesc.Format = DXGI_FORMAT_R32_SINT;
   uavDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
   uavDesc.Buffer.NumElements = elementCount;

   ComPtr<ID3D11UnorderedAccessView> prefixSumUav{};
   hr = sdAdapter->Device->CreateUnorderedAccessView(prefixSumBuffer.Get(), &uavDesc, &prefixSumUav);
   if (!S3D_VERIFY_MSG(SUCCEEDED(hr), "CreateUnorderedAccessView - failed!"))
   {
      return hr;
   }
   sdAdapter->Resources.UnorderedAccessViews.Insert(SetDebugObjectName(prefixSumUav.Get(), "LookupSpherePrefixSumBufferUAView"), prefixSumUav);
}

Let me know if you need to know more,
Jørn

So I have added some more test code to my application to verify input and output from the scan (prefix sum) operation.

Basically, I have set the input buffer to all ones (1), which is verified by the first test and succeeds in all cases. The prefix sum validation succeeds on the Intel GPU, but fails after 128 iterations on the NVIDIA GPU: where a correct prefix sum of 128 is expected, the actual result on the NVIDIA GPU is 256!

After this, the prefix sum seems to add 1 on each iteration until it reaches 383. Then it drops back to 256 and starts adding 1 again. My tests are placed just after the prefix sum and look like this:

// Test count data: copy the count buffer into the test buffer, map it for reading,
// and verify that every element equals 1.
{
   auto buffer = sdAdapter->Resources.Buffers["LookupSphereTestBufferCpu"].Get();
   sdAdapter->ImmediateContext->CopyResource(buffer, sdAdapter->Resources.Buffers["LookupSphereCountBuffer"].Get());

   D3D11_MAPPED_SUBRESOURCE mappedResource{};
   HRESULT hr = sdAdapter->ImmediateContext->Map(buffer, 0, D3D11_MAP_READ, 0, &mappedResource);
   // Read and unmap only when Map succeeded; a failed Map leaves nothing to read.
   if (S3D_VERIFY_MSG(SUCCEEDED(hr), "Map - failed!"))
   {
      auto counts = reinterpret_cast<const INT32*>(mappedResource.pData);
      const INT32 elementCount = (dx * dy * dz);

      for (INT32 i = 0; i < elementCount; i++)
      {
         // Index instead of post-incrementing so the verify expression has no side effects.
         S3D_VERIFY(counts[i] == 1);
      }

      sdAdapter->ImmediateContext->Unmap(buffer, 0);
   }
}

// Test prefix sum data: copy the prefix sum buffer into the test buffer, map it for
// reading, and verify that element i equals i (the input is expected to be all ones).
{
   auto buffer = sdAdapter->Resources.Buffers["LookupSphereTestBufferCpu"].Get();
   sdAdapter->ImmediateContext->CopyResource(buffer, sdAdapter->Resources.Buffers["LookupSpherePrefixSumBuffer"].Get());

   D3D11_MAPPED_SUBRESOURCE mappedResource{};
   HRESULT hr = sdAdapter->ImmediateContext->Map(buffer, 0, D3D11_MAP_READ, 0, &mappedResource);
   // Read and unmap only when Map succeeded; a failed Map leaves nothing to read.
   if (S3D_VERIFY_MSG(SUCCEEDED(hr), "Map - failed!"))
   {
      auto prefixSum = reinterpret_cast<const INT32*>(mappedResource.pData);
      const INT32 elementCount = (dx * dy * dz);

      for (INT32 i = 0; i < elementCount; i++)
      {
         // Index instead of post-incrementing so the verify expression has no side effects.
         S3D_VERIFY(prefixSum[i] == i);
      }

      sdAdapter->ImmediateContext->Unmap(buffer, 0);
   }
}

My buffer size (dx * dy * dz) is 69819.

Any insights or ideas about this?

J.

I have now made a minimal example to reproduce the issue I am having.

Could someone please try it and verify that the prefix sum using ID3DX11Scan has issues? I could also send you the project if someone is interested in testing my code.

#include "stdafx.h"

#pragma comment(lib, "d3d11.lib")
#pragma comment(lib, "d3dcsx.lib")

using namespace Microsoft::WRL;
using namespace std;

/// Minimal reproduction of the ID3DX11Scan prefix-sum issue.
///
/// Creates a hardware D3D11 device, fills an input buffer of dx*dy*dz INT32 elements
/// with ones, runs a forward ADD scan from the input UAV into an output UAV, then copies
/// each buffer into a CPU-readable staging buffer and validates it: every input element
/// must still be 1, and output element i is expected to equal i.
///
/// Results are checked with assert only, so the checks vanish when NDEBUG is defined.
///
/// @param argc Unused.
/// @param argv Unused.
/// @return Always 0.
int _tmain(int argc, _TCHAR* argv[])
{
   INT32 dx = 37;
   INT32 dy = 51;
   INT32 dz = 37;
   INT32 count = (dx * dy * dz);
   INT32 byteCount = (count * sizeof(INT32));

   HRESULT hr{};
   ComPtr<ID3D11Device> device{};
   ComPtr<ID3D11DeviceContext> immediateContext{};
   ComPtr<ID3D11Buffer> lookupSphereCountBuffer{};
   ComPtr<ID3D11Buffer> lookupSpherePrefixSumBuffer{};
   ComPtr<ID3D11Buffer> lookupSphereTestBufferCpu{};
   ComPtr<ID3D11UnorderedAccessView> lookupSphereCountBufferUAView{};
   ComPtr<ID3D11UnorderedAccessView> lookupSpherePrefixSumBufferUAView{};
   ComPtr<ID3DX11Scan> scan{};

   // Create a Direct3D device with associated immediate context..
   {
      UINT32 createDeviceFlags = 0;
#ifdef _DEBUG
      createDeviceFlags |= D3D11_CREATE_DEVICE_DEBUG;
#endif

      D3D_DRIVER_TYPE driverType = D3D_DRIVER_TYPE_HARDWARE;
      D3D_FEATURE_LEVEL featureLevels[] = { D3D_FEATURE_LEVEL_11_0 };
      D3D_FEATURE_LEVEL featureLevel{};

      hr = D3D11CreateDevice(nullptr, driverType, 0, createDeviceFlags, featureLevels, ARRAYSIZE(featureLevels), D3D11_SDK_VERSION,
         &device, &featureLevel, &immediateContext);
      assert(SUCCEEDED(hr));
   }

   // LookupSphereCountBuffer: scan input, initialized to all ones.
   {
      D3D11_BUFFER_DESC desc{};
      desc.ByteWidth = (sizeof(INT32) * count);
      desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
      desc.Usage = D3D11_USAGE_DEFAULT; // A resource that requires read and write access by the GPU.
      desc.CPUAccessFlags = 0;

      INT32* intData = reinterpret_cast<INT32*>(_aligned_malloc(byteCount, 16));
      // Fail here rather than writing through a null pointer if the allocation failed.
      assert(intData != nullptr);
      for (INT32 i = 0; i < count; i++)
      {
         intData[i] = 1;
      }

      D3D11_SUBRESOURCE_DATA data{};
      data.SysMemPitch = byteCount;
      data.pSysMem = intData;
      hr = device->CreateBuffer(&desc, &data, &lookupSphereCountBuffer);
      assert(SUCCEEDED(hr));

      // The initial data is no longer needed once CreateBuffer has returned.
      _aligned_free(intData);

      D3D11_UNORDERED_ACCESS_VIEW_DESC uaDesc{};
      uaDesc.Buffer.NumElements = count;
      uaDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
      uaDesc.Format = DXGI_FORMAT_R32_SINT;

      hr = device->CreateUnorderedAccessView(lookupSphereCountBuffer.Get(), &uaDesc, &lookupSphereCountBufferUAView);
      assert(SUCCEEDED(hr));
   }

   // LookupSpherePrefixSumBuffer: scan output.
   {
      D3D11_BUFFER_DESC desc{};
      desc.ByteWidth = (sizeof(INT32) * count);
      desc.BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_UNORDERED_ACCESS;
      desc.Usage = D3D11_USAGE_DEFAULT; // A resource that requires read and write access by the GPU.
      desc.CPUAccessFlags = 0;

      hr = device->CreateBuffer(&desc, nullptr, &lookupSpherePrefixSumBuffer);
      assert(SUCCEEDED(hr));

      D3D11_UNORDERED_ACCESS_VIEW_DESC uaDesc{};
      uaDesc.Buffer.NumElements = count;
      uaDesc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
      uaDesc.Format = DXGI_FORMAT_R32_SINT;

      // Capture the result so the assert checks this call and not the earlier CreateBuffer.
      hr = device->CreateUnorderedAccessView(lookupSpherePrefixSumBuffer.Get(), &uaDesc, &lookupSpherePrefixSumBufferUAView);
      assert(SUCCEEDED(hr));
   }

   // LookupSphereTestBufferCpu: staging buffer used to read results back on the CPU.
   {
      D3D11_BUFFER_DESC desc{};
      desc.ByteWidth = (sizeof(INT32) * count);
      desc.BindFlags = 0;
      desc.Usage = D3D11_USAGE_STAGING; // A resource that supports data transfer (copy) from the GPU to the CPU
      desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;

      hr = device->CreateBuffer(&desc, nullptr, &lookupSphereTestBufferCpu);
      assert(SUCCEEDED(hr));
   }

   // Create prefix sum scan..
   {
      hr = D3DX11CreateScan(immediateContext.Get(), count, 1, &scan);
      assert(SUCCEEDED(hr));
      hr = scan->SetScanDirection(D3DX11_SCAN_DIRECTION_FORWARD);
      assert(SUCCEEDED(hr));
   }

   // Perform prefix sum and test input and output..
   {
      // Note: Scan only seems to work with SINT (not UINT) data..
      hr = scan->Scan(D3DX11_SCAN_DATA_TYPE_INT, D3DX11_SCAN_OPCODE_ADD, count, lookupSphereCountBufferUAView.Get(), lookupSpherePrefixSumBufferUAView.Get());
      assert(SUCCEEDED(hr));

      // Test count data..
      {
         immediateContext->CopyResource(lookupSphereTestBufferCpu.Get(), lookupSphereCountBuffer.Get());

         D3D11_MAPPED_SUBRESOURCE mappedResource{};
         hr = immediateContext->Map(lookupSphereTestBufferCpu.Get(), 0, D3D11_MAP_READ, 0, &mappedResource);
         assert(SUCCEEDED(hr));

         auto counts = reinterpret_cast<const INT32*>(mappedResource.pData);
         for (INT32 i = 0; i < count; i++)
         {
            // Index instead of post-incrementing so the assert expression has no side effects.
            assert(counts[i] == 1);
         }

         immediateContext->Unmap(lookupSphereTestBufferCpu.Get(), 0);
      }

      // Test prefix sum data..
      {
         immediateContext->CopyResource(lookupSphereTestBufferCpu.Get(), lookupSpherePrefixSumBuffer.Get());

         D3D11_MAPPED_SUBRESOURCE mappedResource{};
         hr = immediateContext->Map(lookupSphereTestBufferCpu.Get(), 0, D3D11_MAP_READ, 0, &mappedResource);
         assert(SUCCEEDED(hr));

         auto prefixSum = reinterpret_cast<const INT32*>(mappedResource.pData);
         for (INT32 i = 0; i < count; i++)
         {
            if (prefixSum[i] != i)
            {
               // Print (and flush via endl) before asserting so the values are visible
               // even when the assert aborts the program.
               cout << "Prefix sums do not match. Got: " << prefixSum[i] << ". Expected: " << i << endl;
               // Note: This is where the assert happens after 128 iterations!
               assert(prefixSum[i] == i);
            }
         }

         immediateContext->Unmap(lookupSphereTestBufferCpu.Get(), 0);
      }
   }

   return 0;
}

In this example, the includes have been put in stdafx.h, which looks like this:

#pragma once

#include "targetver.h"

#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#include <d3d11_2.h>
#include <d3dcsx.h>
#include <wrl/client.h>
#include <assert.h>
#include <iostream>

Thanks,
Jørn