Currently, CUDA's built-in thread indexing supports at most six dimensions,
blockIdx.x
blockIdx.y
blockIdx.z
threadIdx.x
threadIdx.y
threadIdx.z
and their extents are tied to the kernel launch parameters.
However, sometimes we would like to decouple the logical grid from the launch configuration and write something like:
// Kernel sketch: each thread derives a flat index from its position in the
// launch and asks coord() for the matching coordinate in the logical grid g.
//
//   g: logical grid, passed by value.
//   x: input buffer (not read yet; see the commented-out call below).
//   y: output buffer (not written yet).
template <typename R, typename grid>
__global__ void k(grid g, const R* x, R* y)
{
    // Flat index of this thread along the x dimension of the launch.
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    // Coordinate that coord() assigns to this flat index within g.
    coordinate pos = coord(g, tid);
    // Intended use (not enabled yet): g(x, y, pos);
}
// Host-side launcher sketch: takes the logical grid from the shape of y and
// launches kernel k with it.
//
//   x: input tensor view (read-only); forwarded to k as a const R*.
//   y: output tensor reference; y.shape() is the grid handed to k, and its
//      buffer is forwarded to k as an R*.
//
// NOTE(review): the launch configuration is fixed at 10 blocks of 10 threads
// (100 threads in total) regardless of the shape of y -- confirm this is
// intended for the sketch.
template <typename R>
void f(const ttl::cuda_tensor_view<R> &x, const ttl::cuda_tensor_ref<R> &y)
{
    // `grid` is only a template parameter of k, not a type visible in this
    // scope, so let the compiler deduce the shape type instead.
    const auto g = y.shape();
    constexpr int blocksPerGrid = 10;
    constexpr int threadsPerBlock = 10;
    // k expects raw pointers, so pass the underlying buffers rather than the
    // view objects themselves.
    // NOTE(review): assumes the ttl views expose data() -- TODO confirm.
    k<R><<<blocksPerGrid, threadsPerBlock>>>(g, x.data(), y.data());
}
Currently, CUDA's built-in thread indexing supports at most six dimensions,
blockIdx.x
blockIdx.y
blockIdx.z
threadIdx.x
threadIdx.y
threadIdx.z
and their extents are tied to the kernel launch parameters.
However, sometimes we would like to write something like: