MAPS Showcase
Showcase
-
2-Dimensional Convolution
__constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)]; __global__ void Conv2(maps::Window2DSingleGPU<float, BW, BH, RADIUS> in, maps::StructuredInjective2DSingleGPU<float, BW, BH> out) { MAPS_INIT(in, out); MAPS_FOREACH(oiter, out) { int i = 0; *oiter = 0.0f; MAPS_FOREACH_ALIGNED(iter, in, oiter) { *oiter += *iter * kConvKernel[i++]; } } out.commit(); }
__constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)]; __global__ void Conv2 MAPS_MULTIDEF(maps::Window2D<float, BW, BH, RADIUS> in, maps::StructuredInjective2D<float, BW, BH> out) { MAPS_MULTI_INITVARS(in, out); MAPS_FOREACH(oiter, out) { int i = 0; *oiter = 0.0f; MAPS_FOREACH_ALIGNED(iter, in, oiter) { *oiter += *iter * kConvKernel[i++]; } } out.commit(); }
-
Jacobi (with multiple elements per thread)
template <typename T, int IPX, int IPY> __global__ void JacobiMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> outFrame) { MAPS_INIT(inFrame, outFrame); // If there are no items to write, return if (outFrame.Items() == 0) return; MAPS_FOREACH(oiter, outFrame) { *oiter = (inFrame.aligned_at(oiter, 0, -1) + inFrame.aligned_at(oiter, -1, 0) + inFrame.aligned_at(oiter, 1, 0) + inFrame.aligned_at(oiter, 0, 1)) / T(4); } outFrame.commit(); }
template <typename T, int IPX, int IPY> __global__ void JacobiMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> outFrame) { MAPS_MULTI_INITVARS(inFrame, outFrame); // If there are no items to write, return if (outFrame.Items() == 0) return; MAPS_FOREACH(oiter, outFrame) { *oiter = (inFrame.aligned_at(oiter, 0, -1) + inFrame.aligned_at(oiter, -1, 0) + inFrame.aligned_at(oiter, 1, 0) + inFrame.aligned_at(oiter, 0, 1)) / T(4); } outFrame.commit(); }
-
Histogram
template <typename T, typename BinT, int BINS> __global__ void Histogram(maps::Window2DSingleGPU<T, BW, BH, 0> in, maps::ReductiveStaticSingleGPU<BinT, BW*BH, BINS> out) { MAPS_INIT(in, out); MAPS_FOREACH(oiter, out) { int bin = *iter.align(oiter); ++oiter[bin]; } out.commit(); }
template <typename T, typename BinT, int BINS> __global__ void Histogram MAPS_MULTIDEF(maps::Window2D<T, BW, BH, 0> in, maps::ReductiveStatic<BinT, BW*BH, BINS> out) { MAPS_MULTI_INITVARS(in, out); MAPS_FOREACH(oiter, out) { int bin = *iter.align(oiter); ++oiter[bin]; } out.commit(); }
-
Game of Life (Advanced API)
template <typename T, int IPX, int IPY> __global__ void GoLMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> next_gen) { // Allocate shared memory for containers __shared__ typename decltype(current_gen)::SharedData sdata; __shared__ typename decltype(next_gen)::SharedData osdata; // Initialize containers (pre-synchronization) current_gen.init_async(sdata); next_gen.init_async(osdata); // Synchronize thread-blocks __syncthreads(); // Initialize containers (post-sync) current_gen.init_async_postsync(); next_gen.init_async_postsync(); // If there are no items to write, return if (next_gen.Items() == 0) return; MAPS_FOREACH(oiter, next_gen) { int numLiveNeighbors = 0; int isLive; // Determine number of live neighbors MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) { if (iter.index() == 4) isLive = *iter; else numLiveNeighbors += *iter; } // Game of Life conditions if (isLive) { if (numLiveNeighbors < 2 || numLiveNeighbors > 3) isLive = 0; } else { if (numLiveNeighbors == 3) isLive = 1; } // Fill output cell *oiter = isLive; } next_gen.commit(); }
template <typename T, int IPX, int IPY> __global__ void GoLMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> next_gen) { // Initialize multi-GPU device abstraction MAPS_MULTI_INIT(); // Allocate shared memory for containers __shared__ typename decltype(current_gen)::SharedData sdata; __shared__ typename decltype(next_gen)::SharedData osdata; // Initialize containers (pre-synchronization) current_gen.init_async(sdata); next_gen.init_async(osdata); // Synchronize thread-blocks __syncthreads(); // Initialize containers (post-sync) current_gen.init_async_postsync(); next_gen.init_async_postsync(); // If there are no items to write, return if (next_gen.Items() == 0) return; MAPS_FOREACH(oiter, next_gen) { int numLiveNeighbors = 0; int isLive; // Determine number of live neighbors MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) { if (iter.index() == 4) isLive = *iter; else numLiveNeighbors += *iter; } // Game of Life conditions if (isLive) { if (numLiveNeighbors < 2 || numLiveNeighbors > 3) isLive = 0; } else { if (numLiveNeighbors == 3) isLive = 1; } // Fill output cell *oiter = isLive; } next_gen.commit(); }
-
N-Body Simulation
template <int IPX> __global__ void NBodyTimeStep MAPS_MULTIDEF(maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> bodies_in, maps::Block1D<float4, 0, BLOCK_WIDTH, IPX> other_bodies, maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> outBodies, maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> velocity_in, maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> velocity_out, float deltaTime) { MAPS_MULTI_INITVARS(bodies_in, other_bodies, outBodies, velocity_in, velocity_out); MAPS_FOREACH(oiter, outBodies) // Initialize outputs *oiter = make_float4(0.0f, 0.0f, 0.0f, 0.0f); do { // Compute forces of all bodies MAPS_FOREACH(oiter, outBodies) { MAPS_FOREACH_ALIGNED(iter, other_bodies, oiter) bodyBodyInteraction<float>(*oiter, *bodies_in.align(oiter), *iter); } other_bodies.nextChunk(); } while (!other_bodies.isDone()); auto viter = velocity_out.begin(); MAPS_FOREACH(oiter, outBodies) { // Integrate: compute outputs from acceleration float4 vel = *velocity_in.align(oiter); vel.x += (*oiter).x * deltaTime; // Compute velocity vel.y += (*oiter).y * deltaTime; vel.z += (*oiter).z * deltaTime; vel *= DAMPING_FACTOR; *oiter = *bodies_in.align(oiter); // Get current position (*oiter).x += vel.x * deltaTime; // Update position (*oiter).y += vel.y * deltaTime; (*oiter).z += vel.z * deltaTime; *viter = vel; // Write velocity and increment iterator ++viter; } velocity_out.commit(); // Commit velocity and positions outBodies.commit(); }
-
Matrix Multiplication
template <typename T> __global__ void GEMMKernel(MAPS_MULTIDEF2, maps::Block2D<T, 0, BLOCK_WIDTH, BLOCK_HEIGHT> A, maps::Block2D<T, 1, BLOCK_WIDTH, BLOCK_HEIGHT> B, maps::StructuredInjective2D<T, BLOCK_WIDTH, BLOCK_HEIGHT> C) { MAPS_MULTI_INITVARS(A, B, C); *C.begin() = T(0); do { MAPS_FOREACH(oiter, C) { auto B_iter = B.align(oiter); // Initialize B's iterator in addition to A MAPS_FOREACH_ALIGNED(A_iter, A, oiter) { *oiter += (*A_iter) * (*B_iter); ++B_iter; } } maps::NextChunkAll(A, B); // Efficiently advance chunk on both containers } while (!A.isDone()); if (C.Items() > 0) // Output results C.commit(); }
-
Matrix Multiplication (CUBLAS)
bool SGEMMRoutine(void *context, int deviceIdx, cudaStream_t stream, const maps::multi::GridSegment& task_segment, const std::vector<void *>& parameters, const std::vector<maps::multi::DatumSegment>& container_segments, const std::vector<maps::multi::DatumSegment>& container_allocation) { CUBLASContext *c = (CUBLASContext *)context; float alpha, beta; maps::GetConstantParameter(parameters[3], alpha); // Obtain constants maps::GetConstantParameter(parameters[4], beta); int m, n, k; m = container_segments[0].m_dimensions[0]; // Compute matrix segment dimensions n = container_segments[1].m_dimensions[0]; k = container_segments[2].m_dimensions[1]; // Set GPU stream and call actual kernel CUBLAS_CHECK(cublasSetStream(c->handles[deviceIdx], stream)); CUBLAS_CHECK(cublasSgemm(c->handles[deviceIdx], CUBLAS_OP_N, CUBLAS_OP_N, m, k, n, &alpha, (float *)parameters[0], container_segments[0].m_stride_bytes / sizeof(float), (float *)parameters[1], container_segments[1].m_stride_bytes / sizeof(float), &beta, (float *)parameters[2], container_segments[2].m_stride_bytes / sizeof(float))); return true; }
sched.InvokeUnmodified(SGEMMRoutine, &context, dim3(), maps::multi::Block2DUnmodified<0, float>(A), maps::multi::Block2DUnmodified<1, float>(B), maps::multi::StructuredInjective2D<float>(C), alpha, beta);
-
Sparse Matrix-Vector Multiplication (SpMV)
template <typename T> __global__ void SpMVMapsMultiKernel MAPS_MULTIDEF(maps::Adjacency<T, T> in_graph, maps::StructuredInjective1D<T, BLOCK_WIDTH> out) { MAPS_MULTI_INITVARS(in_graph, out); if (out.Items() == 0) return; T result = T(0); const auto eit = in_graph.end(); for (auto it = in_graph.begin(); it != eit; ++it) { result += (*it).edge_weight * // Matrix value (*it).adjacent_node_value; // Vector value } *out.begin() = result; // Write and commit output out.commit(); }