MAPS: GPU Memory Abstraction and Optimization Framework

2-Dimensional Convolution

Single GPU
Multi-GPU

__constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)];

__global__ void Conv2(maps::Window2DSingleGPU<float, BW, BH, RADIUS> in,
                      maps::StructuredInjective2DSingleGPU<float, BW, BH> out) {
    MAPS_INIT(in, out);
    
    
    MAPS_FOREACH(oiter, out) {
        int i = 0;
        *oiter = 0.0f;
        
        
        MAPS_FOREACH_ALIGNED(iter, in, oiter) {
            *oiter += *iter * kConvKernel[i++];
        }
    }
    
    out.commit();
}

__constant__ float kConvKernel[(2 * RADIUS + 1) * (2 * RADIUS + 1)];

__global__ void Conv2 MAPS_MULTIDEF(maps::Window2D<float, BW, BH, RADIUS> in,
                                    maps::StructuredInjective2D<float, BW, BH> out) {
    MAPS_MULTI_INITVARS(in, out);
    
    
    MAPS_FOREACH(oiter, out) {
        int i = 0;
        *oiter = 0.0f;
        
        
        MAPS_FOREACH_ALIGNED(iter, in, oiter) {
            *oiter += *iter * kConvKernel[i++];
        }
    }
    
    out.commit();
}

Jacobi (with multiple elements per thread)

Single GPU
Multi-GPU

template <typename T, int IPX, int IPY>
__global__ void JacobiMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, 
                                 maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> outFrame) {
    MAPS_INIT(inFrame, outFrame);

    // If there are no items to write, return
    if (outFrame.Items() == 0)
        return;

    
    MAPS_FOREACH(oiter, outFrame) {
        *oiter = (inFrame.aligned_at(oiter, 0, -1) +
                  inFrame.aligned_at(oiter, -1, 0) +
                  inFrame.aligned_at(oiter,  1, 0) +
                  inFrame.aligned_at(oiter, 0,  1)) / T(4);
    }

    outFrame.commit();
}

template <typename T, int IPX, int IPY>
__global__ void JacobiMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> inFrame, 
                                               maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> outFrame) {
    MAPS_MULTI_INITVARS(inFrame, outFrame);

    // If there are no items to write, return
    if (outFrame.Items() == 0)
        return;

    
    MAPS_FOREACH(oiter, outFrame) {
        *oiter = (inFrame.aligned_at(oiter, 0, -1) +
                  inFrame.aligned_at(oiter, -1, 0) +
                  inFrame.aligned_at(oiter,  1, 0) +
                  inFrame.aligned_at(oiter, 0,  1)) / T(4);
    }

    outFrame.commit();
}

Histogram

Single GPU
Multi-GPU

template <typename T, typename BinT, int BINS>
__global__ void Histogram(maps::Window2DSingleGPU<T, BW, BH, 0> in,
                          maps::ReductiveStaticSingleGPU<BinT, BW*BH, BINS> out) {
    MAPS_INIT(in, out);
    
    
    MAPS_FOREACH(oiter, out) {
        int bin = *iter.align(oiter);
        ++oiter[bin];
    }
    
    out.commit();
}

template <typename T, typename BinT, int BINS>
__global__ void Histogram MAPS_MULTIDEF(maps::Window2D<T, BW, BH, 0> in,
                                        maps::ReductiveStatic<BinT, BW*BH, BINS> out) {
    MAPS_MULTI_INITVARS(in, out);
    
    
    MAPS_FOREACH(oiter, out) {
        int bin = *iter.align(oiter);
        ++oiter[bin];
    }
    
    out.commit();
}

Game of Life (Advanced API)

Single GPU
Multi-GPU

template <typename T, int IPX, int IPY>
__global__ void GoLMAPSKernel(maps::WindowSingleGPU<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, 
                              maps::StructuredInjectiveOutputSingleGPU<T, 2, BW, BH, 1, IPX, IPY> next_gen) {
    
    
    
    // Allocate shared memory for containers
    __shared__ typename decltype(current_gen)::SharedData sdata;
    __shared__ typename decltype(next_gen)::SharedData osdata;
    
    // Initialize containers (pre-synchronization)
    current_gen.init_async(sdata);
    next_gen.init_async(osdata);
    
    // Synchronize thread-blocks
    __syncthreads();
    
    // Initialize containers (post-sync)
    current_gen.init_async_postsync();
    next_gen.init_async_postsync();

    // If there are no items to write, return
    if (next_gen.Items() == 0)
        return;

    
    MAPS_FOREACH(oiter, next_gen) {
        int numLiveNeighbors = 0;
        int isLive;

        // Determine number of live neighbors
        
        MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) {
            if (iter.index() == 4)
                isLive = *iter;
            else
                numLiveNeighbors += *iter;
        }

        // Game of Life conditions
        if (isLive) {
            if (numLiveNeighbors < 2 || numLiveNeighbors > 3)
                isLive = 0;
        } else {
            if (numLiveNeighbors == 3)
                isLive = 1;
        }

        // Fill output cell
        *oiter = isLive;
    }

    next_gen.commit();
}

template <typename T, int IPX, int IPY>
__global__ void GoLMAPSKernel MAPS_MULTIDEF(maps::Window<T, 2, BW, BH, 1, 1, IPX, IPY> current_gen, 
                                            maps::StructuredInjectiveOutput<T, 2, BW, BH, 1, IPX, IPY> next_gen) {
    // Initialize multi-GPU device abstraction
    MAPS_MULTI_INIT();
    
    // Allocate shared memory for containers
    __shared__ typename decltype(current_gen)::SharedData sdata;
    __shared__ typename decltype(next_gen)::SharedData osdata;
    
    // Initialize containers (pre-synchronization)
    current_gen.init_async(sdata);
    next_gen.init_async(osdata);
    
    // Synchronize thread-blocks
    __syncthreads();
    
    // Initialize containers (post-sync)
    current_gen.init_async_postsync();
    next_gen.init_async_postsync();

    // If there are no items to write, return
    if (next_gen.Items() == 0)
        return;

    
    MAPS_FOREACH(oiter, next_gen) {
        int numLiveNeighbors = 0;
        int isLive;

        // Determine number of live neighbors
        
        MAPS_FOREACH_ALIGNED(iter, current_gen, oiter) {
            if (iter.index() == 4)
                isLive = *iter;
            else
                numLiveNeighbors += *iter;
        }

        // Game of Life conditions
        if (isLive) {
            if (numLiveNeighbors < 2 || numLiveNeighbors > 3)
                isLive = 0;
        } else {
            if (numLiveNeighbors == 3)
                isLive = 1;
        }

        // Fill output cell
        *oiter = isLive;
    }

    next_gen.commit();
}

N-Body Simulation

template <int IPX>
__global__ void NBodyTimeStep MAPS_MULTIDEF(maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> bodies_in,
                                            maps::Block1D<float4, 0, BLOCK_WIDTH, IPX> other_bodies,
                                            maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> outBodies,
                                            maps::Window1D<float4, BLOCK_WIDTH, 0, IPX> velocity_in,
                                            maps::StructuredInjective1D<float4, BLOCK_WIDTH, IPX> velocity_out,
                                            float deltaTime) {
    MAPS_MULTI_INITVARS(bodies_in, other_bodies, outBodies, velocity_in, velocity_out);
    
    
    MAPS_FOREACH(oiter, outBodies)          // Initialize outputs
        *oiter = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    
    do {                                    // Compute forces of all bodies
        
        MAPS_FOREACH(oiter, outBodies) {
            
            MAPS_FOREACH_ALIGNED(iter, other_bodies, oiter)
                bodyBodyInteraction<float>(*oiter, *bodies_in.align(oiter), *iter);
        }

        other_bodies.nextChunk();
    } while (!other_bodies.isDone());

    auto viter = velocity_out.begin();
    
    MAPS_FOREACH(oiter, outBodies) {        // Integrate: compute outputs from acceleration
        float4 vel = *velocity_in.align(oiter);

        vel.x += (*oiter).x * deltaTime;    // Compute velocity
        vel.y += (*oiter).y * deltaTime;
        vel.z += (*oiter).z * deltaTime;
        vel *= DAMPING_FACTOR;
        
        *oiter = *bodies_in.align(oiter);   // Get current position
        (*oiter).x += vel.x * deltaTime;    // Update position
        (*oiter).y += vel.y * deltaTime;
        (*oiter).z += vel.z * deltaTime;
        
        *viter = vel;                       // Write velocity and increment iterator
        ++viter;
    }
    
    velocity_out.commit();                  // Commit velocity and positions
    outBodies.commit();
}

Matrix Multiplication

template <typename T>
__global__ void GEMMKernel(MAPS_MULTIDEF2,
                           maps::Block2D<T, 0, BLOCK_WIDTH, BLOCK_HEIGHT> A,
                           maps::Block2D<T, 1, BLOCK_WIDTH, BLOCK_HEIGHT> B,
                           maps::StructuredInjective2D<T, BLOCK_WIDTH, BLOCK_HEIGHT> C) {
    MAPS_MULTI_INITVARS(A, B, C);
    *C.begin() = T(0);

    do {
        
        MAPS_FOREACH(oiter, C) {            
            
            auto B_iter = B.align(oiter);      // Initialize B's iterator in addition to A
            
            MAPS_FOREACH_ALIGNED(A_iter, A, oiter) {
                *oiter += (*A_iter) * (*B_iter);
                ++B_iter;
            }
        }
        maps::NextChunkAll(A, B);              // Efficiently advance chunk on both containers
    } while (!A.isDone());

    if (C.Items() > 0)                         // Output results
        C.commit();
}

Matrix Multiplication (CUBLAS)

Multi-GPU (Device Wrapper)
Multi-GPU (Host)


bool SGEMMRoutine(void *context, int deviceIdx, cudaStream_t stream,
                  const maps::multi::GridSegment& task_segment,
                  const std::vector<void *>& parameters,
                  const std::vector<maps::multi::DatumSegment>& container_segments,
                  const std::vector<maps::multi::DatumSegment>& container_allocation) {
    CUBLASContext *c = (CUBLASContext *)context;    
    
    float alpha, beta;
    maps::GetConstantParameter(parameters[3], alpha); // Obtain constants
    maps::GetConstantParameter(parameters[4], beta);

    int m, n, k;
    m = container_segments[0].m_dimensions[0]; // Compute matrix segment dimensions
    n = container_segments[1].m_dimensions[0];
    k = container_segments[2].m_dimensions[1];

    // Set GPU stream and call actual kernel
    CUBLAS_CHECK(cublasSetStream(c->handles[deviceIdx], stream));
    CUBLAS_CHECK(cublasSgemm(c->handles[deviceIdx], CUBLAS_OP_N, CUBLAS_OP_N, m, k, n, &alpha, 
                             (float *)parameters[0], 
                             container_segments[0].m_stride_bytes / sizeof(float), 
                             (float *)parameters[1], 
                             container_segments[1].m_stride_bytes / sizeof(float), &beta, 
                             (float *)parameters[2],
                             container_segments[2].m_stride_bytes / sizeof(float)));
    return true;
}


sched.InvokeUnmodified(SGEMMRoutine, &context, dim3(),
                       maps::multi::Block2DUnmodified<0, float>(A),
                       maps::multi::Block2DUnmodified<1, float>(B),
                       maps::multi::StructuredInjective2D<float>(C), 
                       alpha, beta);

Sparse Matrix-Vector Multiplication (SpMV)

template <typename T>
__global__ void SpMVMapsMultiKernel MAPS_MULTIDEF(maps::Adjacency<T, T> in_graph,
                                                  maps::StructuredInjective1D<T, BLOCK_WIDTH> out) {
    MAPS_MULTI_INITVARS(in_graph, out);

    if (out.Items() == 0)
        return;
    T result = T(0);
    
    const auto eit = in_graph.end();
    for (auto it = in_graph.begin(); it != eit; ++it) {
        result +=
            (*it).edge_weight *           // Matrix value 
            (*it).adjacent_node_value;    // Vector value
    }
    
    *out.begin() = result;                // Write and commit output
    out.commit();
}

MAPS Showcase

Showcase