Parallel Programming Seminar

August 5-7 2009

Interdisciplinary Mathematics Institute
University of South Carolina

matrix_transpose_shared

This is a simple CUDA program for transposing a square matrix as part of the Parallel Programming Seminar at IMI. To compile it, enter the following command line:

make -f matrix_transpose_shared.mk

To run, type:

bin/darwin/release/matrix_transpose_shared

Below is the kernel source code for this matrix_transpose variant. It stages each tile in shared memory so that both the global-memory reads and the global-memory writes are coalesced. Note the +1 in the minor dimension of the shared tile array: this padding is necessary to avoid shared memory bank conflicts when the tile is read back column-wise.

Click here for the complete source code from Day 2.

#include "matrix_transpose_shared_kernel.h"
#include <assert.h>

// Edge length, in threads, of the square thread block and of the shared-memory
// tile each block transposes. The kernel's index arithmetic and the host
// launch configuration both use this value, so they must be kept in sync.
#define BLOCK_SIZE 16

// Transposes a row-major dim x dim matrix of floats, one tile per thread block.
//
// Launch layout: 2D grid of 2D blocks, each block BLOCK_SIZE x BLOCK_SIZE
// threads (the index arithmetic below hard-codes BLOCK_SIZE rather than
// reading blockDim). Block (bx, by) loads the tile whose top-left element is
// (row by*BLOCK_SIZE, column bx*BLOCK_SIZE) of `input` and stores it,
// transposed, at (row bx*BLOCK_SIZE, column by*BLOCK_SIZE) of `output`.
//
// Shared memory: BLOCK_SIZE * (BLOCK_SIZE+1) floats, statically allocated.
//
// output  destination matrix, dim*dim floats
// input   source matrix, dim*dim floats
// dim     number of rows (== number of columns)
//
// Blocks do not synchronize with each other, so `output` must not alias
// `input`. Threads that fall outside the matrix (possible only when the grid
// over-covers it) skip their load and store but still reach the barrier.
__global__
void
matrix_transpose_shared_kernel( float *output, float const *input, int dim )
    {
    // +1 pads each row so the column-wise read below does not make
    // consecutive threads hit addresses a multiple of BLOCK_SIZE words apart.
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE+1];

    // Source coordinates: threadIdx.x walks along a row of `input`.
    int const x_in = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int const y_in = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    // Destination coordinates: the block indices are swapped, but threadIdx.x
    // still walks along a row, this time of `output`.
    int const x_out = blockIdx.y * BLOCK_SIZE + threadIdx.x;
    int const y_out = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    // Flat indices are formed in size_t: dim*y as int overflows once
    // dim*dim exceeds INT_MAX.
    if( x_in < dim && y_in < dim )
        tile[threadIdx.y][threadIdx.x] = input[(size_t)dim*y_in + x_in];

    // Every thread must reach the barrier, so it stays outside the guards.
    __syncthreads();

    // Read the tile with the thread indices swapped: this is the transpose.
    if( x_out < dim && y_out < dim )
        output[(size_t)dim*y_out + x_out] = tile[threadIdx.x][threadIdx.y];
    }

// Launches matrix_transpose_shared_kernel to write the transpose of the
// dim x dim matrix `input` into `output`.
//
// output  destination, dim*dim floats; passed straight to the kernel, so it
//         must be memory the device can write
// input   source, dim*dim floats; passed straight to the kernel, so it must
//         be memory the device can read
// dim     matrix edge length; must be positive and a multiple of BLOCK_SIZE
//
// Preconditions are enforced with assert() only, so they are not checked when
// NDEBUG is defined. The function only enqueues the kernel on the default
// stream; it does not wait for it to finish.
void
cuda_matrix_transpose_shared( float * output, float const * input, int dim )
    {
    assert(output!=0);
    assert(input!=0);
    assert(dim>0);
    // The grid below is sized by truncating division, so a dim that is not a
    // multiple of BLOCK_SIZE would leave the last rows/columns untransposed.
    assert(!(dim%BLOCK_SIZE));

    dim3 block(BLOCK_SIZE,BLOCK_SIZE);
    dim3 grid(dim/block.x,dim/block.y);
    matrix_transpose_shared_kernel<<<grid,block>>>(output,input,dim);

    // A kernel launch reports configuration errors only through the runtime's
    // last-error state. Checking inside assert() keeps this consistent with
    // the other checks here; when the assertion passes the state was already
    // cudaSuccess, so reading it changes nothing for the caller.
    assert(cudaGetLastError()==cudaSuccess);
    }

Other matrix_transpose variants used: