Parallel Programming Seminar

August 5-7 2009

Interdisciplinary Mathematics Institute
University of South Carolina

Convolution Filter

This CUDA program demonstrates the use of textures to implement a simple 3x3 convolution filter as part of the Parallel Programming Seminar at IMI. To compile it, enter the following command line:

/usr/local/cuda/bin/nvcc filter.cu -o filter

The program uses the PGM file format for the input and the output images. To run, type:

./filter <source_pgm> <target_pgm>

Below is the complete source code: (download)

#include <vector>
#include <exception>
#include <iostream>
#include <stdio.h>
#include <assert.h>

int const block_x=16;
int const block_y=16;

class
cuda_buffer
    {
    cuda_buffer( cuda_buffer const & );
    cuda_buffer & operator=( cuda_buffer const & );
    
    public:
    
    void * ptr_;
    size_t pitch_;
    
    explicit
    cuda_buffer( int width, int height )
        {
        assert(width>0);
        assert(height>0);
        cudaError_t err=cudaMallocPitch(&ptr_,&pitch_,width,height);
        if( err!=cudaSuccess )
            throw std::bad_alloc();
        }

    ~cuda_buffer()
        {
        cudaFree(ptr_);
        }       
    };

class
file
    {
    file( file const & );
    file & operator=( file const & );

    public:

    FILE * const f_;

    explicit
    file( FILE * f ):
        f_(f)
        {
        assert(f!=0);
        }

    ~file()
        {
        (void) fclose(f_);
        }
    };

class
io_error:
    virtual public std::exception
    {
    char const *
    what() const throw()
        {
        return "Error reading!";
        }
    };

void
read_pgm( char const * filename, int & width, int & height, std::vector<unsigned char> & data )
    {
    assert(filename && *filename);
    if( FILE* fptr = fopen(filename,"rb") )
        {
        file f(fptr);
        if( 2==fscanf(fptr, "P5 %d %d %*d%*c", &width, &height) && width>0 && height>0 )
            {
            std::vector<unsigned char> tmp(width*height);
            if( fread(&tmp[0],1,tmp.size(),f.f_)==tmp.size() )
                {
                data.swap(tmp);
                return;
                }
            }
        }
    throw io_error();
    }

void
write_pgm( char const * filename, int width, int height, unsigned char const * buf )
    {
    assert(filename && *filename);
    assert(buf!=0);
    assert(width>0);
    assert(height>0);
    if( FILE* fptr = fopen(filename, "wb") )
        {
        file f(fptr);
        fprintf(f.f_, "P5 %d %d 255\n",width, height);
        int n=width*height;
        if( n==fwrite(buf, 1, n, f.f_) )
            return;
        }
    throw io_error();
    }

struct
float33
    {
    float a00, a01, a02;
    float a10, a11, a12;
    float a20, a21, a22;
    };

texture<unsigned char,2,cudaReadModeNormalizedFloat> texref;

__device__
float33
get_sample( float x, float y )
    {
    float33 s;
    s.a00=tex2D(texref,x-1,y-1);
    s.a01=tex2D(texref,x,y-1);
    s.a02=tex2D(texref,x+1,y-1);
    s.a10=tex2D(texref,x-1,y);
    s.a11=tex2D(texref,x,y);
    s.a12=tex2D(texref,x+1,y);
    s.a20=tex2D(texref,x-1,y+1);
    s.a21=tex2D(texref,x,y+1);
    s.a22=tex2D(texref,x+1,y+1);
    return s;
    }

__device__
float
conv( float33 sample, float33 filter )
    {
    return
        sample.a00*filter.a00 +
        sample.a01*filter.a01 +
        sample.a02*filter.a02 +
        sample.a10*filter.a10 +
        sample.a11*filter.a11 +
        sample.a12*filter.a12 +
        sample.a20*filter.a20 +
        sample.a21*filter.a21 +
        sample.a22*filter.a22;
    }

__global__
void
convolve_33( unsigned char * output, int pitch, int width, int height, float33 filter )
    {
    int x = blockIdx.x*block_x + threadIdx.x;
    int y = blockIdx.y*block_y + threadIdx.y;
    float r;
    if( x<width && y<height )
        r=conv(get_sample(x,y),filter);
    __syncthreads();
    if( x<width && y<height )
        output[pitch*y+x] = 255*r;
    }

int
main( int argc, char const * argv[] )
    {
    if( argc!=3 )
        {
        std::cout << "Usage: " << argv[0] << " <input_pgm> <output_pgm>\n";
        return 1;
        }

    std::cout << "Reading " << argv[1] << "..." << std::endl;
    int width, height;
    std::vector<unsigned char> image;
    read_pgm(argv[1],width,height,image);
    cuda_buffer texture(width,height);
    cudaMemcpy2D(texture.ptr_,texture.pitch_,&image[0],width,width,height,cudaMemcpyHostToDevice);

    float33 filter;
    filter.a00=+1; filter.a01=+2; filter.a02=+1;
    filter.a10= 0; filter.a11= 0; filter.a12= 0;
    filter.a20=-1; filter.a21=-2; filter.a22=-1;

    std::cout << "Computing..." << std::endl;
    dim3 block(block_x,block_y);
    dim3 grid((width+block.x-1)/block.x,(height+block.y-1)/block.y);
    cuda_buffer output(width,height);
    cudaBindTexture2D(0,texref,texture.ptr_,cudaCreateChannelDesc(8,0,0,0,cudaChannelFormatKindUnsigned),width,height,width); 
    convolve_33<<<grid,block>>>((unsigned char *)output.ptr_,output.pitch_,width,height,filter);
    cudaMemcpy2D(&image[0],width,output.ptr_,output.pitch_,width,height,cudaMemcpyDeviceToHost);

    std::cout << "Writing " << argv[2] << "..." << std::endl;
    write_pgm(argv[2],width,height,&image[0]);

    std::cout << "Done.";
    return 0;
    }