Inverting Large Images Using CUDA

This is a simple example of how to invert a very large image, stored as a vector using nVidia's CUDA programming environment and an EVGA GeForce GTX 260 graphics card.

A GeForce GTX 260 card should have only 24 cores on it, but the nVidia SDK deviceQuery utility reports that there are 27 multiprocessors on my GeForce GTX 260 graphics card and I am able to use all 27 of them.

$ deviceQuery
CUDA Device Query (Runtime API) version (CUDART static linking)
There is 1 device supporting CUDA

Device 0: "GeForce GTX 260"
CUDA Driver Version: 2.30
CUDA Runtime Version: 2.30
CUDA Capability Major revision number: 1
CUDA Capability Minor revision number: 3
Total amount of global memory: 938803200 bytes
Number of multiprocessors: 27
Number of cores: 216
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 16384 bytes
Total number of registers available per block: 16384
Warp size: 32
Maximum number of threads per block: 512
Maximum sizes of each dimension of a block: 512 x 512 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 262144 bytes
Texture alignment: 256 bytes
Clock rate: 1.35 GHz
Concurrent copy and execution: Yes
Run time limit on kernels: Yes
Integrated: No
Support host page-locked memory mapping: Yes
Compute mode: Default (multiple host threads can use this device simultaneously)
The application is fairly trivial. It reads in an image file in PPM P6 format, stores the image in a vector of integers, one per pixel, with 24-bits used to store the RGB values. Depending on which command line option is selected ( -c invert using CPU, -g invert using GPU), it then inverts the image vector either using the CPU or the GPU (Graphical Processing Unit). Finally it writes the image out to an output file in PPM P6 format.

It uses a block size of 16 and a single kernel called myCudaInvertPixel which works on 16 rows of the image at the time via tiling.

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <climits>

#include <unistd.h>
#include <time.h>
#include <string.h>
#include <stdio.h>

#include <cuda.h>
#include <cuda_runtime_api.h>

#define BLOCK_SIZE 16
// #define DEBUG 0

// ------------ prototypes --------------
__global__ void myCudaInvertPixel(int *, int);


class CCL {

int width;
int height;

public:

CCL() { width = height = 0; }
~CCL() { width = height = 0; }

void cuda_ccl(vector& img, int w)
{
width = w;
height = img.size()/w;

#ifdef DEBUG
cout << "Image width=" << width << " height=" << height << endl;
#endif
int result = cuda_invert(img, width, height);

return;
}

private:
int cuda_invert(vector&, int, int);

};


int
CCL::cuda_invert(vector& image, int width, int height)
{
int deviceCount = 0;
int driverVersion = 0, runtimeVersion = 0;
int globalMem;
int image_size = width * height * sizeof(int);

int *cuda_image_ptr;
cudaDeviceProp deviceProp;
cudaError_t err;

int strake_size = 0;
int last_strake_size = 0;
int strakes = 0;
int this_strake_size = 0;
int offset = 0;

if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
printf("ERROR: cudaGetDeviceCount failed. CUDA Driver and Runtime version may be mismatched.\n");
return 1;
}

if (deviceCount == 0) {
printf("ERROR: No device supporting CUDA found\n");
return 1;
}

cudaGetDeviceProperties(&deviceProp, 0);
globalMem = deviceProp.totalGlobalMem;

#ifdef DEBUG
printf("Number of CUDA devices: %d\n\n", deviceCount);
printf("Device %d: \"%s\"\n", 0, deviceProp.name);
#if CUDART_VERSION >= 2020
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100);
printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
#endif
printf(" CUDA capability revision number: %d.%d\n", deviceProp.major, deviceProp.minor);
#if CUDART_VERSION >= 2000
printf(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
printf(" Number of cores: %d\n", 8 * deviceProp.multiProcessorCount);
printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
#endif
printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
printf(" Global memory: %d\n", globalMem);
printf(" Shared memory per block: %d\n", deviceProp.sharedMemPerBlock);
printf(" Registers per block: %d\n\n", deviceProp.regsPerBlock);
#endif

cudaSetDevice(0); // not needed - default is 0.

int rowBytes = width * 4; // size of a row
int maxrow = BLOCK_SIZE; // max rows that can be handled in global memory
strake_size = maxrow * rowBytes; // memory to allocate
strakes = image_size/strake_size + ((image_size % strake_size == 0) ? 0 : 1); // there X strakes necessary
last_strake_size = image_size - (strake_size * (strakes - 1)); // figure out size of last strake (may be smaller)
#if DEBUG
printf(" Image size %d\n", image_size);
printf(" Image vector size %d\n", image.size());
printf(" Row size %d\n", rowBytes);
printf(" Maxximum rows %d\n", maxrow);
printf(" Strake size %d\n", strake_size);
printf(" Strakes %d\n", strakes);
printf(" Last strake size %d\n\n", last_strake_size);
#endif

// uncomment if you want to using CUDA's elapsed timer
// float elapsed_time_ms = 0.0f;
// cudaEvent_t start, stop;
// cudaEventCreate( &start);
// cudaEventCreate( &stop);
// cudaEventRecord(start, 0);

dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(((width*4)/dimBlock.x) + (((width*4)%dimBlock.x)?1:0), BLOCK_SIZE);

cudaMalloc((void**)&cuda_image_ptr, strake_size);
if ((err = cudaGetLastError()) != cudaSuccess) {
fprintf(stderr, "ERROR: Cuda: %s\n", cudaGetErrorString(err));
exit(2);
}

for (int i = 0; i < strakes; i++) {
if (i > 0 && i == strakes - 1) {
this_strake_size = last_strake_size;
} else {
this_strake_size = strake_size;
}
offset = 0 + i *(strake_size/4);

cudaMemcpy(cuda_image_ptr, &image[offset], this_strake_size, cudaMemcpyHostToDevice);
if ((err = cudaGetLastError()) != cudaSuccess) {
fprintf(stderr, "ERROR: Cuda: %s\n", cudaGetErrorString(err));
exit(2);
}

// invoke CUDA kernel
myCudaInvertPixel<<<dimGrid, dimBlock>>>(cuda_image_ptr, width);
if ((err = cudaGetLastError()) != cudaSuccess) {
fprintf(stderr, "ERROR: Cuda: %s\n", cudaGetErrorString(err));
exit(2);
}

// not needed here - cudaThreadSynchronize();

cudaMemcpy( &image[offset], cuda_image_ptr, this_strake_size, cudaMemcpyDeviceToHost);
if ((err = cudaGetLastError()) != cudaSuccess) {
fprintf(stderr, "ERROR: Cuda: %s\n", cudaGetErrorString(err));
exit(2);
}
}

cudaFree(cuda_image_ptr);

// uncomment if you want to using CUDA's elapsed timer
// cudaEventRecord(stop, 0);
// cudaEventSynchronize(stop);
// cudaEventElapsedTime( &elapsed_time_ms, start, stop);
// printf("Elapsed time per CUDA: %f\n", elapsed_time_ms);

return 0;
}


// CUDA kernel definition - invert pixel
__global__ void myCudaInvertPixel(int *pixel, int width)
{
int ix = blockIdx.x * blockDim.x + threadIdx.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
int idx = ix + iy*width;

if (ix <= width || iy >= BLOCK_SIZE) return;

pixel[idx] = 0x00FFFFFF - pixel[idx];
}


class Timer {

private:

timespec starttime, endtime;
float elapsed;

float difftime(timespec start, timespec end) {
float temp;

temp = (end.tv_sec - start.tv_sec) +
(float) (end.tv_nsec - start.tv_nsec) / 1000000000.0;

return temp;
}

public:

Timer() { elapsed = 0.00; }

void reset() { elapsed = 0.00; }

void start() {
elapsed = 0.00;
clock_gettime(CLOCK_MONOTONIC, &starttime);
}

void stop() {
clock_gettime(CLOCK_MONOTONIC, &endtime);
elapsed = elapsed + difftime(starttime, endtime);
}

void restart() { clock_gettime(CLOCK_MONOTONIC, &starttime); }

void print() { cout << "Elapsed time: " << elapsed << endl; }

};

class PPM {

protected:

int width;
int height;
int maxColor;
vector <int> image;
friend class CCL;

public:

PPM() {
width = 0;
height = 0;
maxColor = 0;
}

int read(char *filename);

int write(char* filename);

void printsize() {
cout << "Image height: " << height << endl;
cout << "Image width: " << width <<< endl;
cout << "Image vector size: " << image.size() << endl;
}

void invertpixelNoCuda() {
for (vector<int>::iterator it = image.begin(); it != image.end(); ++it) {
*it = 0x00FFFFFF - *it;
}
}

void invertpixelCuda() {
CCL ccl;
ccl.cuda_ccl(image, width);
}

int getwidth() { return width; }

};


int
PPM::read(char *filename)
{
FILE *fp = fopen(filename, "r");
int rgb, x, y;
unsigned char red, green, blue;
char buf[256];
int *pwidth = &width;
int *pheight = &height;

if (fp== NULL) return 1;

do {
fgets(buf, 256, fp);
} while(buf[0] == '#');

if (buf[0] != 'P' || buf[1] != '6') {
fclose(fp);
return 1;
}

do {
fgets(buf, 256, fp);
} while(buf[0] == '#');

sscanf(buf, "%u %u", pwidth, pheight);
fscanf(fp, "%u\n", &maxColor);

image.resize(height*width);
for (y = 0; y < height ; y++) {
for (x = 0; x < width; x++) {
fscanf(fp, "%c%c%c", &red, &green, &blue);
rgb = ((int)blue & 0x000000ff) << 16 |
((int)green & 0x000000ff ) << 8 |
((int)red & 0x000000ff) | 0x00000000;
image[y*width + x] = rgb;
}
}

fclose(fp);
return 0;
}


int
PPM::write(char* filename)
{
FILE *fp;
int y, x;
unsigned char red=0, green=0, blue=0;
int rgb;

if ((fp = fopen(filename, "w")) == NULL) {
fprintf(stderr, "ERROR: Cannot open output file\n");
return 1;
}

fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n");

for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
rgb = image[y*width + x];
blue = (rgb & 0x00FF0000) >> 16;
green = (rgb & 0x0000FF00) >> 8;
red = (rgb & 0x000000FF);
fprintf(fp, "%c%c%c", red, green, blue);
}
}

fclose(fp);

return(0);
}


void
usage(char *name,
int mode)
{
cerr << "Usage: " << name << " [-c] [-d] [-g] infile outfile" << endl;
exit(mode);
}


int
main(int argc,
char *argv[])
{
int c;
int cflag = 0, dflag = 0, gflag = 0;
PPM ppm;
Timer timer;

if (argc == 1)
usage(argv[0], 0);

opterr = 0;
while ((c = getopt( argc, argv, "cdgh")) != -1) {
switch (c) {
case 'c':
cflag = 1;
break;
case 'd':
dflag = 1;
break;
case 'g':
gflag = 1;
break;
case '?':
case 'h':
usage(argv[0], 0);
break;
}
}

if (optind != argc-2) {
cerr << "ERROR: An input file and an output file must be specified" << endl;
usage(argv[0], 1);
}

if (ppm.read(argv[optind++])) {
cerr << "ERROR: Failed to read input file" << endl;
exit(1);
}

timer.start();

if (dflag) ppm.printsize();

if (cflag) ppm.invertpixelNoCuda();

if (gflag) ppm.invertpixelCuda();

timer.stop();

if (ppm.write(argv[optind])) {
cerr << "ERROR: Failed to write output file" << endl;
exit(1);
}

timer.print();

exit(0);
}

Well that is all there is to this application. To compile it you need to have at least the nVidia CUDA 2.3 SDK installed and the latest nVidia graphics card driver.

If you are new to CUDA programming, I suggest you download the source code, compile it and play around with it on your own computer. The man pages for Linux do not work (they are all mixed up) but good up-to-date documentation can be found online at nVidia CUDA Library Documentation.

You might also want to play around with the block (dimBlock) and grid (dimGrid) dimensions. nVidia has a useful Microsoft Excel Occupancy Calculator Spreadsheet.

P.S. I tested the application on images up to a 32000 x 32000 pixels on RHEL5.4 using the CUDA 2.3 SDK and an EVGA GeForce GTX 260 graphics card with 896Mb of memory.

Connected Component Labeling 1


Recently, I became interested in connected component labeling (CCL) algorithms because of a competition on TopCoder. Over the next few weeks I plan to explore various algorithms including parallel algorithms written to take advantage of nVidia GPUs via the CUDA programming model.

To simply things and enable me to concentrate on the algorithms, all images will be in PPM P6 format and will be loaded into a vector (image) of integers. The image vector will be processed by the CCL algorithm. The transformed image vector will then be written out to a PPM P6 file for viewing by an external application such as GIMP.

Here is an example program which contains a PPM class for reading, writing, inverting, and displaying some information about a PPM file. An image is read into the image vector with the red, green and blue components stored in specific bits of each integer. It also contains a simple timer class which uses clock_getttime() to retrieve the monotomic time.

#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <unistd.h>
#include <time.h>

using namespace std;

class Timer {

private:

timespec starttime, endtime;
float elapsed;

float difftime(timespec start, timespec end)
{
float temp;

temp = (end.tv_sec - start.tv_sec) +
(float) (end.tv_nsec - start.tv_nsec) / 1000000000.0;

return temp;
}

public:

Timer() {
elapsed = 0.00;
}

void reset() {
elapsed = 0.00;
}

void start() {
elapsed = 0.00;
clock_gettime(CLOCK_MONOTONIC, &starttime);
}

void stop() {
clock_gettime(CLOCK_MONOTONIC, &endtime);
elapsed = elapsed + difftime(starttime, endtime);
}

void restart() {
clock_gettime(CLOCK_MONOTONIC, &starttime);
}

void print() {
cout << "Elapsed time: " << elapsed << endl;
}
};


class PPM {

protected:

int width;
int height;
int maxColor;
vector image;

public:

PPM()
{
width = 0;
height = 0;
maxColor = 0;
}

int readfile(char *filename);

int writefile(char* filename);

void printsize()
{
cout << "Image height: " << height << endl;
cout << "Image width: " << width << endl;
cout << "Image vector size: " << image.size() << endl;
}

void invertpixel()
{
for (vector::iterator it = image.begin(); it != image.end(); ++it) {
*it = 0x00FFFFFF - *it;
}
}
};

int
PPM::readfile(char* filename)
{
ifstream fin;
char buf[256];
unsigned char red, green, blue;
int i, j, rgb;

fin.open(filename);
if (!fin.is_open()) return 1;

fin.getline(buf, 256);
if (buf[0] != 'P' || buf[1] != '6') {
fin.close();
return 1;
}
while (fin.peek()=='#') fin.getline(buf, 256);

fin >> width >> height >> maxColor;
fin.getline(buf, 256); // discard rest of line

image.resize(height*width);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
fin >> red >> green >> blue;
rgb = ((int)blue & 0x000000ff) << 16 |
((int)green & 0x000000ff ) << 8 |
((int)red & 0x000000ff) | 0x00000000;
image[i * width + j] = rgb;
}
}

fin.close();
return 0;
}

int
PPM::writefile(char* filename)
{
ofstream fout;
stringstream ss;
unsigned char red, green, blue;
int i, j, rgb;

fout.open(filename);
if (!fout.is_open()) return 1;

fout << "P6" << endl;
ss << width << " " << height;
fout << ss.str() << endl;
fout << "255" << endl;

for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
rgb = image[i * width + j];
red = (rgb & 0x000000FF);
green = (rgb & 0x0000FF00) >> 8;
blue = (rgb & 0x00FF0000) >> 16;
fout << red << green << blue;
}
}

fout.close();
return 0;
}
void
usage(char *name,
int mode)
{
cerr << "Usage: " << name << " [-d] infile outfile" << endl;
exit(mode);
}

int
main(int argc,
char *argv[])
{
int c, result = 0;
int dflag = 0, iflag = 0;
PPM ppm;
Timer timer;

if (argc == 1)
usage(argv[0], 0);

opterr = 0;
while ((c = getopt( argc, argv, "di")) != -1) {
switch (c) {
case 'd':
dflag = 1;
break;
case 'i':
iflag = 1;
break;
case '?':
case 'h':
usage(argv[0], 0);
break;
}
}

if (optind != argc-2) {
cerr << "ERROR: An input file and an output file must be specified" << endl;
usage(argv[0], 1);
}


timer.start();
if (ppm.readfile(argv[optind++])) {
cerr << "ERROR: Failed to read input file" << endl;
exit(1);
}

if (dflag) ppm.printsize();

if (iflag) ppm.invertpixel();

if (ppm.writefile(argv[optind])) {
cerr << "ERROR: Failed to write output file" << endl;
exit(1);
}

timer.stop();
timer.print();

exit(0);
}

Here is an alternative way of reading and writing an image file using mmap()

#include <fcntl.h>
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

int
PPM::readfile(char *filename)
{
FILE *fp;
struct stat statbuf;
char buf[256];
int *pwidth = &width;
int *pheight = &height;

if ((fp = fopen(filename, "r")) == NULL) return 1;

do {
fgets(buf, 256, fp);
} while(buf[0] == '#');

if (buf[0] != 'P' || buf[1] != '6') {
fclose(fp);
return 1;
}

do {
fgets(buf, 256, fp);
} while(buf[0] == '#');

sscanf(buf, "%u %u", pwidth, pheight);
fscanf(fp, "%u\n", &maxColor);

image.resize(height*width);
long offset = ftell(fp);
int fd = fileno(fp);
fstat(fd, &statbuf);

unsigned char* data = (unsigned char*)mmap(0, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) {
fprintf(stderr, "ERROR: mmap failed. %s\n", strerror(errno));
fclose(fp);
return 1;
}

unsigned char* data_offset = (unsigned char*)(data + offset);
unsigned char red, green, blue;
int i = 0, o = 0;
for (int y = 0; y < height * width; y++) {
blue = data_offset[i++];
green = data_offset[i++];
red = data_offset[i++];
image[o++] = red << 16 | green << 8 | blue;
}

munmap(data, statbuf.st_size);
close(fd);

return 0;
}

int
PPM::writefile(char* filename)
{
unsigned int rgb;
int result;

int fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
if (fd == -1) {
fprintf(stderr, "ERROR: Cannot create output file\n");
return 1;
}

char buf[256];
sprintf(buf, "P6\n%d %d\n255\n", width, height);
int bufsize = strlen(buf);

int mmap_size = bufsize + (height * width * 3);
if (((result = lseek(fd, mmap_size - 1, SEEK_SET)) == -1) ||
((result = write(fd, "", 1)) == -1)) {
fprintf(stderr, "ERROR: Cannot write null at end of output file\n");
close(fd);
return 1;
}

unsigned char* data = (unsigned char*)mmap(0, mmap_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (data == MAP_FAILED) {
fprintf(stderr, "ERROR: mmap failed. %s\n", strerror(errno));
close(fd);
return 1;
}

int o = 0;
for (int i = 0; i < bufsize; i++)
data[o++] = buf[i];

for (int i = 0; i < height * width; i++) {
rgb = image[i];
data[o++] = rgb & 0xFF;
data[o++] = (rgb >> 8) & 0xFF;
data[o++] = (rgb >> 16) & 0xFF;
}

munmap(data, mmap_size);
close(fd);

return(0);
}

Note the need to seek to the end of the new file and write a null before calling mmap() when writing a file. Also MAP_SHARED must be used instead of MAP_PRIVATE otherwise your file will contain nothing but zeros.

Well, that is all I have time for at present. In my next post on this subject I will implement a 4-connected thresholding union-find algorithm.