Configure and test CUDA on Ubuntu

Install NVIDIA driver

Check the recommended driver for your graphics card

ubuntu-drivers devices

Install driver

sudo apt install nvidia-xxx # e.g. nvidia-340

Run nvidia-smi to check whether the driver has been installed successfully.
If you have multiple graphics cards, run nvidia-settings to select the NVIDIA graphics card.

Install CUDA

sudo apt install nvidia-cuda-toolkit

Run nvcc --version to check whether the CUDA compiler has been installed successfully.

Simple demo

Create main.cc, vector_add.cc, vector_add_cu.cu and Makefile in a new project directory with the following contents:

main.cc

#include <fcntl.h>
#include <unistd.h>

#include <array>
#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>

using namespace std;

void vector_add(const float *a, const float *b, float *c, unsigned length);
void vector_add_cu(const float *a, const float *b, float *c, unsigned length);

// Fill data[0..length) with random, finite floats.
//
//   data   - destination buffer with room for `length` floats
//   length - number of floats to generate
//   fd     - readable descriptor supplying random bytes (main passes /dev/urandom)
//
// Bytes that cannot be read (short read, EOF, bad descriptor) are replaced by
// zeros, so every element is well defined when the function returns.
void generateData(float *data, unsigned length, int fd) {
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "the bit mask below assumes a 32-bit float");

    const size_t total = sizeof(float) * static_cast<size_t>(length);
    unsigned char *raw = reinterpret_cast<unsigned char *>(data);
    size_t filled = 0;

    // read() is allowed to return fewer bytes than requested, so keep asking
    // until the buffer is full, the source is exhausted, or a real error occurs.
    while (filled < total) {
        const ssize_t got = read(fd, raw + filled, total - filled);
        if (got > 0) {
            filled += static_cast<size_t>(got);
        } else if (got < 0 && errno == EINTR) {
            continue;  // interrupted by a signal before any data arrived: retry
        } else {
            break;     // end of file or unrecoverable error
        }
    }
    if (filled < total) {
        std::memset(raw + filled, 0, total - filled);
    }

    for (unsigned i = 0; i < length; ++i) {
        // float32 = {S[31], E[30:23], M[22:0]}.
        // When E is all ones the value is inf/nan. Clearing bit 30 turns E into
        // {0, rand[6:0]}, which avoids inf/nan in the inputs and also avoids
        // overflow after the addition.
        // memcpy is used for the bit access because dereferencing a float
        // through an int pointer violates the strict-aliasing rule.
        std::uint32_t bits;
        std::memcpy(&bits, &data[i], sizeof bits);
        bits &= 0xbfffffffu;
        std::memcpy(&data[i], &bits, sizeof bits);
    }
}

int main() {
    constexpr int length = 1000000;
    static array<float, length> a, b, c_cpu, c_gpu;
    int fd = open("/dev/urandom", O_RDONLY);
    generateData(a.data(), length, fd);
    generateData(b.data(), length, fd);
    close(fd);

    auto tik = chrono::high_resolution_clock::now();
    vector_add(a.data(), b.data(), c_cpu.data(), length);
    auto tok = chrono::high_resolution_clock::now();
    double dur = chrono::duration_cast<chrono::duration<double>>(tok - tik).count();
    cout << "cpu time: " << dur << "s" << endl;

    tik = chrono::high_resolution_clock::now();
    vector_add_cu(a.data(), b.data(), c_gpu.data(), length);
    tok = chrono::high_resolution_clock::now();
    dur = chrono::duration_cast<chrono::duration<double>>(tok - tik).count();
    cout << "gpu time: " << dur << "s" << endl;

    float error = 0.0f;
    for (int i = 0; i < length; ++i) {
        float diff = c_cpu[i] - c_gpu[i];
        error += diff * diff;
    }
    cout << "accumulated error: " << error << endl;

    return 0;
}

vector_add.cc

// CPU reference: c[i] = a[i] + b[i] for every i in [0, length).
//
//   a, b   - input vectors, each holding `length` floats
//   c      - output vector with room for `length` floats
//   length - number of elements; 0 leaves c untouched
//
// The addition is repeated 1000 times, the same repeat count the GPU kernel
// in vector_add_cu.cu uses, so both sides perform the same nominal work.
void vector_add(const float *a, const float *b, float *c, unsigned length) {
    for (int t = 0; t < 1000; t++) {
        // unsigned index: matches the type of `length` and cannot overflow
        // the way a signed int would for lengths above INT_MAX
        for (unsigned i = 0; i < length; ++i) {
            c[i] = a[i] + b[i];
        }
    }
}

vector_add_cu.cu

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

// Wrap a CUDA runtime call: gpuErrchk(cudaMalloc(...));
// The call's status is handed to gpuAssert together with the call site.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Report a failed CUDA runtime call.
//
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true, terminate the process with `code` as exit status
//
// Successful calls are silent, so the check can wrap every call without
// flooding the output. Failures are written to stderr.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = false) {
    if (code == cudaSuccess) {
        return;
    }
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

// Select the CUDA device with index `device` for the calling host thread.
//
// Returns true when the device exists and was selected, false otherwise.
// The reason for a failure is written to stderr.
bool InitCUDA(int device) {
    // initialised so the range test below never reads an indeterminate value
    int count = 0;
    cudaError_t status = cudaGetDeviceCount(&count);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return false;
    }
    if (device < 0 || count <= device) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    status = cudaSetDevice(device);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", device, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Element-wise addition kernel: c[idx] = a[idx] + b[idx].
//
// Only the x dimension of the grid and block is used, and each thread handles
// at most one element, so the launch must provide at least `length` threads.
// Threads whose index falls past the end do nothing.
// The addition is repeated 1000 times, the same repeat count the CPU
// reference in vector_add.cc uses.
__global__ static
void add(const float *a, const float *b, float *c, unsigned length) {
    // Kept unsigned: narrowing to int would turn indices above INT_MAX
    // negative, and those elements would be skipped.
    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < length) {
        for (int t = 0; t < 1000; ++t) {
            c[idx] = a[idx] + b[idx];
        }
    }
}

// Report a failed CUDA step on stderr; returns true when `code` is cudaSuccess.
static bool cudaStepOk(cudaError_t code, const char *what) {
    if (code == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "vector_add_cu: %s failed: %s\n", what, cudaGetErrorString(code));
    return false;
}

// GPU counterpart of vector_add: c[i] = a[i] + b[i] for i in [0, length).
//
//   a, b   - host input vectors, each holding `length` floats
//   c      - host output vector with room for `length` floats
//   length - number of elements; 0 is a no-op
//
// Selects device 0, copies the inputs to the device, runs the `add` kernel
// with 1024 threads per block and copies the result back. The first failing
// step is reported on stderr and the remaining steps are skipped. Device
// buffers are released in every case. On failure `c` is not written.
void vector_add_cu(const float *a, const float *b, float *c, unsigned length) {
    if (length == 0) {
        return;  // nothing to add, and a grid of zero blocks is not a valid launch
    }
    if (!InitCUDA(0)) {
        return;
    }

    const unsigned thread_num = 1024;
    const size_t bytes = sizeof(float) * length;
    // blocks_num = ceil(length / thread_num), written so that the
    // intermediate value cannot wrap around for very large lengths
    const unsigned blocks_num = length / thread_num + (length % thread_num != 0 ? 1u : 0u);

    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // && short-circuits, so nothing runs after the first failing step
    bool ok = cudaStepOk(cudaMalloc(&dev_a, bytes), "cudaMalloc(dev_a)")
           && cudaStepOk(cudaMalloc(&dev_b, bytes), "cudaMalloc(dev_b)")
           && cudaStepOk(cudaMalloc(&dev_c, bytes), "cudaMalloc(dev_c)")
           && cudaStepOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device")
           && cudaStepOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    if (ok) {
        add<<< blocks_num, thread_num >>>(dev_a, dev_b, dev_c, length);
        // A launch returns no status itself: cudaGetLastError reports a bad
        // launch configuration, the synchronize call reports execution faults.
        ok = cudaStepOk(cudaGetLastError(), "kernel launch")
          && cudaStepOk(cudaDeviceSynchronize(), "kernel execution")
          && cudaStepOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");
    }

    // cudaFree accepts a null pointer, so this is safe after a partial set-up
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}

Makefile

# Builds the vecadd demo: every *.cc file is compiled with g++, every *.cu
# file with nvcc, and the objects are linked against the CUDA runtime.
CC     := g++
NVCC   := nvcc
FLAG   := -std=c++11
# Library search path first, then the library; adjust the path if libcudart
# is installed somewhere else on your system.
LINK   := -L/usr/local/cuda/lib64 -lcudart
TAR    := vecadd
# Match on the full extension (".cc" / ".cu") instead of any name that merely
# ends in the letters "cc" / "cu".
CCOBJS := $(patsubst %.cc,%.o,$(wildcard *.cc))
CUOBJS := $(patsubst %.cu,%.o,$(wildcard *.cu))

# Neither target produces a file of its own name.
.PHONY: all clean

all: $(TAR)
%.o: %.cc
	$(CC) $(FLAG) -c $<
%.o: %.cu
	$(NVCC) -c $<
$(TAR): $(CCOBJS) $(CUOBJS)
	$(CC) $(FLAG) -o $@ $^ $(LINK)
# -f: do not fail when there is nothing to remove
clean:
	rm -f *.o $(TAR)

Compile and run the demo

cd $THE_PROJ_DIR
make
./vecadd

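This demo shows the difference in time cost between the CPU and the GPU when adding two vectors. Note that the reported GPU time covers the whole vector_add_cu call, i.e. device selection, memory allocation and host/device copies as well as the kernel itself.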