Install NVIDIA driver
Check the recommended driver for your graphics card
ubuntu-drivers devices
Install driver
sudo apt install nvidia-xxx # e.g. nvidia-340
Run nvidia-smi
to check whether the driver has been successfully installed.
If you have multiple graphics cards, run nvidia-settings
to select the NVIDIA graphics card.
Install CUDA
sudo apt install nvidia-cuda-toolkit
Run nvcc --version
to check whether the CUDA compiler has been successfully installed.
Simple demo
Create main.cc
, vector_add.cc
, vector_add_cu.cu
, Makefile
in a new project directory with the following contents:
main.cc
#include <fcntl.h>
#include <unistd.h>
#include <array>
#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>
using namespace std;
// CPU implementation (defined in vector_add.cc): c[i] = a[i] + b[i] for every i in [0, length).
void vector_add(const float *a, const float *b, float *c, unsigned length);
// GPU implementation (defined in vector_add_cu.cu): the same element-wise sum, computed by a CUDA kernel.
// a, b and c are host pointers; the function copies to and from device memory itself.
void vector_add_cu(const float *a, const float *b, float *c, unsigned length);
// Fills data[0..length) with random floats built from bytes read from `fd`.
//
// data   - destination buffer with room for `length` floats
// length - number of floats to generate
// fd     - readable file descriptor supplying random bytes (e.g. /dev/urandom)
//
// read() may legally return fewer bytes than requested, so the buffer is
// filled in a loop. If the descriptor fails or hits EOF before the buffer is
// full, the unfilled tail is zeroed so the result never contains stale memory.
void generateData(float *data, unsigned length, int fd) {
    static_assert(sizeof(float) == sizeof(std::uint32_t),
                  "bit masking below assumes a 32-bit float");

    unsigned char *cursor = reinterpret_cast<unsigned char *>(data);
    size_t remaining = sizeof(float) * length;
    while (remaining > 0) {
        ssize_t got = read(fd, cursor, remaining);
        if (got < 0 && errno == EINTR) {
            continue;  // interrupted by a signal before any data arrived: retry
        }
        if (got <= 0) {
            // Hard error or EOF: make the rest of the buffer deterministic.
            std::memset(cursor, 0, remaining);
            break;
        }
        cursor += got;
        remaining -= static_cast<size_t>(got);
    }

    for (unsigned i = 0; i < length; ++i) {
        // float32 = {S[31], E[30:23], M[22:0]},
        // when E are all ones, float32 will be inf/nan
        // let E be {0, rand[6:0]} to avoid inf/nan,
        // and overflow after addition
        // memcpy is the well-defined way to reinterpret the bits; casting
        // between int* and float* violates strict aliasing.
        std::uint32_t bits;
        std::memcpy(&bits, data + i, sizeof(bits));
        bits &= 0xbfffffffu;  // clear bit 30, the top exponent bit
        std::memcpy(data + i, &bits, sizeof(bits));
    }
}
int main() {
constexpr int length = 1000000;
static array<float, length> a, b, c_cpu, c_gpu;
int fd = open("/dev/urandom", O_RDONLY);
generateData(a.data(), length, fd);
generateData(b.data(), length, fd);
close(fd);
auto tik = chrono::high_resolution_clock::now();
vector_add(a.data(), b.data(), c_cpu.data(), length);
auto tok = chrono::high_resolution_clock::now();
double dur = chrono::duration_cast<chrono::duration<double>>(tok - tik).count();
cout << "cpu time: " << dur << "s" << endl;
tik = chrono::high_resolution_clock::now();
vector_add_cu(a.data(), b.data(), c_gpu.data(), length);
tok = chrono::high_resolution_clock::now();
dur = chrono::duration_cast<chrono::duration<double>>(tok - tik).count();
cout << "gpu time: " << dur << "s" << endl;
float error = 0.0f;
for (int i = 0; i < length; ++i) {
float diff = c_cpu[i] - c_gpu[i];
error += diff * diff;
}
cout << "accumulated error: " << error << endl;
return 0;
}
vector_add.cc
// CPU element-wise vector addition: c[i] = a[i] + b[i] for i in [0, length).
//
// a, b   - input arrays of at least `length` floats
// c      - output array of at least `length` floats
// length - number of elements to add
//
// The addition is repeated 1000 times to give the benchmark a measurable
// workload; every pass writes the same values.
void vector_add(const float *a, const float *b, float *c, unsigned length) {
    for (int t = 0; t < 1000; t++) {
        // Unsigned index matches the type of `length`: a signed int would
        // overflow (undefined behavior) before reaching lengths above INT_MAX.
        for (unsigned i = 0; i < length; ++i) {
            c[i] = a[i] + b[i];
        }
    }
}
vector_add_cu.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
// Wraps a CUDA runtime call (or a stored cudaError_t) and reports a failure
// together with the source location of the check.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a CUDA error on stderr; does nothing when `code` is cudaSuccess.
//
// code  - status returned by a CUDA runtime call
// file  - source file of the check (normally __FILE__)
// line  - source line of the check (normally __LINE__)
// abort - when true, terminate the process with `code` as the exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = false) {
    if (code != cudaSuccess) {
        // stderr, like InitCUDA's diagnostics, so errors are not mixed into
        // the program's regular output.
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
    // Success is silent: a checker that is wrapped around every call must not
    // print a line each time nothing went wrong.
}
// Selects CUDA device `device` for the calling host thread.
//
// device - zero-based device ordinal
// Returns true if the device exists and was selected; otherwise prints a
// diagnostic on stderr and returns false.
bool InitCUDA(int device) {
    // Initialized so that a failing cudaGetDeviceCount never leaves us
    // comparing against an indeterminate value.
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    // A negative ordinal is never valid, so reject it along with ordinals
    // past the last device.
    if (device < 0 || count <= device) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    err = cudaSetDevice(device);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", device, cudaGetErrorString(err));
        return false;
    }
    return true;
}
// Kernel: element-wise c[idx] = a[idx] + b[idx], one element per thread.
//
// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length;
// threads whose index falls past the end do nothing. Uses no shared memory.
// a, b, c must be device pointers to at least `length` floats.
//
// The addition is repeated 1000 times per element, mirroring the repeat
// count of the CPU vector_add so both versions do the same nominal work.
__global__ static
void add(const float *a, const float *b, float *c, unsigned length) {
    // blockIdx/blockDim/threadIdx are unsigned; keeping the index unsigned
    // avoids the wrap to a negative int that made elements at index >= 2^31
    // unreachable, and makes a separate ">= 0" test unnecessary.
    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < length) {
        for (int t = 0; t < 1000; ++t) {
            c[idx] = a[idx] + b[idx];
        }
    }
}
// GPU element-wise vector addition: c[i] = a[i] + b[i] for i in [0, length).
//
// a, b   - host input arrays of at least `length` floats
// c      - host output array of at least `length` floats
// length - number of elements to add
//
// Selects device 0, copies the inputs to device memory, launches the `add`
// kernel on the default stream and copies the result back. Every CUDA call
// is checked; on the first failure the remaining steps are skipped, the
// error is reported through gpuErrchk, all device buffers are released and
// `c` is left unmodified.
void vector_add_cu(const float *a, const float *b, float *c, unsigned length) {
    if (length == 0) {
        return;  // nothing to do; a zero-block launch would be an invalid configuration
    }
    if (!InitCUDA(0)) {
        return;  // InitCUDA has already printed the reason
    }

    const unsigned thread_num = 1024;
    const size_t bytes = sizeof(float) * length;
    // Start as NULL so the unconditional cleanup below is safe even when an
    // allocation was never reached.
    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    cudaError_t err = cudaMalloc(&dev_a, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dev_b, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dev_c, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        // blocks_num = ceil(length / thread_num), written so the intermediate
        // value cannot wrap around for lengths close to UINT_MAX.
        unsigned blocks_num = length / thread_num + (length % thread_num != 0 ? 1u : 0u);
        add<<< blocks_num, thread_num >>>(dev_a, dev_b, dev_c, length);
        // A launch returns no status itself; configuration errors are
        // picked up here, execution errors by the synchronize below.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err == cudaSuccess) err = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        gpuErrchk(err);
    }

    // cudaFree on a NULL pointer performs no operation.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}
Makefile
# Builds the vecadd demo from every .cc (g++) and .cu (nvcc) file in this directory.
CC := g++
NVCC := nvcc
FLAG := -std=c++11
# Library search path first, then the CUDA runtime library to link.
LINK := -L/usr/local/cuda/lib64 -lcudart
TAR := vecadd
# One object file per source file; the explicit dot keeps the patterns from
# matching names that merely end in "cc" or "cu".
CCOBJS := $(patsubst %.cc,%.o,$(wildcard *.cc))
CUOBJS := $(patsubst %.cu,%.o,$(wildcard *.cu))

# Neither target produces a file of its own name.
.PHONY: all clean

all: $(TAR)

%.o: %.cc
	$(CC) $(FLAG) -c $< -o $@

%.o: %.cu
	$(NVCC) -c $< -o $@

# The host compiler performs the final link, so the CUDA runtime is added via $(LINK).
$(TAR): $(CCOBJS) $(CUOBJS)
	$(CC) $(FLAG) -o $@ $^ $(LINK)

# -f: do not fail when there is nothing to remove.
clean:
	rm -f *.o $(TAR)
Compile and run the demo
cd $THE_PROJ_DIR
make
./vecadd
This demo shows the difference in time cost between the CPU and the GPU when adding two vectors.