diff --git a/CMakeLists.txt b/CMakeLists.txt index de9740b..ab49ecb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,3 +5,5 @@ add_subdirectory(common) add_subdirectory(cpu-slo) add_subdirectory(cpu-opt) + +add_subdirectory(gpu-slo) diff --git a/gpu-slo/CMakeLists.txt b/gpu-slo/CMakeLists.txt new file mode 100644 index 0000000..6ae6437 --- /dev/null +++ b/gpu-slo/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.8 FATAL_ERROR) +project(main LANGUAGES CXX CUDA) + +add_executable(main main.cu) +target_compile_features(main PUBLIC cxx_std_11) +set_target_properties(main PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/gpu-slo/Makefile b/gpu-slo/Makefile new file mode 100644 index 0000000..23f7282 --- /dev/null +++ b/gpu-slo/Makefile @@ -0,0 +1,8 @@ +main : main.cu util.h + nvcc -o main -std=c++11 main.cu + +clean : + rm main + +run : main + ./main diff --git a/gpu-slo/main b/gpu-slo/main new file mode 100755 index 0000000..38abc77 Binary files /dev/null and b/gpu-slo/main differ diff --git a/gpu-slo/main.cu b/gpu-slo/main.cu new file mode 100644 index 0000000..c6fdb48 --- /dev/null +++ b/gpu-slo/main.cu @@ -0,0 +1,178 @@ +#include +#include +#include + +#include +#include + +#include "util.h" + +// struct Cosets { +// int width; +// thrust::device_vector data{}; +// +// __host__ +// Cosets(int ngens) : width(ngens) { +// } +// +// void add_row() { +// data.resize(data.size() + width, -1); +// } +// +// thrust::host_vector get_data() { +// return data; +// } +// }; + +struct Row { + int rel; + + int l, r; + + int from, to; + + __host__ __device__ + Row() : rel(0), l(0), r(0), from(0), to(0) {} + + __device__ + Row(int rel, int cos, int size) { + l = 0; + r = size - 1; + + from = to = cos; + + this->rel = rel; + } +}; + +std::ostream &operator<<(std::ostream &o, const Row &r) { + return o << "Row[" << r.rel << "]{" << r.l << ":" << r.from << "-" << r.to << ":" << r.r << "}"; +} + +struct Rel { + int gens[2]; + int mul; +}; + +struct Solver { + int ngens; + int *cosets; + Rel *rels; + + Solver(int ngens, + thrust::device_vector &cosets, + thrust::device_vector &rels) + : ngens(ngens), + cosets(thrust::raw_pointer_cast(cosets.data())), + rels(thrust::raw_pointer_cast(rels.data())) { + } + + __device__ + void operator()(Row &r) { + if (r.l + 1 >= r.r) return; + + while ((r.r - r.l) > 0) { + int gen = rels[r.rel].gens[r.l & 1]; + int next = cosets[r.from * ngens + gen]; + if (next < 0) break; + r.l++; + r.from = next; + } + + while ((r.r - r.l) > 0) { + int gen = rels[r.rel].gens[r.r & 1]; + int next = cosets[r.to * ngens + gen]; + if (next < 0) break; + r.r--; + r.to = next; + } + + if (r.r - r.l == 0) { + int gen = rels[r.rel].gens[r.l & 1]; + cosets[r.from * ngens + gen] = r.to; + cosets[r.to * ngens + gen] = r.from; + } + } +}; + +struct CosetInitializer { + int *cosets; + + CosetInitializer(thrust::device_vector &cosets) + : cosets(thrust::raw_pointer_cast(cosets.data())) { + } + + __device__ + void operator()(int gen) { + cosets[gen] = 0; + } +}; + +struct RowGen { + Rel *rels; + + int coset; + + RowGen(int coset, thrust::device_vector &rels) + : coset(coset), + rels(thrust::raw_pointer_cast(rels.data())) {} + + __device__ + Row operator()(int rel) { + return Row(rel, coset, rels[rel].mul * 2); + } +}; + +thrust::device_vector solve( + int ngens, + thrust::device_vector subs, + thrust::device_vector rels) { + + thrust::device_vector cosets; + cosets.resize(cosets.size() + ngens, -1); + int lastCoset = 0; + + thrust::for_each(subs.begin(), subs.end(), + CosetInitializer(cosets)); + + thrust::device_vector rows; + + thrust::counting_iterator counter(0); + + thrust::device_vector new_rows(rels.size()); + thrust::transform(counter, counter + rels.size(), new_rows.begin(), + RowGen(lastCoset, rels)); + rows.insert(rows.begin(), new_rows.begin(), new_rows.end()); + + std::cout << rows << std::endl; + + Solver solv(ngens, cosets, rels); + + std::cout << thrust::host_vector(rows) << std::endl; + thrust::for_each(rows.begin(), rows.end(), solv); + std::cout << thrust::host_vector(rows) << std::endl; + + return cosets; +} + + +int main(int argc, char* argv[]) { + int ngens = 4; + std::vector rels = { + {0, 1, 4}, + {1, 2, 3}, + {2, 3, 3}, + + {0, 2, 2}, + {1, 2, 2}, + {1, 3, 2}, + }; + std::vector subs = {1, 3}; + + thrust::host_vector cosets = solve(ngens, subs, rels); + + std::cout << cosets << std::endl; + + return 0; +} + diff --git a/gpu-slo/tests.cu b/gpu-slo/tests.cu new file mode 100644 index 0000000..cad1f96 --- /dev/null +++ b/gpu-slo/tests.cu @@ -0,0 +1,104 @@ +//#include +//#include +// +//#include +//#include +//#include +// +//#define N 50 +// +//__global__ +//void vector_add(float* out, float* a, float* b, int n) { +// for(int i = 0; i < n; i++){ +// out[i] = a[i] + b[i]; +// } +//} +// +//int main(){ +// thrust::host_vector a(N); +// thrust::sequence(a.begin(), a.end()); +// +// thrust::host_vector b(N); +// thrust::sequence(b.begin(), b.end()); +// thrust::reverse(b.begin(), b.end()); +// +// for (int i = 0; i < N; ++i) { +// printf("%.1f ", a[i]); +// } printf("\n"); +// +// for (int i = 0; i < N; ++i) { +// printf("%.1f ", b[i]); +// } printf("\n"); +// +// thrust::device_vector aD = a; +// thrust::device_vector bD = b; +// thrust::device_vector outD(N); +// +// vector_add<<<1, 1>>>( +// thrust::raw_pointer_cast(&outD[0]), +// thrust::raw_pointer_cast(&aD[0]), +// thrust::raw_pointer_cast(&bD[0]), +// N); +// +// thrust::host_vector out = outD; +// +// for (int i = 0; i < N; ++i) { +// printf("%.1f ", out[i]); +// } printf("\n"); +// +// return 0; +//} + +#include +#include +#include + +#include +#include +#include + +void add_proc(int *c, int *a, int *b) { + *c = *a + *b; +} + +void test_proc(){ + int a = 0; + int b = 1; + + auto start = std::chrono::system_clock::now(); + + for (int i = 0; i < 1000000; ++i) { + add_proc(&a, &a, &b); + } + + auto end = std::chrono::system_clock::now(); + + std::chrono::duration diff = end - start; + + printf("proc: %d: 1B in %.3f micro\n", a, diff.count()); +} + +__global__ +void add_gpu(int *c, int *a, int *b) { + *c = *a + *b; +} + +void test_gpu(){ + thrust::device_vector vals(2, 0); + vals[0] = 0; + vals[1] = 1; + printf(" gpu: %d: 1B in %.3f micro\n", vals[0], 0.0f); + + int *a = thrust::raw_pointer_cast(&vals[0]); + int *b = thrust::raw_pointer_cast(&vals[1]); + + add_gpu<<<1, 1>>>(a, a, a); + + printf(" gpu: %d: 1B in %.3f micro\n", vals[0], 0.0f); +} + +int main(int argc, char *argv[]) { + test_proc(); + test_gpu(); +} + diff --git a/gpu-slo/util.h b/gpu-slo/util.h new file mode 100644 index 0000000..add032e --- /dev/null +++ b/gpu-slo/util.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +#include + +template +std::ostream &operator<<(std::ostream &o, const thrust::host_vector &vec) { + if (vec.size() == 0 || vec.size() > 15) + return o << "host_vector{size=" << vec.size() << "}"; + + o << "["; + + for (int i = 0; i < vec.size() - 1; i++) o << vec[i] << ", "; + + if (vec.size() > 0) o << vec[vec.size() - 1]; + + o << "]"; + + return o; +} + +template +std::ostream &operator<<(std::ostream &o, const thrust::device_vector &vec) { + return o << "device_vector{size=" << vec.size() << "}"; +} +