Merge pull request #1 from allemangD/gpu

Gpu
This commit is contained in:
2019-12-11 07:38:26 -05:00
committed by GitHub
19 changed files with 1285 additions and 0 deletions

1
.gitignore vendored
View File

@@ -3,3 +3,4 @@
CMakeFiles
cmake-build*
*.swp
main

View File

@@ -5,3 +5,5 @@ add_subdirectory(common)
add_subdirectory(cpu-slo)
add_subdirectory(cpu-opt)
add_subdirectory(gpu-slo)

6
gpu-opt/CMakeLists.txt Normal file
View File

@@ -0,0 +1,6 @@
# Builds the single-source CUDA executable `main` from main.cu.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(main LANGUAGES CXX CUDA)

add_executable(main main.cu)

# C++11 is required for the target.
target_compile_features(main PUBLIC cxx_std_11)

# Enable separable compilation of device code for this target.
set_property(TARGET main PROPERTY CUDA_SEPARABLE_COMPILATION ON)

9
gpu-opt/Makefile Normal file
View File

@@ -0,0 +1,9 @@
# clean and queue do not produce files of those names.
.PHONY : clean queue

# main.cu includes both util.h and groups.h, so a change to either must trigger a rebuild.
main : main.cu util.h groups.h
	nvcc -o main -std=c++11 -O3 main.cu

# -f: do not fail when the binary has not been built yet.
clean :
	rm -f main

# Submit the benchmark script to the batch queue, running from this directory.
queue : main
	qsub -q mamba -l walltime=24:00:00 -l nodes=1:ppn=1:gpus=1 -d `pwd` run_gpu_opt.sh

127
gpu-opt/groups.h Normal file
View File

@@ -0,0 +1,127 @@
#pragma once
#include "util.h"
#include <iostream>
/*
 * Four generators; pairs (0,1) and (2,3) get multiplicity `res`,
 * every other pair keeps the default assigned by make_coxeter.
 * Order 4*res*res
 */
Coxeter torus(int res) {
    const std::vector<Rel> diagram = {
        {0, 1, res},
        {2, 3, res},
    };
    return make_coxeter(4, diagram);
}
/*
 * Linear diagram on `dim` generators: multiplicity 4 between generators 0 and 1,
 * multiplicity 3 between each later consecutive pair.
 *
 * The (0,1) relation is only added when generator 1 exists (dim >= 2);
 * previously a dim below 2 handed make_coxeter a generator index outside
 * its dim x dim table.
 */
Coxeter hypercube(int dim) {
    std::vector<Rel> rels;
    if (dim >= 2) {
        rels.push_back({0, 1, 4});
    }
    for (int d = 2; d < dim; d++) {
        rels.push_back({d - 1, d, 3});
    }
    return make_coxeter(dim, rels);
}
/*
 * Four generators in a chain with multiplicities 5, 3, 3.
 * Order 14,400
 */
Coxeter H4() {
    const std::vector<Rel> diagram = {
        {0, 1, 5},
        {1, 2, 3},
        {2, 3, 3},
    };
    return make_coxeter(4, diagram);
}
/*
 * Six generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 51,840
 */
Coxeter E6() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
    };
    return make_coxeter(6, diagram);
}
/*
 * Seven generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 2,903,040
 */
Coxeter E7() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
        {5, 6, 3},
    };
    return make_coxeter(7, diagram);
}
/*
 * Eight generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 696,729,600
 */
Coxeter E8() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
        {5, 6, 3},
        {6, 7, 3},
    };
    return make_coxeter(8, diagram);
}
/*
 * returns coxeter group based on the arguments
 * prints out type and arguments, without an endline
 *
 * argv[1] selects the group: 0 torus, 1 H4, 2 E6, 3 E7, 4 E8, 5 hypercube.
 * Types 0 and 5 additionally require a positive integer in argv[2]; for the
 * other types "-1" is printed in the argument column.
 *
 * Any missing, non-numeric or out-of-range argument is reported on stderr and
 * terminates the process with EXIT_FAILURE. (Previously non-numeric text was
 * silently read as 0, which selected the torus.)
 */
Coxeter proc_args(int argc, const char* argv[]) {
    // Parses the whole string as a base-10 int; exits on trailing junk or overflow.
    auto parse_int = [](const char *text, const char *what) -> int {
        char *end = nullptr;
        const long value = std::strtol(text, &end, 10);
        const int narrowed = static_cast<int>(value);
        if (end == text || *end != '\0' || narrowed != value) {
            std::cerr << "Invalid " << what << ": '" << text << "'" << std::endl;
            exit(EXIT_FAILURE);
        }
        return narrowed;
    };

    if (argc < 2) {
        std::cerr << "missing type argument." << std::endl;
        exit(EXIT_FAILURE);
    }

    const int type = parse_int(argv[1], "type");
    std::cout << type << ',';

    switch (type) {
        case 0: {
            if (argc < 3) {
                std::cerr << "Must provide a size for torus!" << std::endl;
                exit(EXIT_FAILURE);
            }
            const int res = parse_int(argv[2], "torus size");
            if (res < 1) {
                std::cerr << "Torus size must be positive!" << std::endl;
                exit(EXIT_FAILURE);
            }
            std::cout << res << ',';
            return torus(res);
        }
        case 1:
            std::cout << -1 << ',';
            return H4();
        case 2:
            std::cout << -1 << ',';
            return E6();
        case 3:
            std::cout << -1 << ',';
            return E7();
        case 4:
            std::cout << -1 << ',';
            return E8();
        case 5: {
            if (argc < 3) {
                std::cerr << "Must provide a dimension for hypercube!" << std::endl;
                exit(EXIT_FAILURE);
            }
            const int dim = parse_int(argv[2], "hypercube dimension");
            if (dim < 1) {
                std::cerr << "Hypercube dimension must be positive!" << std::endl;
                exit(EXIT_FAILURE);
            }
            std::cout << dim << ',';
            return hypercube(dim);
        }
    }

    std::cerr << "Not a valid type!" << std::endl;
    exit(EXIT_FAILURE);
}

245
gpu-opt/main.cu Normal file
View File

@@ -0,0 +1,245 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/logical.h>
#include <vector>
#include <iostream>
#include <chrono>
#include "util.h"
#include "groups.h"
// Relation list in constant memory, indexed by Row::rel in the device functors.
// Fixed capacity of 128 entries; filled from the host in main via cudaMemcpyToSymbol.
__constant__ Rel c_rels[128];
// Number of relations, kept as a one-element array. Written by main; no device
// code visible in this file reads it.
__constant__ int c_nrels[1];
// Number of generators, kept as a one-element array. Used by Solver as the row
// stride of the flat coset table (index = coset * ngens + generator).
__constant__ int c_ngens[1];
// One row of a relation table, scanned inwards from both ends by Solver.
struct Row {
    int rel;       // index of the relation this row belongs to
    int l, r;      // left / right scan positions within the row
    int from, to;  // coset currently reached at position l / position r

    // Zero-initialised row; usable on host and device.
    __host__ __device__
    Row() : rel(0), l(0), r(0), from(0), to(0) {}

    // Fresh row of `size` positions for relation `rel`, with both ends at coset `cos`.
    __device__
    Row(int rel, int cos, int size)
        : rel(rel), l(0), r(size - 1), from(cos), to(cos) {}
};
// Debug formatting for a Row: "Row[rel]{l:from-to:r}".
std::ostream &operator<<(std::ostream &o, const Row &r) {
    o << "Row[" << r.rel << "]{";
    o << r.l << ":" << r.from;
    o << "-";
    o << r.to << ":" << r.r;
    o << "}";
    return o;
}
// Performs one pass over a single relation-table row and, when the row closes,
// writes the deduced pair of entries into the coset table.
// Reads the relation list and generator count from c_rels / c_ngens.
struct Solver {
    int *cosets;  // raw device pointer to the flat coset table

    // Captures the table's current device pointer; the pointer is only as
    // long-lived as the vector's current storage.
    Solver(thrust::device_vector<int> &table)
        : cosets(thrust::raw_pointer_cast(table.data())) {
    }

    __device__
    void operator()(Row &drow) {
        // Work on a local copy and store it back once after scanning.
        Row cur = drow;

        // Row already closed: nothing to do.
        if (cur.l >= cur.r) {
            return;
        }

        const int stride = c_ngens[0];
        const Rel &rel = c_rels[cur.rel];

        // Advance the left end while the next table entry is defined (non-negative).
        for (; cur.l < cur.r; ++cur.l) {
            const int step = cosets[cur.from * stride + rel.gens[cur.l & 1]];
            if (step < 0) break;
            cur.from = step;
        }

        // Advance the right end the same way, moving leftwards.
        for (; cur.l < cur.r; --cur.r) {
            const int step = cosets[cur.to * stride + rel.gens[cur.r & 1]];
            if (step < 0) break;
            cur.to = step;
        }

        drow = cur;

        // Both ends met: record from <-> to under the generator at the meeting position.
        if (cur.l >= cur.r) {
            const int gen = rel.gens[cur.l & 1];
            cosets[cur.from * stride + gen] = cur.to;
            cosets[cur.to * stride + gen] = cur.from;
        }
    }
};
// Sets the initial row of the coset table from the subgroup generators:
// each generator passed in has its entry in row 0 set to coset 0.
struct CosetInitializer {
    int *cosets;  // raw device pointer to the flat coset table

    CosetInitializer(thrust::device_vector<int> &table)
        : cosets(thrust::raw_pointer_cast(table.data())) {
    }

    __device__
    void operator()(int gen) {
        int *entry = cosets + gen;
        *entry = 0;
    }
};
// Builds the relation-table row of a fixed coset for a given relation index.
struct RowGen {
    int coset;  // coset every generated row starts and ends at

    RowGen(int coset)
        : coset(coset) {
    }

    __device__
    Row operator()(int rel) {
        // A row has two positions per unit of the relation's multiplicity.
        const int length = 2 * c_rels[rel].mul;
        return Row(rel, coset, length);
    }
};
// Predicate used to keep unfinished rows: true while the two scan positions
// are more than one step apart.
// NOTE(review): rows with r - l == 1 count as finished here and are erased by
// the caller - confirm that is intended.
struct RowIncomplete {
    __device__
    bool operator()(Row r) {
        const int gap = r.r - r.l;
        return gap > 1;
    }
};
// Appends one row (ngens entries, all -1 meaning "undefined") to the coset table.
void add_row(
        int ngens,
        thrust::device_vector<int> &cosets) {
    const auto grown = cosets.size() + ngens;
    cosets.resize(grown, -1);
}
// Adds a new coset to the coset table, picking up where the last call left off.
//
// ngens   - number of generators (entries per table row)
// coset   - out: index of the row the new coset occupies
// hint    - in/out: flat index where the search for an undefined (-1) entry
//           starts; left on the entry that was filled, or at the table size
//           when none remains
// dcosets - the device-side coset table
//
// Returns true when there is no undefined entry left (nothing was added),
// false after appending a row and linking it to the entry found.
//
// The bounds check now happens before every read, and a non-positive ngens or
// a hint at/after the end returns true instead of dividing by zero or reading
// past the copied range.
bool add_coset(
        int ngens,
        int *coset,
        int *hint,
        thrust::device_vector<int> &dcosets) {
    const int offset = *hint;
    const int total = static_cast<int>(dcosets.size());
    if (ngens <= 0 || offset < 0 || offset >= total) {
        return true;
    }

    // Copy only the not-yet-scanned tail to the host in a single transfer.
    thrust::host_vector<int> tail(dcosets.begin() + offset, dcosets.end());
    const int remaining = static_cast<int>(tail.size());

    *coset = total / ngens;

    int idx = 0;
    while (idx < remaining && tail[idx] >= 0) {
        ++idx;
    }
    *hint = offset + idx;
    if (idx >= remaining) {
        return true;
    }

    const int from = *hint / ngens;
    const int gen = *hint % ngens;

    add_row(ngens, dcosets);
    // Link both directions: from --gen--> new coset and new coset --gen--> from.
    dcosets[*hint] = *coset;
    dcosets[*coset * ngens + gen] = from;
    return false;
}
// Appends one relation-table row per relation (indices 0..nrels-1) for `coset`.
void gen_rows(
        int coset,
        int nrels,
        thrust::device_vector<Row> &rows) {
    const auto first_new = rows.size();
    rows.resize(first_new + nrels);

    // Relation indices are produced on the fly and mapped to rows on the device.
    thrust::counting_iterator<int> rel_begin(0);
    thrust::counting_iterator<int> rel_end = rel_begin + nrels;
    thrust::transform(
            thrust::device,
            rel_begin, rel_end,
            rows.begin() + first_new,
            RowGen(coset));
}
// Runs the whole enumeration and returns the flat coset table
// (ngens entries per coset). Data reaches the device implicitly through the
// device_vector arguments.
//
// ngens - number of generators
// nrels - number of relations
// subs  - subgroup generators; their entries in row 0 are set to coset 0
thrust::device_vector<int> solve(
        int ngens,
        int nrels,
        thrust::device_vector<int> subs) {
    thrust::device_vector<int> cosets;
    thrust::device_vector<Row> rows;

    // Create the initial row and populate it from subs.
    add_row(ngens, cosets);
    thrust::for_each(
            thrust::device,
            subs.begin(), subs.end(),
            CosetInitializer(cosets));

    // Relation-table rows for coset 0.
    gen_rows(0, nrels, rows);

    // Progress markers shared with add_coset.
    int coset = 0;
    int hint = 0;

    for (;;) {
        // One solver pass over every row. The functor is rebuilt each round
        // because it caches the table's raw pointer and add_coset resizes the table.
        Solver pass(cosets);
        thrust::for_each(
                thrust::device,
                rows.begin(), rows.end(),
                pass);

        // add_coset reports true once no undefined entry is left: finished.
        const bool done = add_coset(
                ngens,
                &coset, &hint,
                cosets);
        if (done) {
            break;
        }

        // Relation-table rows for the coset just added.
        gen_rows(coset, nrels, rows);

        // Move rows that are no longer incomplete to the back and drop them.
        auto cut = thrust::partition(
                thrust::device,
                rows.begin(), rows.end(),
                RowIncomplete());
        rows.erase(cut, rows.end());
    }

    return cosets;
}
// Entry point: builds the group from the command line, uploads its relations
// to constant memory, runs the enumeration and prints the rest of the CSV line
// (proc_args has already printed "type,arg,").
//
// Every cudaMemcpyToSymbol result is checked, and the relation count is
// validated against the capacity of c_rels before copying.
int main(int argc, const char* argv[]) {
    // Abort with a readable message when a CUDA runtime call fails.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    Coxeter cox = proc_args(argc, argv);

    std::vector<int> subs = {};

    int nrels = cox.rels.size();
    int ngens = cox.ngens;

    // c_rels is a fixed-size constant array; refuse groups that do not fit.
    const auto capacity = sizeof(c_rels) / sizeof(c_rels[0]);
    if (cox.rels.size() > capacity) {
        std::cerr << "Too many relations (" << cox.rels.size()
                  << "); at most " << capacity << " are supported." << std::endl;
        std::exit(EXIT_FAILURE);
    }

    check(cudaMemcpyToSymbol(c_ngens, &ngens, sizeof(int)), "copy of ngens");
    check(cudaMemcpyToSymbol(c_nrels, &nrels, sizeof(int)), "copy of nrels");
    check(cudaMemcpyToSymbol(c_rels, cox.rels.data(), cox.rels.size() * sizeof(Rel)),
          "copy of relations");

    // steady_clock is monotonic, so long runs are unaffected by wall-clock adjustments.
    auto s = std::chrono::steady_clock::now();
    thrust::host_vector<int> cosets = solve(cox.ngens, nrels, subs);
    auto e = std::chrono::steady_clock::now();

    std::chrono::duration<float> diff = e - s;
    // One table row per coset; avoid dividing by zero for a group without generators.
    int order = cox.ngens > 0 ? cosets.size() / cox.ngens : 0;

    // type, arg, ngens, time, order
    std::cout << cox.ngens << ',' << diff.count() << ',' << order << std::endl;

    return 0;
}

39
gpu-opt/run_gpu_opt.sh Normal file
View File

@@ -0,0 +1,39 @@
#!/bin/sh
# Benchmark driver: runs ./main once per group and emits one CSV line per run.
# Type codes passed to ./main: 0 torus (needs size), 1 H4, 2 E6, 3 E7, 4 E8,
# 5 hypercube (needs dimension).
MAIN='./main'

# The third column printed by ./main is the generator count, not the relation count.
echo "type,arg,ngens,secs,cosets"

# Torus, sizes 25..250 in steps of 25.
TYPE=0
for ARG in $(seq 25 25 250); do
    $MAIN $TYPE $ARG
done

# Small hypercubes.
TYPE=5
for ARG in $(seq 2 5); do
    $MAIN $TYPE $ARG
done

# Remaining runs, in the same order as before.
TYPE=1
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 6
TYPE=2
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 7
TYPE=3
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 8
$MAIN $TYPE 9
TYPE=4
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 10

104
gpu-opt/tests.cu Normal file
View File

@@ -0,0 +1,104 @@
//#include <cstdio>
//#include <cstdlib>
//
//#include <thrust/host_vector.h>
//#include <thrust/device_vector.h>
//#include <thrust/sequence.h>
//
//#define N 50
//
//__global__
//void vector_add(float* out, float* a, float* b, int n) {
// for(int i = 0; i < n; i++){
// out[i] = a[i] + b[i];
// }
//}
//
//int main(){
// thrust::host_vector<float> a(N);
// thrust::sequence(a.begin(), a.end());
//
// thrust::host_vector<float> b(N);
// thrust::sequence(b.begin(), b.end());
// thrust::reverse(b.begin(), b.end());
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", a[i]);
// } printf("\n");
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", b[i]);
// } printf("\n");
//
// thrust::device_vector<float> aD = a;
// thrust::device_vector<float> bD = b;
// thrust::device_vector<float> outD(N);
//
// vector_add<<<1, 1>>>(
// thrust::raw_pointer_cast(&outD[0]),
// thrust::raw_pointer_cast(&aD[0]),
// thrust::raw_pointer_cast(&bD[0]),
// N);
//
// thrust::host_vector<float> out = outD;
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", out[i]);
// } printf("\n");
//
// return 0;
//}
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
// Host-side reference: stores *a + *b into *c (c may alias a or b).
void add_proc(int *c, int *a, int *b) {
    const int sum = *a + *b;
    *c = sum;
}
// Times 1,000,000 host-side add_proc calls and prints the final value and the
// elapsed microseconds. The label now says "1M" to match the loop count, and
// the interval is measured with the monotonic steady_clock.
void test_proc(){
    int a = 0;
    int b = 1;

    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 1000000; ++i) {
        add_proc(&a, &a, &b);
    }
    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<float, std::micro> diff = end - start;
    printf("proc: %d: 1M in %.3f micro\n", a, diff.count());
}
// Kernel: stores *a + *b into *c. No thread indexing is used, so every
// launched thread performs the same store.
__global__
void add_gpu(int *c, int *a, int *b) {
    const int sum = *a + *b;
    *c = sum;
}
// Runs add_gpu once on two device integers and prints the first value before
// and after the launch.
//
// Fixes: device elements are converted to int before being handed to printf
// (a thrust device reference must not travel through varargs), the kernel now
// receives b as its second operand instead of a three times, and a failed
// launch is reported.
void test_gpu(){
    thrust::device_vector<int> vals(2, 0);
    vals[0] = 0;
    vals[1] = 1;

    printf(" gpu: %d: 1B in %.3f micro\n", static_cast<int>(vals[0]), 0.0f);

    int *a = thrust::raw_pointer_cast(&vals[0]);
    int *b = thrust::raw_pointer_cast(&vals[1]);

    add_gpu<<<1, 1>>>(a, a, b);
    const cudaError_t launch = cudaGetLastError();
    if (launch != cudaSuccess) {
        fprintf(stderr, "add_gpu launch failed: %s\n", cudaGetErrorString(launch));
        exit(EXIT_FAILURE);
    }

    // Reading the element copies it back from the device.
    printf(" gpu: %d: 1B in %.3f micro\n", static_cast<int>(vals[0]), 0.0f);
}
// Runs the host timing check followed by the GPU smoke test.
int main(int argc, char *argv[]) {
    (void) argc;  // command-line arguments are not used
    (void) argv;

    test_proc();
    test_gpu();
    return 0;
}

63
gpu-opt/util.h Normal file
View File

@@ -0,0 +1,63 @@
#pragma once
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <vector>
#include <iostream>
// Streams a host_vector: 1 to 15 elements are printed as "[a, b, c]";
// an empty or longer vector is summarised as "host_vector{size=N}".
template<class T>
std::ostream &operator<<(std::ostream &o, const thrust::host_vector<T> &vec) {
    const auto count = vec.size();
    if (count == 0 || count > 15) {
        o << "host_vector{size=" << count << "}";
        return o;
    }

    // First element without a separator, every later one prefixed by ", ".
    o << "[" << vec[0];
    for (auto idx = count - count + 1; idx < count; ++idx) {
        o << ", " << vec[idx];
    }
    return o << "]";
}
// Streams only a size summary for a device_vector; no element is read.
template<class T>
std::ostream &operator<<(std::ostream &o, const thrust::device_vector<T> &vec) {
    o << "device_vector{size=";
    o << vec.size();
    o << "}";
    return o;
}
// One relation between a pair of generators. Kept as a plain aggregate: it is
// brace-initialised as {gen0, gen1, mul} throughout and copied to the device.
struct Rel {
// indices of the two generators involved
int gens[2];
// multiplicity of the pair; relation-table rows are built with 2 * mul positions
int mul;
};
// A group description as produced by make_coxeter: generator count plus relation list.
struct Coxeter {
// number of generators
int ngens;
// relations; make_coxeter stores one per generator pair (i < j)
std::vector<Rel> rels;
};
// Expands a sparse relation list into a full one with an entry for every
// generator pair (i < j).
//
// ngens - number of generators
// rels  - explicitly given relations; every pair not listed gets multiplicity 2
//
// Returns a Coxeter holding ngens and the expanded list, ordered by i, then j.
// Throws std::out_of_range (from vector::at) when a relation names a generator
// outside [0, ngens).
//
// The multiplicity table is a std::vector instead of a variable-length array
// (not standard C++), and the function is inline because it lives in a header.
inline Coxeter make_coxeter(int ngens, const std::vector<Rel> &rels) {
    typedef std::vector<int>::size_type Index;
    const Index n = ngens > 0 ? static_cast<Index>(ngens) : 0;

    // n x n table, every pair starting at the default multiplicity 2.
    std::vector<std::vector<int>> mults(n, std::vector<int>(n, 2));

    for (const auto &r : rels) {
        // A negative index converts to a huge unsigned value and is rejected by at().
        const Index a = static_cast<Index>(r.gens[0]);
        const Index b = static_cast<Index>(r.gens[1]);
        mults.at(a).at(b) = r.mul;
        mults.at(b).at(a) = r.mul;
    }

    std::vector<Rel> res;
    res.reserve(n > 1 ? n * (n - 1) / 2 : 0);
    for (int i = 0; i < ngens; i++) {
        for (int j = i + 1; j < ngens; j++) {
            res.push_back({i, j, mults[i][j]});
        }
    }

    return {ngens, res};
}

6
gpu-slo/CMakeLists.txt Normal file
View File

@@ -0,0 +1,6 @@
# Builds the single-source CUDA executable `main` from main.cu.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(main LANGUAGES CXX CUDA)

add_executable(main main.cu)

# C++11 is required for the target.
target_compile_features(main PUBLIC cxx_std_11)

# Enable separable compilation of device code for this target.
set_property(TARGET main PROPERTY CUDA_SEPARABLE_COMPILATION ON)

9
gpu-slo/Makefile Normal file
View File

@@ -0,0 +1,9 @@
# clean and queue do not produce files of those names.
.PHONY : clean queue

# main.cu includes both util.h and groups.h, so a change to either must trigger a rebuild.
main : main.cu util.h groups.h
	nvcc -o main -std=c++11 -O3 main.cu

# -f: do not fail when the binary has not been built yet.
clean :
	rm -f main

# Submit the benchmark script to the batch queue, running from this directory.
queue : main
	qsub -q mamba -l walltime=24:00:00 -l nodes=1:ppn=1:gpus=1 -d `pwd` run_gpu.sh

127
gpu-slo/groups.h Normal file
View File

@@ -0,0 +1,127 @@
#pragma once
#include "util.h"
#include <iostream>
/*
 * Four generators; pairs (0,1) and (2,3) get multiplicity `res`,
 * every other pair keeps the default assigned by make_coxeter.
 * Order 4*res*res
 */
Coxeter torus(int res) {
    const std::vector<Rel> diagram = {
        {0, 1, res},
        {2, 3, res},
    };
    return make_coxeter(4, diagram);
}
/*
 * Linear diagram on `dim` generators: multiplicity 4 between generators 0 and 1,
 * multiplicity 3 between each later consecutive pair.
 *
 * The (0,1) relation is only added when generator 1 exists (dim >= 2);
 * previously a dim below 2 handed make_coxeter a generator index outside
 * its dim x dim table.
 */
Coxeter hypercube(int dim) {
    std::vector<Rel> rels;
    if (dim >= 2) {
        rels.push_back({0, 1, 4});
    }
    for (int d = 2; d < dim; d++) {
        rels.push_back({d - 1, d, 3});
    }
    return make_coxeter(dim, rels);
}
/*
 * Four generators in a chain with multiplicities 5, 3, 3.
 * Order 14,400
 */
Coxeter H4() {
    const std::vector<Rel> diagram = {
        {0, 1, 5},
        {1, 2, 3},
        {2, 3, 3},
    };
    return make_coxeter(4, diagram);
}
/*
 * Six generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 51,840
 */
Coxeter E6() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
    };
    return make_coxeter(6, diagram);
}
/*
 * Seven generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 2,903,040
 */
Coxeter E7() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
        {5, 6, 3},
    };
    return make_coxeter(7, diagram);
}
/*
 * Eight generators, all listed edges of multiplicity 3; generator 2 is the
 * branch point (joined to 1, 3 and 4).
 * Order 696,729,600
 */
Coxeter E8() {
    const std::vector<Rel> diagram = {
        {0, 1, 3},
        {1, 2, 3},
        {2, 3, 3},
        {2, 4, 3},
        {4, 5, 3},
        {5, 6, 3},
        {6, 7, 3},
    };
    return make_coxeter(8, diagram);
}
/*
 * returns coxeter group based on the arguments
 * prints out type and arguments, without an endline
 *
 * argv[1] selects the group: 0 torus, 1 H4, 2 E6, 3 E7, 4 E8, 5 hypercube.
 * Types 0 and 5 additionally require a positive integer in argv[2]; for the
 * other types "-1" is printed in the argument column.
 *
 * Any missing, non-numeric or out-of-range argument is reported on stderr and
 * terminates the process with EXIT_FAILURE. (Previously non-numeric text was
 * silently read as 0, which selected the torus.)
 */
Coxeter proc_args(int argc, const char* argv[]) {
    // Parses the whole string as a base-10 int; exits on trailing junk or overflow.
    auto parse_int = [](const char *text, const char *what) -> int {
        char *end = nullptr;
        const long value = std::strtol(text, &end, 10);
        const int narrowed = static_cast<int>(value);
        if (end == text || *end != '\0' || narrowed != value) {
            std::cerr << "Invalid " << what << ": '" << text << "'" << std::endl;
            exit(EXIT_FAILURE);
        }
        return narrowed;
    };

    if (argc < 2) {
        std::cerr << "missing type argument." << std::endl;
        exit(EXIT_FAILURE);
    }

    const int type = parse_int(argv[1], "type");
    std::cout << type << ',';

    switch (type) {
        case 0: {
            if (argc < 3) {
                std::cerr << "Must provide a size for torus!" << std::endl;
                exit(EXIT_FAILURE);
            }
            const int res = parse_int(argv[2], "torus size");
            if (res < 1) {
                std::cerr << "Torus size must be positive!" << std::endl;
                exit(EXIT_FAILURE);
            }
            std::cout << res << ',';
            return torus(res);
        }
        case 1:
            std::cout << -1 << ',';
            return H4();
        case 2:
            std::cout << -1 << ',';
            return E6();
        case 3:
            std::cout << -1 << ',';
            return E7();
        case 4:
            std::cout << -1 << ',';
            return E8();
        case 5: {
            if (argc < 3) {
                std::cerr << "Must provide a dimension for hypercube!" << std::endl;
                exit(EXIT_FAILURE);
            }
            const int dim = parse_int(argv[2], "hypercube dimension");
            if (dim < 1) {
                std::cerr << "Hypercube dimension must be positive!" << std::endl;
                exit(EXIT_FAILURE);
            }
            std::cout << dim << ',';
            return hypercube(dim);
        }
    }

    std::cerr << "Not a valid type!" << std::endl;
    exit(EXIT_FAILURE);
}

281
gpu-slo/main.cu Normal file
View File

@@ -0,0 +1,281 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/logical.h>
#include <vector>
#include <iostream>
#include <chrono>
#include "util.h"
#include "groups.h"
// One row of a relation table, scanned inwards from both ends by Solver.
struct Row {
    int rel;        // index of the relation this row belongs to
    int l, r;       // left / right scan positions within the row
    int from, to;   // coset currently reached at position l / position r
    bool learning;  // set by Solver: true when the last pass closed the row

    // Zero-initialised row marked as learning; usable on host and device.
    __host__ __device__
    Row() : rel(0), l(0), r(0), from(0), to(0), learning(true) {}

    // Fresh row of `size` positions for relation `rel`, with both ends at coset `cos`.
    __device__
    Row(int rel, int cos, int size)
        : rel(rel), l(0), r(size - 1), from(cos), to(cos), learning(true) {}
};
// Debug formatting for a Row: "Row[rel]{l:from-to:r}(learning)".
std::ostream &operator<<(std::ostream &o, const Row &r) {
    o << "Row[" << r.rel << "]{";
    o << r.l << ":" << r.from;
    o << "-";
    o << r.to << ":" << r.r;
    o << "}(" << r.learning << ")";
    return o;
}
// Performs one pass over a single relation-table row and, when the row closes,
// writes the deduced pair of entries into the coset table. Row::learning is
// left true only when this pass closed the row.
struct Solver {
    int ngens;    // entries per coset-table row
    int *cosets;  // raw device pointer to the flat coset table
    Rel *rels;    // raw device pointer to the relation list

    // Captures the current device pointers of both vectors.
    Solver(int ngens,
           thrust::device_vector<int> &table,
           thrust::device_vector<Rel> &relations)
        : ngens(ngens),
          cosets(thrust::raw_pointer_cast(table.data())),
          rels(thrust::raw_pointer_cast(relations.data())) {
    }

    __device__
    void operator()(Row &r) {
        // Row was already closed before this pass: nothing new to learn.
        if (r.l >= r.r) {
            r.learning = false;
            return;
        }

        const Rel &rel = rels[r.rel];

        // Advance the left end while the next table entry is defined (non-negative).
        for (; r.l < r.r; ++r.l) {
            const int step = cosets[r.from * ngens + rel.gens[r.l & 1]];
            if (step < 0) break;
            r.from = step;
        }

        // Advance the right end the same way, moving leftwards.
        for (; r.l < r.r; --r.r) {
            const int step = cosets[r.to * ngens + rel.gens[r.r & 1]];
            if (step < 0) break;
            r.to = step;
        }

        // Both ends met: record from <-> to under the generator at the meeting position.
        const bool closed = r.l >= r.r;
        if (closed) {
            const int gen = rel.gens[r.l & 1];
            cosets[r.from * ngens + gen] = r.to;
            cosets[r.to * ngens + gen] = r.from;
        }
        r.learning = closed;
    }
};
// Sets the initial row of the coset table from the subgroup generators:
// each generator passed in has its entry in row 0 set to coset 0.
struct CosetInitializer {
    int *cosets;  // raw device pointer to the flat coset table

    CosetInitializer(thrust::device_vector<int> &table)
        : cosets(thrust::raw_pointer_cast(table.data())) {
    }

    __device__
    void operator()(int gen) {
        int *entry = cosets + gen;
        *entry = 0;
    }
};
// Builds the relation-table row of a fixed coset for a given relation index.
struct RowGen {
    Rel *rels;  // raw device pointer to the relation list
    int coset;  // coset every generated row starts and ends at

    RowGen(int coset, thrust::device_vector<Rel> &relations)
        : rels(thrust::raw_pointer_cast(relations.data())),
          coset(coset) {}

    __device__
    Row operator()(int rel) {
        // A row has two positions per unit of the relation's multiplicity.
        const int length = 2 * rels[rel].mul;
        return Row(rel, coset, length);
    }
};
// Predicate used to keep unfinished rows: true while the two scan positions
// are more than one step apart.
// NOTE(review): rows with r - l == 1 count as finished here and are erased by
// the caller - confirm that is intended.
struct RowIncomplete {
    __device__
    bool operator()(Row r) {
        const int gap = r.r - r.l;
        return gap > 1;
    }
};
// Marks a row as learning again ahead of the next round of solver passes.
struct Relearn {
    __device__
    void operator()(Row &row) {
        row.learning = true;
    }
};
// Predicate reporting a row's learning flag; used for the exit condition of
// the solver loop.
struct Learning {
    __device__
    bool operator()(Row row) {
        const bool still_learning = row.learning;
        return still_learning;
    }
};
// Appends one row (ngens entries, all -1 meaning "undefined") to the coset table.
void add_row(
        int ngens,
        thrust::device_vector<int> &cosets) {
    const auto grown = cosets.size() + ngens;
    cosets.resize(grown, -1);
}
// Adds a new coset to the coset table, picking up where the last call left off.
//
// ngens  - number of generators (entries per table row)
// coset  - out: index of the row the new coset occupies
// hint   - in/out: flat index where the search for an undefined (-1) entry
//          starts; left on the entry that was filled, or at the table size
//          when none remains
// cosets - the device-side coset table
//
// Returns true when there is no undefined entry left (nothing was added),
// false after appending a row and linking it to the entry found.
//
// The bounds check now happens before every read, and a non-positive ngens or
// a hint at/after the end returns true instead of dividing by zero or reading
// past the table.
//
// todo: this part is _real_ slow - each element read below is a separate
// device access from the host.
bool add_coset(
        int ngens,
        int *coset,
        int *hint,
        thrust::device_vector<int> &cosets) {
    const int total = static_cast<int>(cosets.size());
    if (ngens <= 0 || *hint < 0 || *hint >= total) {
        return true;
    }

    *coset = total / ngens;

    // todo: this part especially.
    while (*hint < total && cosets[*hint] >= 0) {
        *hint = *hint + 1;
    }
    if (*hint >= total) {
        return true;
    }

    const int from = *hint / ngens;
    const int gen = *hint % ngens;

    add_row(ngens, cosets);
    // Link both directions: from --gen--> new coset and new coset --gen--> from.
    cosets[*hint] = *coset;
    cosets[*coset * ngens + gen] = from;
    return false;
}
// Appends one relation-table row per entry of `rels` for `coset`.
void gen_rows(
        int coset,
        thrust::device_vector<Rel> &rels,
        thrust::device_vector<Row> &rows) {
    const auto count = rels.size();
    const auto first_new = rows.size();
    rows.resize(first_new + count);

    // Relation indices are produced on the fly and mapped to rows on the device.
    thrust::counting_iterator<int> rel_begin(0);
    thrust::transform(
            thrust::device,
            rel_begin, rel_begin + count,
            rows.begin() + first_new,
            RowGen(coset, rels));
}
// Runs the whole enumeration and returns the flat coset table
// (ngens entries per coset). Data reaches the device implicitly through the
// device_vector arguments.
//
// ngens - number of generators
// subs  - subgroup generators; their entries in row 0 are set to coset 0
// rels  - relation list
thrust::device_vector<int> solve(
        int ngens,
        thrust::device_vector<int> subs,
        thrust::device_vector<Rel> rels) {
    thrust::device_vector<int> cosets;
    thrust::device_vector<Row> rows;

    // Create the initial row and populate it from subs.
    add_row(ngens, cosets);
    thrust::for_each(
            thrust::device,
            subs.begin(), subs.end(),
            CosetInitializer(cosets));

    // Relation-table rows for coset 0.
    gen_rows(0, rels, rows);

    // Progress markers shared with add_coset.
    int coset = 0;
    int hint = 0;

    for (;;) {
        // Reset learning=true on every row.
        thrust::for_each(
                thrust::device,
                rows.begin(),
                rows.end(),
                Relearn());

        // Apply solver passes until no row reports that it is learning.
        // The functor is rebuilt each round because it caches raw pointers
        // and add_coset resizes the table.
        Solver pass(ngens, cosets, rels);
        bool any_learning;
        do {
            thrust::for_each(
                    thrust::device,
                    rows.begin(), rows.end(),
                    pass);
            any_learning = thrust::any_of(
                    thrust::device,
                    rows.begin(), rows.end(),
                    Learning());
        } while (any_learning);

        // add_coset reports true once no undefined entry is left: finished.
        const bool done = add_coset(
                ngens,
                &coset, &hint,
                cosets);
        if (done) {
            break;
        }

        // Relation-table rows for the coset just added.
        gen_rows(coset, rels, rows);

        // Move rows that are no longer incomplete to the back and drop them.
        auto cut = thrust::partition(
                thrust::device,
                rows.begin(), rows.end(),
                RowIncomplete());
        rows.erase(cut, rows.end());
    }

    return cosets;
}
// Entry point: builds the group from the command line, runs the enumeration
// and prints the rest of the CSV line (proc_args has already printed
// "type,arg,").
//
// The run is timed with the monotonic steady_clock, and the coset count is
// guarded against a zero generator count.
int main(int argc, const char* argv[]) {
    Coxeter cox = proc_args(argc, argv);

    std::vector<int> subs = {};

    // steady_clock is monotonic, so long runs are unaffected by wall-clock adjustments.
    auto s = std::chrono::steady_clock::now();
    thrust::host_vector<int> cosets = solve(cox.ngens, subs, cox.rels);
    auto e = std::chrono::steady_clock::now();

    std::chrono::duration<float> diff = e - s;
    // One table row per coset; avoid dividing by zero for a group without generators.
    int order = cox.ngens > 0 ? cosets.size() / cox.ngens : 0;

    // type, arg, ngens, time, order
    std::cout << cox.ngens << ',' << diff.count() << ',' << order << std::endl;

    return 0;
}

39
gpu-slo/run_gpu.sh Normal file
View File

@@ -0,0 +1,39 @@
#!/bin/sh
# Benchmark driver: runs ./main once per group and emits one CSV line per run.
# Type codes passed to ./main: 0 torus (needs size), 1 H4, 2 E6, 3 E7, 4 E8,
# 5 hypercube (needs dimension).
MAIN='./main'

# The third column printed by ./main is the generator count, not the relation count.
echo "type,arg,ngens,secs,cosets"

# Torus, sizes 25..250 in steps of 25.
TYPE=0
for ARG in $(seq 25 25 250); do
    $MAIN $TYPE $ARG
done

# Small hypercubes.
TYPE=5
for ARG in $(seq 2 5); do
    $MAIN $TYPE $ARG
done

# Remaining runs, in the same order as before.
TYPE=1
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 6
TYPE=2
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 7
TYPE=3
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 8
$MAIN $TYPE 9
TYPE=4
$MAIN $TYPE
TYPE=5
$MAIN $TYPE 10

104
gpu-slo/tests.cu Normal file
View File

@@ -0,0 +1,104 @@
//#include <cstdio>
//#include <cstdlib>
//
//#include <thrust/host_vector.h>
//#include <thrust/device_vector.h>
//#include <thrust/sequence.h>
//
//#define N 50
//
//__global__
//void vector_add(float* out, float* a, float* b, int n) {
// for(int i = 0; i < n; i++){
// out[i] = a[i] + b[i];
// }
//}
//
//int main(){
// thrust::host_vector<float> a(N);
// thrust::sequence(a.begin(), a.end());
//
// thrust::host_vector<float> b(N);
// thrust::sequence(b.begin(), b.end());
// thrust::reverse(b.begin(), b.end());
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", a[i]);
// } printf("\n");
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", b[i]);
// } printf("\n");
//
// thrust::device_vector<float> aD = a;
// thrust::device_vector<float> bD = b;
// thrust::device_vector<float> outD(N);
//
// vector_add<<<1, 1>>>(
// thrust::raw_pointer_cast(&outD[0]),
// thrust::raw_pointer_cast(&aD[0]),
// thrust::raw_pointer_cast(&bD[0]),
// N);
//
// thrust::host_vector<float> out = outD;
//
// for (int i = 0; i < N; ++i) {
// printf("%.1f ", out[i]);
// } printf("\n");
//
// return 0;
//}
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
// Host-side reference: stores *a + *b into *c (c may alias a or b).
void add_proc(int *c, int *a, int *b) {
    const int sum = *a + *b;
    *c = sum;
}
// Times 1,000,000 host-side add_proc calls and prints the final value and the
// elapsed microseconds. The label now says "1M" to match the loop count, and
// the interval is measured with the monotonic steady_clock.
void test_proc(){
    int a = 0;
    int b = 1;

    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 1000000; ++i) {
        add_proc(&a, &a, &b);
    }
    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<float, std::micro> diff = end - start;
    printf("proc: %d: 1M in %.3f micro\n", a, diff.count());
}
// Kernel: stores *a + *b into *c. No thread indexing is used, so every
// launched thread performs the same store.
__global__
void add_gpu(int *c, int *a, int *b) {
    const int sum = *a + *b;
    *c = sum;
}
// Runs add_gpu once on two device integers and prints the first value before
// and after the launch.
//
// Fixes: device elements are converted to int before being handed to printf
// (a thrust device reference must not travel through varargs), the kernel now
// receives b as its second operand instead of a three times, and a failed
// launch is reported.
void test_gpu(){
    thrust::device_vector<int> vals(2, 0);
    vals[0] = 0;
    vals[1] = 1;

    printf(" gpu: %d: 1B in %.3f micro\n", static_cast<int>(vals[0]), 0.0f);

    int *a = thrust::raw_pointer_cast(&vals[0]);
    int *b = thrust::raw_pointer_cast(&vals[1]);

    add_gpu<<<1, 1>>>(a, a, b);
    const cudaError_t launch = cudaGetLastError();
    if (launch != cudaSuccess) {
        fprintf(stderr, "add_gpu launch failed: %s\n", cudaGetErrorString(launch));
        exit(EXIT_FAILURE);
    }

    // Reading the element copies it back from the device.
    printf(" gpu: %d: 1B in %.3f micro\n", static_cast<int>(vals[0]), 0.0f);
}
// Runs the host timing check followed by the GPU smoke test.
int main(int argc, char *argv[]) {
    (void) argc;  // command-line arguments are not used
    (void) argv;

    test_proc();
    test_gpu();
    return 0;
}

63
gpu-slo/util.h Normal file
View File

@@ -0,0 +1,63 @@
#pragma once
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <vector>
#include <iostream>
// Streams a host_vector: 1 to 15 elements are printed as "[a, b, c]";
// an empty or longer vector is summarised as "host_vector{size=N}".
template<class T>
std::ostream &operator<<(std::ostream &o, const thrust::host_vector<T> &vec) {
    const auto count = vec.size();
    if (count == 0 || count > 15) {
        o << "host_vector{size=" << count << "}";
        return o;
    }

    // First element without a separator, every later one prefixed by ", ".
    o << "[" << vec[0];
    for (auto idx = count - count + 1; idx < count; ++idx) {
        o << ", " << vec[idx];
    }
    return o << "]";
}
// Streams only a size summary for a device_vector; no element is read.
template<class T>
std::ostream &operator<<(std::ostream &o, const thrust::device_vector<T> &vec) {
    o << "device_vector{size=";
    o << vec.size();
    o << "}";
    return o;
}
// One relation between a pair of generators. Kept as a plain aggregate: it is
// brace-initialised as {gen0, gen1, mul} throughout and copied to the device.
struct Rel {
// indices of the two generators involved
int gens[2];
// multiplicity of the pair; relation-table rows are built with 2 * mul positions
int mul;
};
// A group description as produced by make_coxeter: generator count plus relation list.
struct Coxeter {
// number of generators
int ngens;
// relations; make_coxeter stores one per generator pair (i < j)
std::vector<Rel> rels;
};
// Expands a sparse relation list into a full one with an entry for every
// generator pair (i < j).
//
// ngens - number of generators
// rels  - explicitly given relations; every pair not listed gets multiplicity 2
//
// Returns a Coxeter holding ngens and the expanded list, ordered by i, then j.
// Throws std::out_of_range (from vector::at) when a relation names a generator
// outside [0, ngens).
//
// The multiplicity table is a std::vector instead of a variable-length array
// (not standard C++), and the function is inline because it lives in a header.
inline Coxeter make_coxeter(int ngens, const std::vector<Rel> &rels) {
    typedef std::vector<int>::size_type Index;
    const Index n = ngens > 0 ? static_cast<Index>(ngens) : 0;

    // n x n table, every pair starting at the default multiplicity 2.
    std::vector<std::vector<int>> mults(n, std::vector<int>(n, 2));

    for (const auto &r : rels) {
        // A negative index converts to a huge unsigned value and is rejected by at().
        const Index a = static_cast<Index>(r.gens[0]);
        const Index b = static_cast<Index>(r.gens[1]);
        mults.at(a).at(b) = r.mul;
        mults.at(b).at(a) = r.mul;
    }

    std::vector<Rel> res;
    res.reserve(n > 1 ? n * (n - 1) / 2 : 0);
    for (int i = 0; i < ngens; i++) {
        for (int j = i + 1; j < ngens; j++) {
            res.push_back({i, j, mults[i][j]});
        }
    }

    return {ngens, res};
}

20
hpclog/log-gpu-opt-o3 Normal file
View File

@@ -0,0 +1,20 @@
type,arg,nrels,secs,cosets
0,25,4,0.38605,2500
0,50,4,1.6768,10000
0,75,4,7.64476,22500
0,100,4,19.5545,40000
0,125,4,44.443,62500
0,150,4,82.011,90000
0,175,4,124.86,122500
0,200,4,174.554,160000
0,225,4,230.284,202500
0,250,4,288.602,250000
5,2,2,0.00299317,8
5,3,3,0.00585187,48
5,4,4,0.0421133,384
5,5,5,0.439017,3840
1,-1,4,1.63292,14400
5,6,6,14.8635,46080
2,-1,6,17.6105,51840
5,7,7,734.339,645120
3,-1,7,6729.78,2903040

20
hpclog/log-gpu-slo Normal file
View File

@@ -0,0 +1,20 @@
type,arg,nrels,secs,cosets
0,25,4,1.11799,2500
0,50,4,4.33001,10000
0,75,4,10.4824,22500
0,100,4,23.661,40000
0,125,4,56.4877,62500
0,150,4,103.069,90000
0,175,4,163.208,122500
0,200,4,246.577,160000
0,225,4,354.257,202500
0,250,4,494.363,250000
5,2,2,0.0894585,8
5,3,3,0.0971892,48
5,4,4,0.189169,384
5,5,5,1.35973,3840
1,-1,4,4.76805,14400
5,6,6,22.5943,46080
2,-1,6,25.1293,51840
5,7,7,1350.11,645120
3,-1,7,14657.7,2903040

20
hpclog/log-gpu-slo-o3 Normal file
View File

@@ -0,0 +1,20 @@
type,arg,nrels,secs,cosets
0,25,4,1.03598,2500
0,50,4,3.89485,10000
0,75,4,9.97716,22500
0,100,4,22.3504,40000
0,125,4,60.0055,62500
0,150,4,107.5,90000
0,175,4,169.508,122500
0,200,4,256.386,160000
0,225,4,367.362,202500
0,250,4,508.957,250000
5,2,2,0.0747127,8
5,3,3,0.0833176,48
5,4,4,0.166347,384
5,5,5,1.24269,3840
1,-1,4,4.30961,14400
5,6,6,20.8581,46080
2,-1,6,23.1169,51840
5,7,7,1382.62,645120
3,-1,7,14806.1,2903040