-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnocuda.cpp
More file actions
156 lines (135 loc) · 6.06 KB
/
nocuda.cpp
File metadata and controls
156 lines (135 loc) · 6.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/// this file exists to remove C++11 from CUDA, to support outdated nvcc compilers
#include "lqt.h"

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <algorithm>
#include <chrono>
#include <future>
#include <iostream>
#include <utility>
#include <vector>

#include "mergesort.hh"
#include "samplesort.hh"
#include "tbb/tbb.h"
using std::cout;
using std::endl;
using std::vector;
using std::pair;
using std::promise;
using std::future;
namespace {
/*
/// \todo put these in a header, so they aren't duplicated
/// initialize boundary so the first udpate overrides it.
inline void init_boundary(struct rtree_rect* boundary) {
boundary->top = ord_t_max;
boundary->bottom = ord_t_lowest;
boundary->left = ord_t_max;
boundary->right = ord_t_lowest;
}
inline void update_boundary(struct rtree_rect* boundary, struct rtree_point* p) {
/// \todo replace these with CUDA min/max which won't use conditionals
boundary->top = fmin(p->y, boundary->top);
boundary->bottom = fmax(p->y, boundary->bottom);
boundary->left = fmin(p->x, boundary->left);
boundary->right = fmax(p->x, boundary->right);
}
inline void update_boundary(struct rtree_rect* boundary, struct rtree_rect* node) {
/// \todo replace these with CUDA min/max which won't use conditionals
boundary->top = fmin(node->top, boundary->top);
boundary->bottom = fmax(node->bottom, boundary->bottom);
boundary->left = fmin(node->left, boundary->left);
boundary->right = fmax(node->right, boundary->right);
}
*/
} // namespace
// x value ALONE is used for comparison, to create an xpack
bool operator<(const lqt_unified_node& rhs, const lqt_unified_node& lhs) {
return rhs.location < lhs.location;
}
size_t tbb_num_default_thread() {
return tbb::task_scheduler_init::default_num_threads();
}
void tbb_test_scheduler_init() {
for(size_t i = 1; i < 1025; i *= 2) {
const size_t num_threads = i;
const auto start = std::chrono::high_resolution_clock::now();
tbb::task_scheduler_init init(num_threads);
const auto end = std::chrono::high_resolution_clock::now();
const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
std::cout << "tbb scheduler init test - threads: " << num_threads << "\tms: " << duration << std::endl;
}
}
linear_quadtree_unified tbb_sortify_unified(linear_quadtree_unified lqt, const size_t threads) {
// auto lowxpack = [](const rtree_point& rhs, const struct rtree_point& lhs) {
// return rhs.x < rhs.y;
// };
// tbb::task_scheduler_init init(threads);
tbb::parallel_sort(lqt.nodes, lqt.nodes + lqt.length);
return lqt;
}
linear_quadtree_unified sisd_sortify_unified(linear_quadtree_unified lqt, const size_t threads) {
std::sort(lqt.nodes, lqt.nodes + lqt.length);
return lqt;
}
/// does not block for GPU memory. Will fail, if GPU memory is insufficient.
linear_quadtree_unified lqt_create_heterogeneous(lqt_point* points, size_t len,
ord_t xstart, ord_t xend,
ord_t ystart, ord_t yend,
size_t* depth, const size_t threads) {
return tbb_sortify_unified(lqt_nodify_cuda_unified(points, len, xstart, xend, ystart, yend, depth), threads);
}
/// does not block for GPU memory. Will fail, if GPU memory is insufficient.
linear_quadtree_unified lqt_create_sisd(lqt_point* points, size_t len,
ord_t xstart, ord_t xend,
ord_t ystart, ord_t yend,
size_t* depth, const size_t threads) {
return sisd_sortify_unified(lqt_nodify_cuda_unified(points, len, xstart, xend, ystart, yend, depth), threads);
}
linear_quadtree_unified merge_sortify_unified(linear_quadtree_unified lqt, const size_t threads) {
lqt.nodes = parallel_mergesort(lqt.nodes, lqt.nodes + lqt.length, threads);
return lqt;
}
linear_quadtree_unified lqt_create_heterogeneous_mergesort(lqt_point* points, size_t len,
ord_t xstart, ord_t xend,
ord_t ystart, ord_t yend,
size_t* depth, const size_t threads) {
return merge_sortify_unified(lqt_nodify_cuda_unified(points, len, xstart, xend, ystart, yend, depth), threads);
}
linear_quadtree_unified sample_sortify_unified(linear_quadtree_unified lqt, const size_t threads) {
lqt.nodes = parallel_samplesort(lqt.nodes, lqt.nodes + lqt.length, threads);
return lqt;
}
linear_quadtree_unified lqt_create_heterogeneous_samplesort(lqt_point* points, size_t len,
ord_t xstart, ord_t xend,
ord_t ystart, ord_t yend,
size_t* depth, const size_t threads) {
return sample_sortify_unified(lqt_nodify_cuda_unified(points, len, xstart, xend, ystart, yend, depth), threads);
}
/*
/// SISD sort via single CPU core (for benchmarks)
struct rtree cuda_create_rtree_sisd(struct rtree_point* points, const size_t len) {
std::sort(points, points + len);
struct rtree_leaf* leaves = cuda_create_leaves_together(points, len);
const size_t leaves_len = DIV_CEIL(len, RTREE_NODE_SIZE);
rtree_node* previous_level = (rtree_node*) leaves;
size_t previous_len = leaves_len;
size_t depth = 1; // leaf level is 0
while(previous_len > RTREE_NODE_SIZE) {
previous_level = cuda_create_level(previous_level, previous_len);
previous_len = DIV_CEIL(previous_len, RTREE_NODE_SIZE);
++depth;
}
rtree_node* root = (rtree_node*) malloc(sizeof(rtree_node));
init_boundary(&root->bounding_box);
root->num = previous_len;
root->children = previous_level;
for(size_t i = 0, end = previous_len; i != end; ++i)
update_boundary(&root->bounding_box, &root->children[i].bounding_box);
++depth;
struct rtree tree = {depth, root};
return tree;
}
*/