// Sorting.cpp
#include <ATen/ATen.h>
#include <ATen/NumericUtils.h>
#include <ATen/Parallel.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>
namespace at {
namespace native {
namespace {
// maybe these days, one should define a random access iterator and use
// std::sort...
/* Note from TH:
I cut and pasted (slightly adapted) the quicksort code from
Sedgewick's 1978 "Implementing Quicksort Programs" article
http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf
It is the state-of-the-art existing implementation. The macros
are here to make as close a match as possible to the pseudocode of
Program 2, p. 851.
Note that other partition schemes exist and are typically presented
in textbooks, but those are less efficient. See e.g.
http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto
Julien, November 12th 2013
*/
constexpr int64_t MAX_LEVELS = 300;
constexpr int64_t M_SMALL = 10; // Limit for small subfiles
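// dim_apply: applies `f` to each set of 1-D slices taken along `dim` from the
// given tensors. The iteration over all remaining dimensions is parallelized
// with at::parallel_for; `f` receives the flattened iteration index and a
// TensorList of the narrowed (1-D) views, one per input tensor. For example,
// for a 2x3 tensor and dim == 1, `f` is invoked twice, each time with a view
// of length 3.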
template <typename Fn>
void dim_apply(TensorList tensors, int64_t dim, Fn f) {
AT_ASSERT(tensors.size() > 0);
auto t = tensors[0];
auto sizes = t.sizes();
int64_t ndim = t.dim();
int64_t itersize = 1;
for (int64_t i = 0; i < ndim; i++) {
if (i != dim) {
itersize *= t.size(i);
}
}
parallel_for(0, itersize, 1, [&](int64_t i_begin, int64_t i_end) {
std::vector<Tensor> narrowed_tensors;
narrowed_tensors.reserve(tensors.size());
for (int64_t it = i_begin; it < i_end; it++) {
narrowed_tensors.clear();
for (auto ti : tensors) {
int64_t i = it;
Tensor nt = ti;
for (int64_t d = 0; d < ndim; d++) {
if (d != dim) {
// this could be avoided for the slower-changing dimensions if done
// more carefully
nt = nt.select((d > dim ? 1 : 0), i % sizes[d]);
i = i / sizes[d];
}
}
narrowed_tensors.emplace_back(nt);
}
f(it, narrowed_tensors);
}
});
}
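// quick_select_template: in-place quickselect over a 1-D accessor. It uses
// median-of-three pivot selection and Hoare-style crossing-pointer
// partitioning (adapted from the Sedgewick quicksort referenced above) to
// place the element of rank `k` (0-based) at index `k` under the ordering
// induced by `gt_or_nan`; the remaining elements are only partitioned around
// it, not fully sorted. All swaps go through `swap_fn`, so callers can mirror
// them into companion arrays (e.g. an index tensor), as kthvalue does below.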
template <typename scalar_t, typename Comp, typename Fn>
void quick_select_template(
TensorAccessor<scalar_t, 1> arr,
int64_t k,
Comp gt_or_nan,
Fn swap_fn) {
int64_t P, L, R, i, j;
scalar_t piv;
L = 0;
R = arr.size(0) - 1;
do {
if (R <= L) // One element only
return;
if (R == L + 1) { // Two elements only
if (gt_or_nan(arr[L], arr[R])) {
swap_fn(L, R);
}
return;
}
// Use median of three for pivot choice
P = (L + R) >> 1;
swap_fn(P, L + 1);
if (gt_or_nan(arr[L + 1], arr[R])) {
swap_fn(L + 1, R);
}
if (gt_or_nan(arr[L], arr[R])) {
swap_fn(L, R);
}
if (gt_or_nan(arr[L + 1], arr[L])) {
swap_fn(L + 1, L);
}
i = L + 1;
j = R;
piv = arr[L];
do {
do
i++;
while (gt_or_nan(piv, arr[i]));
do
j--;
while (gt_or_nan(arr[j], piv));
if (j < i)
break;
swap_fn(i, j);
} while (1);
swap_fn(L, j);
// Re-set active partition
if (j <= k)
L = i;
if (j >= k)
R = j - 1;
} while (1);
}
} // namespace
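// kthvalue_out_cpu: writes the k-th smallest value along `dim` (k is 1-based)
// into `values` and its original index into `indices`. NaN compares as
// greater than any number, matching NumPy's behaviour, so NaNs end up at the
// top. For example, for a 1-D input {3, 1, 2} and k == 2, the result is
// value 2 with index 2.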
std::tuple<Tensor&, Tensor&> kthvalue_out_cpu(
Tensor& values,
Tensor& indices,
const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
// FIXME: This seems bogus; we only do this because it was the old behaviour.
// The reduction is fine as long as the axis being reduced along
// has at least one element (and the output has elements).
AT_CHECK(
self.numel() > 0,
"cannot perform reduction function kthvalue",
" on tensor with no elements because the operation does not have an identity");
AT_CHECK(
k > 0 && k <= (self.dim() > 0 ? self.size(dim) : 1),
"selected index k out of range");
_reduction_with_indices_allocate_or_resize_output(
values, indices, self, dim_, keepdim);
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return std::forward_as_tuple(values, indices);
}
auto tmp_values = self.clone();
auto tmp_indices = at::empty(self.sizes(), self.options().dtype(kLong));
AT_DISPATCH_ALL_TYPES(self.scalar_type(), "kthvalue_cpu", [&] {
dim_apply(
{tmp_values, tmp_indices, values, indices},
dim,
[&](int64_t i, TensorList tl) {
auto tmp_values = tl[0].accessor<scalar_t, 1>();
auto tmp_indices = tl[1].accessor<int64_t, 1>();
scalar_t* mode_value = tl[2].data<scalar_t>();
int64_t* mode_index = tl[3].data<int64_t>();
for (int64_t j = 0; j < tmp_indices.size(0); j++) {
tmp_indices[j] = j;
}
// we want NaN to be sorted to the top for NumPy compatibility
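// quick_select_template takes a 0-based rank, hence k - 1 below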
quick_select_template(
tmp_values,
k - 1,
[](scalar_t x, scalar_t y) -> bool {
return (
(_isnan<scalar_t>(x) && !_isnan<scalar_t>(y)) || (x > y));
},
[&](int64_t i, int64_t j) {
std::swap(tmp_values[i], tmp_values[j]);
std::swap(tmp_indices[i], tmp_indices[j]);
});
*mode_value = tmp_values[k - 1];
*mode_index = tmp_indices[k - 1];
});
});
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}
std::tuple<Tensor, Tensor> kthvalue(
const Tensor& self,
int64_t k,
int64_t dim,
bool keepdim) {
Tensor values = at::empty({0}, self.options());
Tensor indices = at::empty({0}, self.options().dtype(kLong));
at::kthvalue_out(values, indices, self, k, dim, keepdim);
return std::make_tuple(values, indices);
}
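// median_out: reduces to kthvalue with k = (n + 1) / 2 (1-based), i.e. the
// lower median when the size of `dim` is even. For example, n == 4 gives
// k == 2 (the second smallest element); n == 5 gives k == 3.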
std::tuple<Tensor&, Tensor&> median_out(
Tensor& values,
Tensor& indices,
const Tensor& self,
int64_t dim,
bool keepdim) {
// note: kthvalue counts from 1..n
int64_t k = self.dim() > 0 ? (self.size(dim) + 1) / 2 : 1;
at::kthvalue_out(values, indices, self, k, dim, keepdim);
return std::forward_as_tuple(values, indices);
}
std::tuple<Tensor, Tensor> median(
const Tensor& self,
int64_t dim,
bool keepdim) {
Tensor values = at::empty({0}, self.options());
Tensor indices = at::empty({0}, self.options().dtype(kLong));
at::median_out(values, indices, self, dim, keepdim);
return std::make_tuple(values, indices);
}
// this does not reduce to median with dim because we don't want to copy twice
Tensor median_cpu(const Tensor& self) {
AT_CHECK(self.numel() > 0, "median cannot be called with empty tensor");
if (self.dim() == 0 && self.numel() == 1) {
return self.clone();
}
auto tmp_values = self.clone().view(-1);
auto result = at::empty({1}, self.options());
AT_DISPATCH_ALL_TYPES(self.scalar_type(), "median", [&] {
// note: quick_select is 0-based while kthvalue is 1-based
int64_t k = (tmp_values.size(0) - 1) / 2;
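// e.g. 4 elements give k == 1, the second smallest value (the lower median)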
auto val_accessor = tmp_values.accessor<scalar_t, 1>();
quick_select_template(
val_accessor,
k,
[](scalar_t x, scalar_t y) -> bool {
return ((_isnan<scalar_t>(x) && !_isnan<scalar_t>(y)) || (x > y));
},
[&](int64_t i, int64_t j) {
std::swap(val_accessor[i], val_accessor[j]);
});
result.fill_(tmp_values[k]);
});
return result.view({});
}
} // namespace native
} // namespace at