random_poisson_op.cc source code [tensorflow/tensorflow/core/kernels/random_poisson_op.cc]

1	/ Copyright 2016 The TensorFlow Authors. All Rights Reserved.*
2
3	Licensed under the Apache License, Version 2.0 (the "License");
4	you may not use this file except in compliance with the License.
5	You may obtain a copy of the License at
6
7	http://www.apache.org/licenses/LICENSE-2.0
8
9	Unless required by applicable law or agreed to in writing, software
10	distributed under the License is distributed on an "AS IS" BASIS,
11	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	See the License for the specific language governing permissions and
13	limitations under the License.
14	==============================================================================/*
15
16	// See docs in ../ops/random_ops.cc.
17
18	#define EIGEN_USE_THREADS
19
20	#include "tensorflow/core/kernels/random_poisson_op.h"
21
22	#include <algorithm>
23	#include <cmath>
24	#include <limits>
25	#include <memory>
26
27	#include "tensorflow/core/framework/op_kernel.h"
28	#include "tensorflow/core/framework/register_types.h"
29	#include "tensorflow/core/framework/tensor.h"
30	#include "tensorflow/core/framework/tensor_shape.h"
31	#include "tensorflow/core/framework/tensor_util.h"
32	#include "tensorflow/core/lib/random/random_distributions.h"
33	#include "tensorflow/core/lib/random/simple_philox.h"
34	#include "tensorflow/core/util/guarded_philox_random.h"
35	#include "tensorflow/core/util/work_sharder.h"
36
37	#if EIGEN_COMP_GNUC && __cplusplus > 199711L
38	#define DISABLE_FLOAT_EQUALITY_WARNING \
39	_Pragma("GCC diagnostic push") \
40	_Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
41	#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
42	#else
43	#define DISABLE_FLOAT_EQUALITY_WARNING
44	#define ENABLE_FLOAT_EQUALITY_WARNING
45	#endif
46
47	#define UNIFORM(X) \
48	if (uniform_remaining == 0) { \
49	uniform_remaining = Uniform::kResultElementCount; \
50	uniform_result = uniform(&gen); \
51	} \
52	uniform_remaining--; \
53	CT X = uniform_result[uniform_remaining]
54
55	namespace tensorflow {
56	namespace {
57
58	static constexpr int kReservedSamplesPerOutput = `256`;
59
60	typedef Eigen::ThreadPoolDevice CPUDevice;
61
62	template <typename T>
63	struct PoissonComputeType {
64	typedef double ComputeType;
65	};
66
67	} // namespace
68
69	namespace functor {
70
71	template <typename T, typename U>
72	struct PoissonFunctor<CPUDevice, T, U> {
73	void operator()(OpKernelContext* ctx, const CPUDevice& d, const T* rate_flat,
74	int num_rate, int num_samples,
75	const random::PhiloxRandom& rng, U* samples_flat) {
76	// Two different algorithms are employed, depending on the size of
77	// rate.
78	// If rate < 10, we use an algorithm attributed to Knuth:
79	// Seminumerical Algorithms. Art of Computer Programming, Volume 2.
80	//
81	// This algorithm runs in O(rate) time, and will require O(rate)
82	// uniform variates.
83	//
84	// If rate >= 10 we use a transformation-rejection algorithm from
85	// pairs of uniform random variables due to Hormann.
86	// http://www.sciencedirect.com/science/article/pii/0167668793909974
87	//
88	// The algorithm has an acceptance rate of ~89% for the smallest rate
89	// (~10),
90	// and higher accept rates for higher rate, so runtime is
91	// O(NumRate NumSamples * k) with k ~ 1 / 0.89.*
92	//
93	// We partition work first across rates then across
94	// samples-per-rate to
95	// avoid a couple flops which can be done on a per-rate basis.
96
97	typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
98
99	auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
100	int64_t start_output, int64_t limit_output) {
101	// Capturing "rng" by value would only make a copy for the _shared_
102	// lambda. Since we want to let each worker have its own copy, we pass
103	// "rng" by reference and explicitly do a copy assignment.
104
105	Uniform uniform;
106	typename Uniform::ResultType uniform_result;
107	for (int64_t output_idx = start_output; output_idx < limit_output;
108	/ output_idx incremented within inner loop below /) {
109	const int64_t rate_idx = output_idx / num_samples;
110
111	// Several calculations can be done on a per-rate basis.
112	const CT rate = CT(rate_flat[rate_idx]);
113
114	auto samples_rate_output = samples_flat + rate_idx;
115
116	if (rate < CT(`10`)) {
117	// Knuth's algorithm for generating Poisson random variates.
118	// Given a Poisson process, the time between events is exponentially
119	// distributed. If we have a Poisson process with rate lambda, then,
120	// the time between events is distributed Exp(lambda). If X ~
121	// Uniform(0, 1), then Y ~ Exp(lambda), where Y = -log(X) / lambda.
122	// Thus to simulate a Poisson draw, we can draw X_i ~ Exp(lambda),
123	// and N ~ Poisson(lambda), where N is the least number such that
124	// \sum_i^N X_i > 1.
125	const CT exp_neg_rate = Eigen::numext::exp(-rate);
126
127	// Compute the rest of the samples for the current rate value.
128	for (int64_t sample_idx = output_idx % num_samples;
129	sample_idx < num_samples && output_idx < limit_output;
130	sample_idx++, output_idx++) {
131	random::PhiloxRandom gen = rng;
132	gen.Skip(kReservedSamplesPerOutput * output_idx);
133	int16_t uniform_remaining = `0`;
134
135	CT prod = `1`;
136	CT x = `0`;
137
138	// Keep trying until we surpass e^(-rate). This will take
139	// expected time proportional to rate.
140	while (true) {
141	UNIFORM(u);
142	prod = prod * u;
143	if (prod <= exp_neg_rate &&
144	x <= CT(Eigen::NumTraits<U>::highest())) {
145	samples_rate_output[sample_idx * num_rate] = U(x);
146	break;
147	}
148	x += `1`;
149	}
150	}
151	continue;
152	}
153	if (Eigen::numext::isinf(rate) && rate > CT(`0`)) {
154	// Fill the rest of the samples for the current rate value.
155	for (int64_t sample_idx = output_idx % num_samples;
156	sample_idx < num_samples && output_idx < limit_output;
157	sample_idx++, output_idx++) {
158	U k = Eigen::NumTraits<U>::infinity();
159	samples_rate_output[sample_idx * num_rate] = k;
160	}
161	continue;
162	}
163	// Transformed rejection due to Hormann.
164	//
165	// Given a CDF F(x), and G(x), a dominating distribution chosen such
166	// that it is close to the inverse CDF F^-1(x), compute the following
167	// steps:
168	//
169	// 1) Generate U and V, two independent random variates. Set U = U - 0.5
170	// (this step isn't strictly necessary, but is done to make some
171	// calculations symmetric and convenient. Henceforth, G is defined on
172	// [-0.5, 0.5]).
173	//
174	// 2) If V <= alpha F'(G(U)) * G'(U), return floor(G(U)), else return*
175	// to step 1. alpha is the acceptance probability of the rejection
176	// algorithm.
177	//
178	// For more details on transformed rejection, see:
179	// http://citeseer.ist.psu.edu/viewdoc/citations;jsessionid=1BEB35946CC807879F55D42512E5490C?doi=10.1.1.48.3054.
180	//
181	// The dominating distribution in this case:
182	//
183	// G(u) = (2 a / (2 - \|u\|) + b) * u + c*
184
185	using Eigen::numext::log;
186	const CT log_rate = log(rate);
187
188	// Constants used to define the dominating distribution. Names taken
189	// from Hormann's paper. Constants were chosen to define the tightest
190	// G(u) for the inverse Poisson CDF.
191	const CT b = CT(`0.931`) + CT(`2.53`) * Eigen::numext::sqrt(rate);
192	const CT a = CT(-`0.059`) + CT(`0.02483`) * b;
193
194	// This is the inverse acceptance rate. At a minimum (when rate = 10),
195	// this corresponds to ~75% acceptance. As the rate becomes larger, this
196	// approaches ~89%.
197	const CT inv_alpha = CT(`1.1239`) + CT(`1.1328`) / (b - CT(`3.4`));
198
199	// Compute the rest of the samples for the current rate value.
200	for (int64_t sample_idx = output_idx % num_samples;
201	sample_idx < num_samples && output_idx < limit_output;
202	sample_idx++, output_idx++) {
203	random::PhiloxRandom gen = rng;
204	gen.Skip(kReservedSamplesPerOutput * output_idx);
205	int16_t uniform_remaining = `0`;
206
207	while (true) {
208	UNIFORM(u);
209	u -= CT(`0.5`);
210	UNIFORM(v);
211
212	CT u_shifted = CT(`0.5`) - Eigen::numext::abs(u);
213	CT k = Eigen::numext::floor((CT(`2`) * a / u_shifted + b) * u + rate +
214	CT(`0.43`));
215
216	if (k > CT(Eigen::NumTraits<U>::highest())) {
217	// retry in case of overflow.
218	continue;
219	}
220
221	// When alpha f(G(U)) * G'(U) is close to 1, it is possible to*
222	// find a rectangle (-u_r, u_r) x (0, v_r) under the curve, such
223	// that if v <= v_r and \|u\| <= u_r, then we can accept.
224	// Here v_r = 0.9227 - 3.6224 / (b - 2) and u_r = 0.43.
225	if (u_shifted >= CT(`0.07`) &&
226	v <= CT(`0.9277`) - CT(`3.6224`) / (b - CT(`2`))) {
227	samples_rate_output[sample_idx * num_rate] = U(k);
228	break;
229	}
230
231	if (k < `0` \|\| (u_shifted < CT(`0.013`) && v > u_shifted)) {
232	continue;
233	}
234
235	// The expression below is equivalent to the computation of step 2)
236	// in transformed rejection (v <= alpha F'(G(u)) * G'(u)).*
237	CT s = log(v * inv_alpha / (a / (u_shifted * u_shifted) + b));
238	CT t = -rate + k * log_rate - Eigen::numext::lgamma(k + `1`);
239	if (s <= t) {
240	samples_rate_output[sample_idx * num_rate] = U(k);
241	break;
242	}
243	}
244	}
245	}
246	};
247
248	// This will depend on rate.
249	// For rate < 10, on average, O(rate) calls to uniform are
250	// needed, with that
251	// many multiplies. ~10 uniform calls on average with ~25 cost op calls.
252	//
253	// Very roughly, for rate >= 10, the single call to log + call to
254	// lgamma
255	// occur for ~60 percent of samples.
256	// 2 x 100 (64-bit cycles per log) 0.62 = ~124*
257	// Additionally, there are ~10 other ops (+, , /, ...) at 3-6 cycles each:*
258	// 40 .62 = ~25.*
259	//
260	// Finally, there are several other ops that are done every loop along with
261	// 2 uniform generations along with 5 other ops at 3-6 cycles each.
262	// ~15 / .89 = ~16
263	//
264	// In total this should be ~165 + 2 Uniform::kElementCost.*
265	// We assume that half the tensor has rate < 10, so on average 6
266	// uniform's
267	// will be needed. We will upper bound the other op cost by the one for
268	// rate > 10.
269	static const int kElementCost = `165` + `6` * Uniform::kElementCost +
270	`6` * random::PhiloxRandom::kElementCost;
271	auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
272	Shard(worker_threads.num_threads, worker_threads.workers,
273	num_rate * num_samples, kElementCost, DoWork);
274	}
275
276	private:
277	typedef typename PoissonComputeType<T>::ComputeType CT;
278	};
279
280	} // namespace functor
281
282	namespace {
283
284	// Samples from one or more Poisson distributions.
285	template <typename T, typename U>
286	class RandomPoissonOp : public OpKernel {
287	public:
288	explicit RandomPoissonOp(OpKernelConstruction* context) : OpKernel(context) {
289	OP_REQUIRES_OK(context, generator_.Init(context));
290	}
291
292	void Compute(OpKernelContext* ctx) override {
293	const Tensor& shape_t = ctx->input(`0`);
294	const Tensor& rate_t = ctx->input(`1`);
295
296	TensorShape samples_shape;
297	OP_REQUIRES_OK(ctx, tensor::MakeShape(shape_t, &samples_shape));
298	const int64_t num_samples = samples_shape.num_elements();
299	OP_REQUIRES_OK(ctx, samples_shape.AppendShapeWithStatus(rate_t.shape()));
300
301	// Allocate output samples.
302	Tensor* samples_t = nullptr;
303	OP_REQUIRES_OK(ctx, ctx->allocate_output(`0`, samples_shape, &samples_t));
304	if (num_samples == `0`) return;
305
306	const auto rate_flat = rate_t.flat<T>().data();
307	const int64_t num_rate = rate_t.NumElements();
308	auto samples_flat = samples_t->flat<U>().data();
309	random::PhiloxRandom rng = generator_.ReserveRandomOutputs(
310	num_samples * num_rate, kReservedSamplesPerOutput);
311
312	functor::PoissonFunctor<CPUDevice, T, U>()(
313	ctx, ctx->eigen_device<CPUDevice>(), rate_flat, num_rate, num_samples,
314	rng, samples_flat);
315	}
316
317	private:
318	GuardedPhiloxRandom generator_;
319
320	TF_DISALLOW_COPY_AND_ASSIGN(RandomPoissonOp);
321	};
322	} // namespace
323
324	#undef UNIFORM
325
326	#define REGISTER(TYPE) \
327	REGISTER_KERNEL_BUILDER( \
328	Name("RandomPoisson").Device(DEVICE_CPU).TypeConstraint<TYPE>("dtype"), \
329	RandomPoissonOp<TYPE, TYPE>);
330
331	TF_CALL_half(REGISTER);
332	TF_CALL_float(REGISTER);
333	TF_CALL_double(REGISTER);
334
335	#define REGISTER_V2(RTYPE, OTYPE) \
336	template struct functor::PoissonFunctor<CPUDevice, RTYPE, OTYPE>; \
337	REGISTER_KERNEL_BUILDER(Name("RandomPoissonV2") \
338	.Device(DEVICE_CPU) \
339	.TypeConstraint<RTYPE>("R") \
340	.TypeConstraint<OTYPE>("dtype"), \
341	RandomPoissonOp<RTYPE, OTYPE>);
342
343	#define REGISTER_ALL(RTYPE) \
344	REGISTER_V2(RTYPE, Eigen::half); \
345	REGISTER_V2(RTYPE, float); \
346	REGISTER_V2(RTYPE, double); \
347	REGISTER_V2(RTYPE, int32); \
348	REGISTER_V2(RTYPE, int64_t);
349
350	REGISTER_ALL(Eigen::half);
351	REGISTER_ALL(float);
352	REGISTER_ALL(double);
353	REGISTER_ALL(int32);
354	REGISTER_ALL(int64_t);
355
356	#undef REGISTER_ALL
357	#undef REGISTER_V2
358	#undef REGISTER
359
360	} // end namespace tensorflow
361

Browse the source code of tensorflow/tensorflow/core/kernels/random_poisson_op.cc