1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15==============================================================================*/
16
17// See docs in ../ops/math_ops.cc
18
19#define EIGEN_USE_THREADS
20
21#include <bitset>
22
23#include "tensorflow/core/kernels/population_count_op.h"
24
25#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26#include "tensorflow/core/framework/op_kernel.h"
27#include "tensorflow/core/framework/register_types.h"
28#include "tensorflow/core/framework/tensor.h"
29#include "tensorflow/core/framework/tensor_shape.h"
30#include "tensorflow/core/framework/types.h"
31#include "tensorflow/core/lib/core/status.h"
32#include "tensorflow/core/util/work_sharder.h"
33
34namespace tensorflow {
35
36typedef Eigen::ThreadPoolDevice CPUDevice;
37typedef Eigen::GpuDevice GPUDevice;
38
39template <typename Device, typename T>
40class PopulationCountOp : public OpKernel {
41 public:
42 explicit PopulationCountOp(OpKernelConstruction* context)
43 : OpKernel(context) {}
44
45 void Compute(OpKernelContext* c) override {
46 const Tensor& input_t = c->input(0);
47 Tensor* output_t;
48 OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t));
49
50 auto input = input_t.flat<T>();
51 auto output = output_t->flat<uint8>();
52
53 functor::PopulationCount<Device, T> popcnt;
54 popcnt(c, input, output);
55 }
56};
57
// Registers the CPU kernel of PopulationCount for a single element type.
#define REGISTER_POPULATION_COUNT(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                   \
      Name("PopulationCount").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
      PopulationCountOp<CPUDevice, type>);

// All 8/16/32/64-bit signed and unsigned integer types are supported on CPU.
TF_CALL_uint8(REGISTER_POPULATION_COUNT);
TF_CALL_int8(REGISTER_POPULATION_COUNT);
TF_CALL_uint16(REGISTER_POPULATION_COUNT);
TF_CALL_int16(REGISTER_POPULATION_COUNT);
TF_CALL_int32(REGISTER_POPULATION_COUNT);
TF_CALL_uint32(REGISTER_POPULATION_COUNT);
TF_CALL_int64(REGISTER_POPULATION_COUNT);
TF_CALL_uint64(REGISTER_POPULATION_COUNT);

#undef REGISTER_POPULATION_COUNT
73
74namespace functor {
75
76namespace {
77
78template <typename T>
79inline uint8 PopCnt(const T v);
80
81#define POPCNT(T, N) \
82 template <> \
83 uint8 PopCnt<T>(const T v) { \
84 return std::bitset<N>(v).count(); \
85 }
86
87POPCNT(int8_t, 8);
88POPCNT(uint8, 8);
89POPCNT(int16_t, 16);
90POPCNT(uint16, 16);
91POPCNT(int32_t, 32);
92POPCNT(uint32, 32);
93POPCNT(int64_t, 64);
94POPCNT(uint64, 64);
95
96#undef POPCNT
97
98} // namespace
99
100template <typename T>
101struct PopulationCount<CPUDevice, T> {
102 void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input,
103 TTypes<uint8>::Flat output) {
104 const T* input_ptr = input.data();
105 uint8* output_ptr = output.data();
106 auto shard = [input_ptr, output_ptr](int64_t start, int64_t limit) {
107 for (int64_t i = start; i < limit; ++i) {
108 output_ptr[i] = PopCnt<T>(input_ptr[i]);
109 }
110 };
111 int64_t total_shards = input.size();
112 // Approximating cost of popcnt: convert T to int64
113 // (std::bitset constructor) and convert int64 to uint8
114 // (bitset.count() -> output). The .count() itself is relatively cheap.
115 const double total_cost = (Eigen::TensorOpCost::CastCost<T, uint8>() +
116 Eigen::TensorOpCost::CastCost<int64_t, uint8>());
117 const int64_t shard_cost = (total_cost >= static_cast<double>(kint64max))
118 ? kint64max
119 : static_cast<int64_t>(total_cost);
120
121 auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads());
122 Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
123 shard_cost, shard);
124 }
125};
126
127} // namespace functor
128
129#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
130
// Registers the GPU kernel of PopulationCount for a single element type.
#define REGISTER_POPULATION_COUNT(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                   \
      Name("PopulationCount").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
      PopulationCountOp<GPUDevice, type>)

// Note: unlike the CPU registrations above, uint32/uint64 are not registered
// for GPU — only the types with a DECLARE_GPU_SPEC below.
TF_CALL_uint8(REGISTER_POPULATION_COUNT);
TF_CALL_int8(REGISTER_POPULATION_COUNT);
TF_CALL_uint16(REGISTER_POPULATION_COUNT);
TF_CALL_int16(REGISTER_POPULATION_COUNT);
TF_CALL_int32(REGISTER_POPULATION_COUNT);
TF_CALL_int64(REGISTER_POPULATION_COUNT);

#undef REGISTER_POPULATION_COUNT
144
145namespace functor {
146
// Forward-declares the GPU functor specializations, whose definitions are
// compiled in the companion .cu.cc file; `extern template` suppresses
// implicit instantiation in this translation unit.
#define DECLARE_GPU_SPEC(T)                                        \
  template <>                                                      \
  void PopulationCount<GPUDevice, T>::operator()(                  \
      OpKernelContext* c, typename TTypes<T>::ConstFlat input,     \
      TTypes<uint8>::Flat output);                                 \
  extern template struct PopulationCount<GPUDevice, T>

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_uint16(DECLARE_GPU_SPEC);
TF_CALL_int16(DECLARE_GPU_SPEC);
TF_CALL_int32(DECLARE_GPU_SPEC);
TF_CALL_int64(DECLARE_GPU_SPEC);

#undef DECLARE_GPU_SPEC
162
163} // namespace functor
164
165#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
166
167} // namespace tensorflow
168