1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
12 | implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | ==============================================================================*/ |
16 | |
17 | // See docs in ../ops/math_ops.cc |
18 | |
19 | #define EIGEN_USE_THREADS |
20 | |
21 | #include <bitset> |
22 | |
23 | #include "tensorflow/core/kernels/population_count_op.h" |
24 | |
25 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
26 | #include "tensorflow/core/framework/op_kernel.h" |
27 | #include "tensorflow/core/framework/register_types.h" |
28 | #include "tensorflow/core/framework/tensor.h" |
29 | #include "tensorflow/core/framework/tensor_shape.h" |
30 | #include "tensorflow/core/framework/types.h" |
31 | #include "tensorflow/core/lib/core/status.h" |
32 | #include "tensorflow/core/util/work_sharder.h" |
33 | |
34 | namespace tensorflow { |
35 | |
// Device tags used to select the functor specializations and kernel
// registrations below.
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
38 | |
39 | template <typename Device, typename T> |
40 | class PopulationCountOp : public OpKernel { |
41 | public: |
42 | explicit PopulationCountOp(OpKernelConstruction* context) |
43 | : OpKernel(context) {} |
44 | |
45 | void Compute(OpKernelContext* c) override { |
46 | const Tensor& input_t = c->input(0); |
47 | Tensor* output_t; |
48 | OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t)); |
49 | |
50 | auto input = input_t.flat<T>(); |
51 | auto output = output_t->flat<uint8>(); |
52 | |
53 | functor::PopulationCount<Device, T> popcnt; |
54 | popcnt(c, input, output); |
55 | } |
56 | }; |
57 | |
// Registers the CPU kernel for every supported integer type: `type` is bound
// to the op's "T" attr and dispatched to PopulationCountOp<CPUDevice, type>.
#define REGISTER_POPULATION_COUNT(type)                                    \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("PopulationCount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      PopulationCountOp<CPUDevice, type>);

TF_CALL_uint8(REGISTER_POPULATION_COUNT);
TF_CALL_int8(REGISTER_POPULATION_COUNT);
TF_CALL_uint16(REGISTER_POPULATION_COUNT);
TF_CALL_int16(REGISTER_POPULATION_COUNT);
TF_CALL_int32(REGISTER_POPULATION_COUNT);
TF_CALL_uint32(REGISTER_POPULATION_COUNT);
TF_CALL_int64(REGISTER_POPULATION_COUNT);
TF_CALL_uint64(REGISTER_POPULATION_COUNT);

#undef REGISTER_POPULATION_COUNT
73 | |
74 | namespace functor { |
75 | |
76 | namespace { |
77 | |
78 | template <typename T> |
79 | inline uint8 PopCnt(const T v); |
80 | |
81 | #define POPCNT(T, N) \ |
82 | template <> \ |
83 | uint8 PopCnt<T>(const T v) { \ |
84 | return std::bitset<N>(v).count(); \ |
85 | } |
86 | |
87 | POPCNT(int8_t, 8); |
88 | POPCNT(uint8, 8); |
89 | POPCNT(int16_t, 16); |
90 | POPCNT(uint16, 16); |
91 | POPCNT(int32_t, 32); |
92 | POPCNT(uint32, 32); |
93 | POPCNT(int64_t, 64); |
94 | POPCNT(uint64, 64); |
95 | |
96 | #undef POPCNT |
97 | |
98 | } // namespace |
99 | |
100 | template <typename T> |
101 | struct PopulationCount<CPUDevice, T> { |
102 | void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input, |
103 | TTypes<uint8>::Flat output) { |
104 | const T* input_ptr = input.data(); |
105 | uint8* output_ptr = output.data(); |
106 | auto shard = [input_ptr, output_ptr](int64_t start, int64_t limit) { |
107 | for (int64_t i = start; i < limit; ++i) { |
108 | output_ptr[i] = PopCnt<T>(input_ptr[i]); |
109 | } |
110 | }; |
111 | int64_t total_shards = input.size(); |
112 | // Approximating cost of popcnt: convert T to int64 |
113 | // (std::bitset constructor) and convert int64 to uint8 |
114 | // (bitset.count() -> output). The .count() itself is relatively cheap. |
115 | const double total_cost = (Eigen::TensorOpCost::CastCost<T, uint8>() + |
116 | Eigen::TensorOpCost::CastCost<int64_t, uint8>()); |
117 | const int64_t shard_cost = (total_cost >= static_cast<double>(kint64max)) |
118 | ? kint64max |
119 | : static_cast<int64_t>(total_cost); |
120 | |
121 | auto worker_threads = *(c->device()->tensorflow_cpu_worker_threads()); |
122 | Shard(worker_threads.num_threads, worker_threads.workers, total_shards, |
123 | shard_cost, shard); |
124 | } |
125 | }; |
126 | |
127 | } // namespace functor |
128 | |
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Registers the GPU kernel for each supported type.  Note that uint32 and
// uint64 are registered only for CPU above, not here.
#define REGISTER_POPULATION_COUNT(type)                                    \
  REGISTER_KERNEL_BUILDER(                                                 \
      Name("PopulationCount").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
      PopulationCountOp<GPUDevice, type>)

TF_CALL_uint8(REGISTER_POPULATION_COUNT);
TF_CALL_int8(REGISTER_POPULATION_COUNT);
TF_CALL_uint16(REGISTER_POPULATION_COUNT);
TF_CALL_int16(REGISTER_POPULATION_COUNT);
TF_CALL_int32(REGISTER_POPULATION_COUNT);
TF_CALL_int64(REGISTER_POPULATION_COUNT);

#undef REGISTER_POPULATION_COUNT

namespace functor {

// Declares the GPU functor specializations; `extern template` tells the
// compiler the instantiations exist in another translation unit (presumably
// the corresponding .cu.cc file — not visible here).
#define DECLARE_GPU_SPEC(T)                                  \
  template <>                                                \
  void PopulationCount<GPUDevice, T>::operator()(            \
      OpKernelContext* c, typename TTypes<T>::ConstFlat input, \
      TTypes<uint8>::Flat output);                           \
  extern template struct PopulationCount<GPUDevice, T>

TF_CALL_uint8(DECLARE_GPU_SPEC);
TF_CALL_int8(DECLARE_GPU_SPEC);
TF_CALL_uint16(DECLARE_GPU_SPEC);
TF_CALL_int16(DECLARE_GPU_SPEC);
TF_CALL_int32(DECLARE_GPU_SPEC);
TF_CALL_int64(DECLARE_GPU_SPEC);

#undef DECLARE_GPU_SPEC

}  // namespace functor

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
166 | |
167 | } // namespace tensorflow |
168 | |