1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ |
17 | #define TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ |
18 | |
#include <array>
#include <atomic>
#include <functional>
#include <map>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"
30 | |
31 | namespace tensorflow { |
32 | |
33 | class PoolAllocator; |
34 | |
35 | // Singleton that manages per-process state, e.g. allocation of |
36 | // shared resources. |
37 | class ProcessState : public ProcessStateInterface { |
38 | public: |
39 | static ProcessState* singleton(); |
40 | |
41 | // Descriptor for memory allocation attributes, used by optional |
42 | // runtime correctness analysis logic. |
43 | struct MemDesc { |
44 | enum MemLoc { CPU, GPU }; |
45 | MemLoc loc; |
46 | int dev_index; |
47 | bool gpu_registered; |
48 | bool nic_registered; |
49 | MemDesc() |
50 | : loc(CPU), |
51 | dev_index(0), |
52 | gpu_registered(false), |
53 | nic_registered(false) {} |
54 | string DebugString(); |
55 | }; |
56 | |
57 | // If NUMA Allocators are desired, call this before calling any |
58 | // Allocator accessor. |
59 | void EnableNUMA() { numa_enabled_ = true; } |
60 | |
61 | // Returns what we know about the memory at ptr. |
62 | // If we know nothing, it's called CPU 0 with no other attributes. |
63 | MemDesc PtrType(const void* ptr); |
64 | |
65 | // Returns the one CPUAllocator used for the given numa_node. |
66 | // Treats numa_node == kNUMANoAffinity as numa_node == 0. |
67 | Allocator* GetCPUAllocator(int numa_node) override; |
68 | |
69 | // Registers alloc visitor for the CPU allocator(s). |
70 | // REQUIRES: must be called before GetCPUAllocator. |
71 | void AddCPUAllocVisitor(SubAllocator::Visitor v); |
72 | |
73 | // Registers free visitor for the CPU allocator(s). |
74 | // REQUIRES: must be called before GetCPUAllocator. |
75 | void AddCPUFreeVisitor(SubAllocator::Visitor v); |
76 | |
77 | typedef std::unordered_map<const void*, MemDesc> MDMap; |
78 | |
79 | protected: |
80 | ProcessState(); |
81 | virtual ~ProcessState() {} |
82 | friend class GPUProcessState; |
83 | friend class PluggableDeviceProcessState; |
84 | |
85 | // If these flags need to be runtime configurable consider adding |
86 | // them to ConfigProto. |
87 | static constexpr bool FLAGS_brain_mem_reg_gpu_dma = true; |
88 | static constexpr bool FLAGS_brain_gpu_record_mem_types = false; |
89 | |
90 | // Helper method for unit tests to reset the ProcessState singleton by |
91 | // cleaning up everything. Never use in production. |
92 | void TestOnlyReset(); |
93 | |
94 | static ProcessState* instance_; |
95 | bool numa_enabled_; |
96 | |
97 | mutex mu_; |
98 | |
99 | // Indexed by numa_node. If we want numa-specific allocators AND a |
100 | // non-specific allocator, maybe should index by numa_node+1. |
101 | std::vector<Allocator*> cpu_allocators_ TF_GUARDED_BY(mu_); |
102 | std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_); |
103 | std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_); |
104 | |
105 | // A cache of cpu allocators indexed by a numa node. Used as a fast path to |
106 | // get CPU allocator by numa node id without locking the mutex. We can't use |
107 | // `cpu_allocators_` storage in the lock-free path because concurrent |
108 | // operation can deallocate the vector storage. |
109 | std::atomic<int> cpu_allocators_cached_; |
110 | std::array<Allocator*, 8> cpu_allocators_cache_; |
111 | |
112 | // Optional RecordingAllocators that wrap the corresponding |
113 | // Allocators for runtime attribute use analysis. |
114 | MDMap mem_desc_map_; |
115 | std::vector<Allocator*> cpu_al_ TF_GUARDED_BY(mu_); |
116 | }; |
117 | |
118 | namespace internal { |
119 | class RecordingAllocator : public Allocator { |
120 | public: |
121 | RecordingAllocator(ProcessState::MDMap* mm, Allocator* a, |
122 | ProcessState::MemDesc md, mutex* mu) |
123 | : mm_(mm), a_(a), md_(md), mu_(mu) {} |
124 | |
125 | string Name() override { return a_->Name(); } |
126 | void* AllocateRaw(size_t alignment, size_t num_bytes) override { |
127 | void* p = a_->AllocateRaw(alignment, num_bytes); |
128 | mutex_lock l(*mu_); |
129 | (*mm_)[p] = md_; |
130 | return p; |
131 | } |
132 | void DeallocateRaw(void* p) override { |
133 | mutex_lock l(*mu_); |
134 | auto iter = mm_->find(p); |
135 | mm_->erase(iter); |
136 | a_->DeallocateRaw(p); |
137 | } |
138 | bool TracksAllocationSizes() const override { |
139 | return a_->TracksAllocationSizes(); |
140 | } |
141 | size_t RequestedSize(const void* p) const override { |
142 | return a_->RequestedSize(p); |
143 | } |
144 | size_t AllocatedSize(const void* p) const override { |
145 | return a_->AllocatedSize(p); |
146 | } |
147 | absl::optional<AllocatorStats> GetStats() override { return a_->GetStats(); } |
148 | bool ClearStats() override { return a_->ClearStats(); } |
149 | |
150 | AllocatorMemoryType GetMemoryType() const override { |
151 | return a_->GetMemoryType(); |
152 | } |
153 | |
154 | ProcessState::MDMap* mm_; // not owned |
155 | Allocator* a_; // not owned |
156 | ProcessState::MemDesc md_; |
157 | mutex* mu_; |
158 | }; |
159 | } // namespace internal |
160 | } // namespace tensorflow |
161 | #endif // TENSORFLOW_CORE_COMMON_RUNTIME_PROCESS_STATE_H_ |
162 | |