1 | #pragma once |
2 | |
3 | #include <algorithm> |
4 | #include <deque> |
5 | #include <memory> |
6 | #include <mutex> |
7 | |
8 | #include <c10/util/Exception.h> |
9 | #include <c10/util/SmallVector.h> |
10 | #include <c10/util/flat_hash_map.h> |
11 | |
12 | /* |
13 | * CPUCachingAllocator: |
14 | * DISCLAIMER: |
15 | * This is subject to change (beta) and only supported on mobile builds. |
 * If a code snippet such as the one in 'Usage pattern' is used outside of
 * a mobile build you will not observe the intended behavior.
18 | * See below for more information. |
19 | * Why? |
20 | * It has been observed that some mobile platforms, such as pixel 3, return |
21 | * memory aggressively to the system. This results in page faults in some |
22 | * cases and ends up hurting performance. This caching allocator aims to address |
23 | * that. Furthermore it also allows users to specify their own allocator by |
24 | * implementing allocate/free virtual interfaces. What are the cons? There are |
25 | * some cons that were observed where use of caching allocator led to worse |
26 | * performance on some platforms. Reason being that the caching mechanism used |
27 | * by this allocator left us worse off compared to the corresponding platform's |
28 | * tuned memory allocator. In that case it seemed better to not use this |
29 | * allocator. Note there are some ideas to fix this in the works. |
30 | * |
31 | * Usage: |
32 | * Usage pattern: |
33 | * Instantiate and own the caching allocator. |
34 | * std::unique_ptr<c10::CPUCachingAllocator> caching_allocator = |
35 | * std::make_unique<c10::CPUCachingAllocator>(); |
36 | * Use caching allocator with a scoped guard at inference time. |
37 | * { |
 *   WithCPUCachingAllocatorGuard guard(caching_allocator.get());
39 | * ... model.forward(...); |
40 | * } |
41 | */ |
42 | |
43 | namespace c10 { |
44 | |
class C10_API CPUCachingAllocator {
  /*
   * What it does:
   * Caches all the allocations carried out by this allocator.
   * Cache key is the size of the allocation.
   * If requested size is found in the cache returns the cached pointer.
   * What it does not do:
   * No speculative allocation for any future allocations.
   */
 private:
  // Allocates `bytes` from the system and records the (ptr -> size)
  // mapping so a later free() can recognize and cache the pointer.
  inline void* allocate_and_cache(const size_t bytes);
  // Returns all cached (currently unused) memory back to the OS.
  void free_cached();

 protected:
  // Invariants.
  // 1. If memory is ever allocated via this allocator then
  //    the pointer will exist in allocation_map_, unless the allocator
  //    returned the memory to OS via free_cached.
  //  1.1. Therefore even when the said memory is "freed" via this
  //       allocator (and thus cached), it will continue to stay
  //       in allocation_map_. Furthermore it will also exist in
  //       available_map_. Thus an allocated memory pointer can be in both
  //       allocation_map_ and available_map_ simultaneously.
  // 2. Memory pointer maybe removed from allocation_map_, when it
  //    is freed outside of the scope of this allocator, but was allocated
  //    by this allocator.
  // 3. Available map only contains that memory which was allocated
  //    by this allocator and subsequently freed by this allocator.
  // As a result of above invariants, allocated memory ptr cannot be in
  // available_map_ unless it is in allocation_map_ as well.
  //
  // Free list: allocation size -> cached pointers of that exact size
  // that are available for reuse. Per-instance (not static).
  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
  // Global (shared across all allocator instances) record of every
  // live pointer this allocator family handed out, keyed by pointer.
  static ska::flat_hash_map<void*, size_t> allocation_map_;
  // Since allocation_map, which is a global instance, is mutated/read via
  // all public APIs we need a global mutex.
  static std::mutex mutex_;

 public:
  // Notes that `ptr` was released outside of this allocator (see
  // invariant 2): removes it from the global allocation_map_.
  static void record_free(void* ptr);
  virtual ~CPUCachingAllocator();
  // Checks the cache to see if allocation of size bytes can be found.
  // If so return cached memory, else
  // allocates memory, records it for caching and returns.
  virtual void* allocate(const size_t bytes);
  // Checks if the memory being freed was marked for allocation by
  // an earlier call to allocate. If so cache the allocation.
  // Otherwise free.
  virtual void free(void* ptr);
};
93 | |
94 | CPUCachingAllocator* GetDefaultCPUCachingAllocator(); |
95 | |
96 | bool ThreadLocalCachingAllocatorEnabled(); |
97 | CPUCachingAllocator* GetThreadLocalCachingAllocator(); |
98 | |
// RAII guard that installs `allocator` as the active caching allocator for
// the enclosing scope and restores the previously active allocator on
// destruction (see the usage example at the top of this file).
// NOTE(review): single-argument constructor is not `explicit`; consider
// marking it explicit to prevent accidental implicit conversions.
class C10_API WithCPUCachingAllocatorGuard {
 public:
  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
  ~WithCPUCachingAllocatorGuard();

 private:
  // Allocator that was active before this guard was constructed;
  // restored by the destructor.
  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
};
107 | |
108 | } // namespace c10 |
109 | |