1// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5#include "leveldb/table.h"
6
7#include "leveldb/cache.h"
8#include "leveldb/comparator.h"
9#include "leveldb/env.h"
10#include "leveldb/filter_policy.h"
11#include "leveldb/options.h"
12#include "table/block.h"
13#include "table/filter_block.h"
14#include "table/format.h"
15#include "table/two_level_iterator.h"
16#include "util/coding.h"
17
18namespace leveldb {
19
20struct Table::Rep {
21 ~Rep() {
22 delete filter;
23 delete[] filter_data;
24 delete index_block;
25 }
26
27 Options options;
28 Status status;
29 RandomAccessFile* file;
30 uint64_t cache_id;
31 FilterBlockReader* filter;
32 const char* filter_data;
33
34 BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer
35 Block* index_block;
36};
37
38Status Table::Open(const Options& options, RandomAccessFile* file,
39 uint64_t size, Table** table) {
40 *table = nullptr;
41 if (size < Footer::kEncodedLength) {
42 return Status::Corruption("file is too short to be an sstable");
43 }
44
45 char footer_space[Footer::kEncodedLength];
46 Slice footer_input;
47 Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
48 &footer_input, footer_space);
49 if (!s.ok()) return s;
50
51 Footer footer;
52 s = footer.DecodeFrom(&footer_input);
53 if (!s.ok()) return s;
54
55 // Read the index block
56 BlockContents index_block_contents;
57 ReadOptions opt;
58 if (options.paranoid_checks) {
59 opt.verify_checksums = true;
60 }
61 s = ReadBlock(file, opt, footer.index_handle(), &index_block_contents);
62
63 if (s.ok()) {
64 // We've successfully read the footer and the index block: we're
65 // ready to serve requests.
66 Block* index_block = new Block(index_block_contents);
67 Rep* rep = new Table::Rep;
68 rep->options = options;
69 rep->file = file;
70 rep->metaindex_handle = footer.metaindex_handle();
71 rep->index_block = index_block;
72 rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
73 rep->filter_data = nullptr;
74 rep->filter = nullptr;
75 *table = new Table(rep);
76 (*table)->ReadMeta(footer);
77 }
78
79 return s;
80}
81
82void Table::ReadMeta(const Footer& footer) {
83 if (rep_->options.filter_policy == nullptr) {
84 return; // Do not need any metadata
85 }
86
87 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
88 // it is an empty block.
89 ReadOptions opt;
90 if (rep_->options.paranoid_checks) {
91 opt.verify_checksums = true;
92 }
93 BlockContents contents;
94 if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
95 // Do not propagate errors since meta info is not needed for operation
96 return;
97 }
98 Block* meta = new Block(contents);
99
100 Iterator* iter = meta->NewIterator(BytewiseComparator());
101 std::string key = "filter.";
102 key.append(rep_->options.filter_policy->Name());
103 iter->Seek(key);
104 if (iter->Valid() && iter->key() == Slice(key)) {
105 ReadFilter(iter->value());
106 }
107 delete iter;
108 delete meta;
109}
110
111void Table::ReadFilter(const Slice& filter_handle_value) {
112 Slice v = filter_handle_value;
113 BlockHandle filter_handle;
114 if (!filter_handle.DecodeFrom(&v).ok()) {
115 return;
116 }
117
118 // We might want to unify with ReadBlock() if we start
119 // requiring checksum verification in Table::Open.
120 ReadOptions opt;
121 if (rep_->options.paranoid_checks) {
122 opt.verify_checksums = true;
123 }
124 BlockContents block;
125 if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
126 return;
127 }
128 if (block.heap_allocated) {
129 rep_->filter_data = block.data.data(); // Will need to delete later
130 }
131 rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
132}
133
134Table::~Table() { delete rep_; }
135
136static void DeleteBlock(void* arg, void* ignored) {
137 delete reinterpret_cast<Block*>(arg);
138}
139
140static void DeleteCachedBlock(const Slice& key, void* value) {
141 Block* block = reinterpret_cast<Block*>(value);
142 delete block;
143}
144
145static void ReleaseBlock(void* arg, void* h) {
146 Cache* cache = reinterpret_cast<Cache*>(arg);
147 Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
148 cache->Release(handle);
149}
150
151// Convert an index iterator value (i.e., an encoded BlockHandle)
152// into an iterator over the contents of the corresponding block.
153Iterator* Table::BlockReader(void* arg, const ReadOptions& options,
154 const Slice& index_value) {
155 Table* table = reinterpret_cast<Table*>(arg);
156 Cache* block_cache = table->rep_->options.block_cache;
157 Block* block = nullptr;
158 Cache::Handle* cache_handle = nullptr;
159
160 BlockHandle handle;
161 Slice input = index_value;
162 Status s = handle.DecodeFrom(&input);
163 // We intentionally allow extra stuff in index_value so that we
164 // can add more features in the future.
165
166 if (s.ok()) {
167 BlockContents contents;
168 if (block_cache != nullptr) {
169 char cache_key_buffer[16];
170 EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
171 EncodeFixed64(cache_key_buffer + 8, handle.offset());
172 Slice key(cache_key_buffer, sizeof(cache_key_buffer));
173 cache_handle = block_cache->Lookup(key);
174 if (cache_handle != nullptr) {
175 block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
176 } else {
177 s = ReadBlock(table->rep_->file, options, handle, &contents);
178 if (s.ok()) {
179 block = new Block(contents);
180 if (contents.cachable && options.fill_cache) {
181 cache_handle = block_cache->Insert(key, block, block->size(),
182 &DeleteCachedBlock);
183 }
184 }
185 }
186 } else {
187 s = ReadBlock(table->rep_->file, options, handle, &contents);
188 if (s.ok()) {
189 block = new Block(contents);
190 }
191 }
192 }
193
194 Iterator* iter;
195 if (block != nullptr) {
196 iter = block->NewIterator(table->rep_->options.comparator);
197 if (cache_handle == nullptr) {
198 iter->RegisterCleanup(&DeleteBlock, block, nullptr);
199 } else {
200 iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
201 }
202 } else {
203 iter = NewErrorIterator(s);
204 }
205 return iter;
206}
207
208Iterator* Table::NewIterator(const ReadOptions& options) const {
209 return NewTwoLevelIterator(
210 rep_->index_block->NewIterator(rep_->options.comparator),
211 &Table::BlockReader, const_cast<Table*>(this), options);
212}
213
214Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg,
215 void (*handle_result)(void*, const Slice&,
216 const Slice&)) {
217 Status s;
218 Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
219 iiter->Seek(k);
220 if (iiter->Valid()) {
221 Slice handle_value = iiter->value();
222 FilterBlockReader* filter = rep_->filter;
223 BlockHandle handle;
224 if (filter != nullptr && handle.DecodeFrom(&handle_value).ok() &&
225 !filter->KeyMayMatch(handle.offset(), k)) {
226 // Not found
227 } else {
228 Iterator* block_iter = BlockReader(this, options, iiter->value());
229 block_iter->Seek(k);
230 if (block_iter->Valid()) {
231 (*handle_result)(arg, block_iter->key(), block_iter->value());
232 }
233 s = block_iter->status();
234 delete block_iter;
235 }
236 }
237 if (s.ok()) {
238 s = iiter->status();
239 }
240 delete iiter;
241 return s;
242}
243
244uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
245 Iterator* index_iter =
246 rep_->index_block->NewIterator(rep_->options.comparator);
247 index_iter->Seek(key);
248 uint64_t result;
249 if (index_iter->Valid()) {
250 BlockHandle handle;
251 Slice input = index_iter->value();
252 Status s = handle.DecodeFrom(&input);
253 if (s.ok()) {
254 result = handle.offset();
255 } else {
256 // Strange: we can't decode the block handle in the index block.
257 // We'll just return the offset of the metaindex block, which is
258 // close to the whole file size for this case.
259 result = rep_->metaindex_handle.offset();
260 }
261 } else {
262 // key is past the last key in the file. Approximate the offset
263 // by returning the offset of the metaindex block (which is
264 // right near the end of the file).
265 result = rep_->metaindex_handle.offset();
266 }
267 delete index_iter;
268 return result;
269}
270
271} // namespace leveldb
272