1 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
4 | // |
5 | // We recover the contents of the descriptor from the other files we find. |
6 | // (1) Any log files are first converted to tables |
7 | // (2) We scan every table to compute |
8 | // (a) smallest/largest for the table |
9 | // (b) largest sequence number in the table |
10 | // (3) We generate descriptor contents: |
11 | // - log number is set to zero |
12 | // - next-file-number is set to 1 + largest file number we found |
13 | // - last-sequence-number is set to largest sequence# found across |
14 | // all tables (see 2c) |
15 | // - compaction pointers are cleared |
16 | // - every table file is added at level 0 |
17 | // |
18 | // Possible optimization 1: |
19 | // (a) Compute total size and use to pick appropriate max-level M |
20 | // (b) Sort tables by largest sequence# in the table |
21 | // (c) For each table: if it overlaps earlier table, place in level-0, |
22 | // else place in level-M. |
23 | // Possible optimization 2: |
24 | // Store per-table metadata (smallest, largest, largest-seq#, ...) |
25 | // in the table's meta section to speed up ScanTable. |
26 | |
27 | #include "db/builder.h" |
28 | #include "db/db_impl.h" |
29 | #include "db/dbformat.h" |
30 | #include "db/filename.h" |
31 | #include "db/log_reader.h" |
32 | #include "db/log_writer.h" |
33 | #include "db/memtable.h" |
34 | #include "db/table_cache.h" |
35 | #include "db/version_edit.h" |
36 | #include "db/write_batch_internal.h" |
37 | #include "leveldb/comparator.h" |
38 | #include "leveldb/db.h" |
39 | #include "leveldb/env.h" |
40 | |
41 | namespace leveldb { |
42 | |
43 | namespace { |
44 | |
45 | class Repairer { |
46 | public: |
47 | Repairer(const std::string& dbname, const Options& options) |
48 | : dbname_(dbname), |
49 | env_(options.env), |
50 | icmp_(options.comparator), |
51 | ipolicy_(options.filter_policy), |
52 | options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), |
53 | owns_info_log_(options_.info_log != options.info_log), |
54 | owns_cache_(options_.block_cache != options.block_cache), |
55 | next_file_number_(1) { |
56 | // TableCache can be small since we expect each table to be opened once. |
57 | table_cache_ = new TableCache(dbname_, options_, 10); |
58 | } |
59 | |
60 | ~Repairer() { |
61 | delete table_cache_; |
62 | if (owns_info_log_) { |
63 | delete options_.info_log; |
64 | } |
65 | if (owns_cache_) { |
66 | delete options_.block_cache; |
67 | } |
68 | } |
69 | |
70 | Status Run() { |
71 | Status status = FindFiles(); |
72 | if (status.ok()) { |
73 | ConvertLogFilesToTables(); |
74 | ExtractMetaData(); |
75 | status = WriteDescriptor(); |
76 | } |
77 | if (status.ok()) { |
78 | unsigned long long bytes = 0; |
79 | for (size_t i = 0; i < tables_.size(); i++) { |
80 | bytes += tables_[i].meta.file_size; |
81 | } |
82 | Log(options_.info_log, |
83 | "**** Repaired leveldb %s; " |
84 | "recovered %d files; %llu bytes. " |
85 | "Some data may have been lost. " |
86 | "****" , |
87 | dbname_.c_str(), static_cast<int>(tables_.size()), bytes); |
88 | } |
89 | return status; |
90 | } |
91 | |
92 | private: |
93 | struct TableInfo { |
94 | FileMetaData meta; |
95 | SequenceNumber max_sequence; |
96 | }; |
97 | |
98 | Status FindFiles() { |
99 | std::vector<std::string> filenames; |
100 | Status status = env_->GetChildren(dbname_, &filenames); |
101 | if (!status.ok()) { |
102 | return status; |
103 | } |
104 | if (filenames.empty()) { |
105 | return Status::IOError(dbname_, "repair found no files" ); |
106 | } |
107 | |
108 | uint64_t number; |
109 | FileType type; |
110 | for (size_t i = 0; i < filenames.size(); i++) { |
111 | if (ParseFileName(filenames[i], &number, &type)) { |
112 | if (type == kDescriptorFile) { |
113 | manifests_.push_back(filenames[i]); |
114 | } else { |
115 | if (number + 1 > next_file_number_) { |
116 | next_file_number_ = number + 1; |
117 | } |
118 | if (type == kLogFile) { |
119 | logs_.push_back(number); |
120 | } else if (type == kTableFile) { |
121 | table_numbers_.push_back(number); |
122 | } else { |
123 | // Ignore other files |
124 | } |
125 | } |
126 | } |
127 | } |
128 | return status; |
129 | } |
130 | |
131 | void ConvertLogFilesToTables() { |
132 | for (size_t i = 0; i < logs_.size(); i++) { |
133 | std::string logname = LogFileName(dbname_, logs_[i]); |
134 | Status status = ConvertLogToTable(logs_[i]); |
135 | if (!status.ok()) { |
136 | Log(options_.info_log, "Log #%llu: ignoring conversion error: %s" , |
137 | (unsigned long long)logs_[i], status.ToString().c_str()); |
138 | } |
139 | ArchiveFile(logname); |
140 | } |
141 | } |
142 | |
143 | Status ConvertLogToTable(uint64_t log) { |
144 | struct LogReporter : public log::Reader::Reporter { |
145 | Env* env; |
146 | Logger* info_log; |
147 | uint64_t lognum; |
148 | void Corruption(size_t bytes, const Status& s) override { |
149 | // We print error messages for corruption, but continue repairing. |
150 | Log(info_log, "Log #%llu: dropping %d bytes; %s" , |
151 | (unsigned long long)lognum, static_cast<int>(bytes), |
152 | s.ToString().c_str()); |
153 | } |
154 | }; |
155 | |
156 | // Open the log file |
157 | std::string logname = LogFileName(dbname_, log); |
158 | SequentialFile* lfile; |
159 | Status status = env_->NewSequentialFile(logname, &lfile); |
160 | if (!status.ok()) { |
161 | return status; |
162 | } |
163 | |
164 | // Create the log reader. |
165 | LogReporter reporter; |
166 | reporter.env = env_; |
167 | reporter.info_log = options_.info_log; |
168 | reporter.lognum = log; |
169 | // We intentionally make log::Reader do checksumming so that |
170 | // corruptions cause entire commits to be skipped instead of |
171 | // propagating bad information (like overly large sequence |
172 | // numbers). |
173 | log::Reader reader(lfile, &reporter, false /*do not checksum*/, |
174 | 0 /*initial_offset*/); |
175 | |
176 | // Read all the records and add to a memtable |
177 | std::string scratch; |
178 | Slice record; |
179 | WriteBatch batch; |
180 | MemTable* mem = new MemTable(icmp_); |
181 | mem->Ref(); |
182 | int counter = 0; |
183 | while (reader.ReadRecord(&record, &scratch)) { |
184 | if (record.size() < 12) { |
185 | reporter.Corruption(record.size(), |
186 | Status::Corruption("log record too small" )); |
187 | continue; |
188 | } |
189 | WriteBatchInternal::SetContents(&batch, record); |
190 | status = WriteBatchInternal::InsertInto(&batch, mem); |
191 | if (status.ok()) { |
192 | counter += WriteBatchInternal::Count(&batch); |
193 | } else { |
194 | Log(options_.info_log, "Log #%llu: ignoring %s" , |
195 | (unsigned long long)log, status.ToString().c_str()); |
196 | status = Status::OK(); // Keep going with rest of file |
197 | } |
198 | } |
199 | delete lfile; |
200 | |
201 | // Do not record a version edit for this conversion to a Table |
202 | // since ExtractMetaData() will also generate edits. |
203 | FileMetaData meta; |
204 | meta.number = next_file_number_++; |
205 | Iterator* iter = mem->NewIterator(); |
206 | status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); |
207 | delete iter; |
208 | mem->Unref(); |
209 | mem = nullptr; |
210 | if (status.ok()) { |
211 | if (meta.file_size > 0) { |
212 | table_numbers_.push_back(meta.number); |
213 | } |
214 | } |
215 | Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s" , |
216 | (unsigned long long)log, counter, (unsigned long long)meta.number, |
217 | status.ToString().c_str()); |
218 | return status; |
219 | } |
220 | |
221 | void () { |
222 | for (size_t i = 0; i < table_numbers_.size(); i++) { |
223 | ScanTable(table_numbers_[i]); |
224 | } |
225 | } |
226 | |
227 | Iterator* NewTableIterator(const FileMetaData& meta) { |
228 | // Same as compaction iterators: if paranoid_checks are on, turn |
229 | // on checksum verification. |
230 | ReadOptions r; |
231 | r.verify_checksums = options_.paranoid_checks; |
232 | return table_cache_->NewIterator(r, meta.number, meta.file_size); |
233 | } |
234 | |
235 | void ScanTable(uint64_t number) { |
236 | TableInfo t; |
237 | t.meta.number = number; |
238 | std::string fname = TableFileName(dbname_, number); |
239 | Status status = env_->GetFileSize(fname, &t.meta.file_size); |
240 | if (!status.ok()) { |
241 | // Try alternate file name. |
242 | fname = SSTTableFileName(dbname_, number); |
243 | Status s2 = env_->GetFileSize(fname, &t.meta.file_size); |
244 | if (s2.ok()) { |
245 | status = Status::OK(); |
246 | } |
247 | } |
248 | if (!status.ok()) { |
249 | ArchiveFile(TableFileName(dbname_, number)); |
250 | ArchiveFile(SSTTableFileName(dbname_, number)); |
251 | Log(options_.info_log, "Table #%llu: dropped: %s" , |
252 | (unsigned long long)t.meta.number, status.ToString().c_str()); |
253 | return; |
254 | } |
255 | |
256 | // Extract metadata by scanning through table. |
257 | int counter = 0; |
258 | Iterator* iter = NewTableIterator(t.meta); |
259 | bool empty = true; |
260 | ParsedInternalKey parsed; |
261 | t.max_sequence = 0; |
262 | for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
263 | Slice key = iter->key(); |
264 | if (!ParseInternalKey(key, &parsed)) { |
265 | Log(options_.info_log, "Table #%llu: unparsable key %s" , |
266 | (unsigned long long)t.meta.number, EscapeString(key).c_str()); |
267 | continue; |
268 | } |
269 | |
270 | counter++; |
271 | if (empty) { |
272 | empty = false; |
273 | t.meta.smallest.DecodeFrom(key); |
274 | } |
275 | t.meta.largest.DecodeFrom(key); |
276 | if (parsed.sequence > t.max_sequence) { |
277 | t.max_sequence = parsed.sequence; |
278 | } |
279 | } |
280 | if (!iter->status().ok()) { |
281 | status = iter->status(); |
282 | } |
283 | delete iter; |
284 | Log(options_.info_log, "Table #%llu: %d entries %s" , |
285 | (unsigned long long)t.meta.number, counter, status.ToString().c_str()); |
286 | |
287 | if (status.ok()) { |
288 | tables_.push_back(t); |
289 | } else { |
290 | RepairTable(fname, t); // RepairTable archives input file. |
291 | } |
292 | } |
293 | |
294 | void RepairTable(const std::string& src, TableInfo t) { |
295 | // We will copy src contents to a new table and then rename the |
296 | // new table over the source. |
297 | |
298 | // Create builder. |
299 | std::string copy = TableFileName(dbname_, next_file_number_++); |
300 | WritableFile* file; |
301 | Status s = env_->NewWritableFile(copy, &file); |
302 | if (!s.ok()) { |
303 | return; |
304 | } |
305 | TableBuilder* builder = new TableBuilder(options_, file); |
306 | |
307 | // Copy data. |
308 | Iterator* iter = NewTableIterator(t.meta); |
309 | int counter = 0; |
310 | for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { |
311 | builder->Add(iter->key(), iter->value()); |
312 | counter++; |
313 | } |
314 | delete iter; |
315 | |
316 | ArchiveFile(src); |
317 | if (counter == 0) { |
318 | builder->Abandon(); // Nothing to save |
319 | } else { |
320 | s = builder->Finish(); |
321 | if (s.ok()) { |
322 | t.meta.file_size = builder->FileSize(); |
323 | } |
324 | } |
325 | delete builder; |
326 | builder = nullptr; |
327 | |
328 | if (s.ok()) { |
329 | s = file->Close(); |
330 | } |
331 | delete file; |
332 | file = nullptr; |
333 | |
334 | if (counter > 0 && s.ok()) { |
335 | std::string orig = TableFileName(dbname_, t.meta.number); |
336 | s = env_->RenameFile(copy, orig); |
337 | if (s.ok()) { |
338 | Log(options_.info_log, "Table #%llu: %d entries repaired" , |
339 | (unsigned long long)t.meta.number, counter); |
340 | tables_.push_back(t); |
341 | } |
342 | } |
343 | if (!s.ok()) { |
344 | env_->RemoveFile(copy); |
345 | } |
346 | } |
347 | |
348 | Status WriteDescriptor() { |
349 | std::string tmp = TempFileName(dbname_, 1); |
350 | WritableFile* file; |
351 | Status status = env_->NewWritableFile(tmp, &file); |
352 | if (!status.ok()) { |
353 | return status; |
354 | } |
355 | |
356 | SequenceNumber max_sequence = 0; |
357 | for (size_t i = 0; i < tables_.size(); i++) { |
358 | if (max_sequence < tables_[i].max_sequence) { |
359 | max_sequence = tables_[i].max_sequence; |
360 | } |
361 | } |
362 | |
363 | edit_.SetComparatorName(icmp_.user_comparator()->Name()); |
364 | edit_.SetLogNumber(0); |
365 | edit_.SetNextFile(next_file_number_); |
366 | edit_.SetLastSequence(max_sequence); |
367 | |
368 | for (size_t i = 0; i < tables_.size(); i++) { |
369 | // TODO(opt): separate out into multiple levels |
370 | const TableInfo& t = tables_[i]; |
371 | edit_.AddFile(0, t.meta.number, t.meta.file_size, t.meta.smallest, |
372 | t.meta.largest); |
373 | } |
374 | |
375 | // std::fprintf(stderr, |
376 | // "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); |
377 | { |
378 | log::Writer log(file); |
379 | std::string record; |
380 | edit_.EncodeTo(&record); |
381 | status = log.AddRecord(record); |
382 | } |
383 | if (status.ok()) { |
384 | status = file->Close(); |
385 | } |
386 | delete file; |
387 | file = nullptr; |
388 | |
389 | if (!status.ok()) { |
390 | env_->RemoveFile(tmp); |
391 | } else { |
392 | // Discard older manifests |
393 | for (size_t i = 0; i < manifests_.size(); i++) { |
394 | ArchiveFile(dbname_ + "/" + manifests_[i]); |
395 | } |
396 | |
397 | // Install new manifest |
398 | status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); |
399 | if (status.ok()) { |
400 | status = SetCurrentFile(env_, dbname_, 1); |
401 | } else { |
402 | env_->RemoveFile(tmp); |
403 | } |
404 | } |
405 | return status; |
406 | } |
407 | |
408 | void ArchiveFile(const std::string& fname) { |
409 | // Move into another directory. E.g., for |
410 | // dir/foo |
411 | // rename to |
412 | // dir/lost/foo |
413 | const char* slash = strrchr(fname.c_str(), '/'); |
414 | std::string new_dir; |
415 | if (slash != nullptr) { |
416 | new_dir.assign(fname.data(), slash - fname.data()); |
417 | } |
418 | new_dir.append("/lost" ); |
419 | env_->CreateDir(new_dir); // Ignore error |
420 | std::string new_file = new_dir; |
421 | new_file.append("/" ); |
422 | new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); |
423 | Status s = env_->RenameFile(fname, new_file); |
424 | Log(options_.info_log, "Archiving %s: %s\n" , fname.c_str(), |
425 | s.ToString().c_str()); |
426 | } |
427 | |
428 | const std::string dbname_; |
429 | Env* const env_; |
430 | InternalKeyComparator const icmp_; |
431 | InternalFilterPolicy const ipolicy_; |
432 | const Options options_; |
433 | bool owns_info_log_; |
434 | bool owns_cache_; |
435 | TableCache* table_cache_; |
436 | VersionEdit edit_; |
437 | |
438 | std::vector<std::string> manifests_; |
439 | std::vector<uint64_t> table_numbers_; |
440 | std::vector<uint64_t> logs_; |
441 | std::vector<TableInfo> tables_; |
442 | uint64_t next_file_number_; |
443 | }; |
444 | } // namespace |
445 | |
446 | Status RepairDB(const std::string& dbname, const Options& options) { |
447 | Repairer repairer(dbname, options); |
448 | return repairer.Run(); |
449 | } |
450 | |
451 | } // namespace leveldb |
452 | |