1// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file. See the AUTHORS file for names of contributors.
4//
5// We recover the contents of the descriptor from the other files we find.
6// (1) Any log files are first converted to tables
7// (2) We scan every table to compute
8// (a) smallest/largest for the table
9// (b) largest sequence number in the table
10// (3) We generate descriptor contents:
11// - log number is set to zero
12// - next-file-number is set to 1 + largest file number we found
13// - last-sequence-number is set to largest sequence# found across
14// all tables (see 2c)
15// - compaction pointers are cleared
16// - every table file is added at level 0
17//
18// Possible optimization 1:
19// (a) Compute total size and use to pick appropriate max-level M
20// (b) Sort tables by largest sequence# in the table
21// (c) For each table: if it overlaps earlier table, place in level-0,
22// else place in level-M.
23// Possible optimization 2:
24// Store per-table metadata (smallest, largest, largest-seq#, ...)
25// in the table's meta section to speed up ScanTable.
26
27#include "db/builder.h"
28#include "db/db_impl.h"
29#include "db/dbformat.h"
30#include "db/filename.h"
31#include "db/log_reader.h"
32#include "db/log_writer.h"
33#include "db/memtable.h"
34#include "db/table_cache.h"
35#include "db/version_edit.h"
36#include "db/write_batch_internal.h"
37#include "leveldb/comparator.h"
38#include "leveldb/db.h"
39#include "leveldb/env.h"
40
41namespace leveldb {
42
43namespace {
44
45class Repairer {
46 public:
47 Repairer(const std::string& dbname, const Options& options)
48 : dbname_(dbname),
49 env_(options.env),
50 icmp_(options.comparator),
51 ipolicy_(options.filter_policy),
52 options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
53 owns_info_log_(options_.info_log != options.info_log),
54 owns_cache_(options_.block_cache != options.block_cache),
55 next_file_number_(1) {
56 // TableCache can be small since we expect each table to be opened once.
57 table_cache_ = new TableCache(dbname_, options_, 10);
58 }
59
60 ~Repairer() {
61 delete table_cache_;
62 if (owns_info_log_) {
63 delete options_.info_log;
64 }
65 if (owns_cache_) {
66 delete options_.block_cache;
67 }
68 }
69
70 Status Run() {
71 Status status = FindFiles();
72 if (status.ok()) {
73 ConvertLogFilesToTables();
74 ExtractMetaData();
75 status = WriteDescriptor();
76 }
77 if (status.ok()) {
78 unsigned long long bytes = 0;
79 for (size_t i = 0; i < tables_.size(); i++) {
80 bytes += tables_[i].meta.file_size;
81 }
82 Log(options_.info_log,
83 "**** Repaired leveldb %s; "
84 "recovered %d files; %llu bytes. "
85 "Some data may have been lost. "
86 "****",
87 dbname_.c_str(), static_cast<int>(tables_.size()), bytes);
88 }
89 return status;
90 }
91
92 private:
93 struct TableInfo {
94 FileMetaData meta;
95 SequenceNumber max_sequence;
96 };
97
98 Status FindFiles() {
99 std::vector<std::string> filenames;
100 Status status = env_->GetChildren(dbname_, &filenames);
101 if (!status.ok()) {
102 return status;
103 }
104 if (filenames.empty()) {
105 return Status::IOError(dbname_, "repair found no files");
106 }
107
108 uint64_t number;
109 FileType type;
110 for (size_t i = 0; i < filenames.size(); i++) {
111 if (ParseFileName(filenames[i], &number, &type)) {
112 if (type == kDescriptorFile) {
113 manifests_.push_back(filenames[i]);
114 } else {
115 if (number + 1 > next_file_number_) {
116 next_file_number_ = number + 1;
117 }
118 if (type == kLogFile) {
119 logs_.push_back(number);
120 } else if (type == kTableFile) {
121 table_numbers_.push_back(number);
122 } else {
123 // Ignore other files
124 }
125 }
126 }
127 }
128 return status;
129 }
130
131 void ConvertLogFilesToTables() {
132 for (size_t i = 0; i < logs_.size(); i++) {
133 std::string logname = LogFileName(dbname_, logs_[i]);
134 Status status = ConvertLogToTable(logs_[i]);
135 if (!status.ok()) {
136 Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
137 (unsigned long long)logs_[i], status.ToString().c_str());
138 }
139 ArchiveFile(logname);
140 }
141 }
142
143 Status ConvertLogToTable(uint64_t log) {
144 struct LogReporter : public log::Reader::Reporter {
145 Env* env;
146 Logger* info_log;
147 uint64_t lognum;
148 void Corruption(size_t bytes, const Status& s) override {
149 // We print error messages for corruption, but continue repairing.
150 Log(info_log, "Log #%llu: dropping %d bytes; %s",
151 (unsigned long long)lognum, static_cast<int>(bytes),
152 s.ToString().c_str());
153 }
154 };
155
156 // Open the log file
157 std::string logname = LogFileName(dbname_, log);
158 SequentialFile* lfile;
159 Status status = env_->NewSequentialFile(logname, &lfile);
160 if (!status.ok()) {
161 return status;
162 }
163
164 // Create the log reader.
165 LogReporter reporter;
166 reporter.env = env_;
167 reporter.info_log = options_.info_log;
168 reporter.lognum = log;
169 // We intentionally make log::Reader do checksumming so that
170 // corruptions cause entire commits to be skipped instead of
171 // propagating bad information (like overly large sequence
172 // numbers).
173 log::Reader reader(lfile, &reporter, false /*do not checksum*/,
174 0 /*initial_offset*/);
175
176 // Read all the records and add to a memtable
177 std::string scratch;
178 Slice record;
179 WriteBatch batch;
180 MemTable* mem = new MemTable(icmp_);
181 mem->Ref();
182 int counter = 0;
183 while (reader.ReadRecord(&record, &scratch)) {
184 if (record.size() < 12) {
185 reporter.Corruption(record.size(),
186 Status::Corruption("log record too small"));
187 continue;
188 }
189 WriteBatchInternal::SetContents(&batch, record);
190 status = WriteBatchInternal::InsertInto(&batch, mem);
191 if (status.ok()) {
192 counter += WriteBatchInternal::Count(&batch);
193 } else {
194 Log(options_.info_log, "Log #%llu: ignoring %s",
195 (unsigned long long)log, status.ToString().c_str());
196 status = Status::OK(); // Keep going with rest of file
197 }
198 }
199 delete lfile;
200
201 // Do not record a version edit for this conversion to a Table
202 // since ExtractMetaData() will also generate edits.
203 FileMetaData meta;
204 meta.number = next_file_number_++;
205 Iterator* iter = mem->NewIterator();
206 status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
207 delete iter;
208 mem->Unref();
209 mem = nullptr;
210 if (status.ok()) {
211 if (meta.file_size > 0) {
212 table_numbers_.push_back(meta.number);
213 }
214 }
215 Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
216 (unsigned long long)log, counter, (unsigned long long)meta.number,
217 status.ToString().c_str());
218 return status;
219 }
220
221 void ExtractMetaData() {
222 for (size_t i = 0; i < table_numbers_.size(); i++) {
223 ScanTable(table_numbers_[i]);
224 }
225 }
226
227 Iterator* NewTableIterator(const FileMetaData& meta) {
228 // Same as compaction iterators: if paranoid_checks are on, turn
229 // on checksum verification.
230 ReadOptions r;
231 r.verify_checksums = options_.paranoid_checks;
232 return table_cache_->NewIterator(r, meta.number, meta.file_size);
233 }
234
235 void ScanTable(uint64_t number) {
236 TableInfo t;
237 t.meta.number = number;
238 std::string fname = TableFileName(dbname_, number);
239 Status status = env_->GetFileSize(fname, &t.meta.file_size);
240 if (!status.ok()) {
241 // Try alternate file name.
242 fname = SSTTableFileName(dbname_, number);
243 Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
244 if (s2.ok()) {
245 status = Status::OK();
246 }
247 }
248 if (!status.ok()) {
249 ArchiveFile(TableFileName(dbname_, number));
250 ArchiveFile(SSTTableFileName(dbname_, number));
251 Log(options_.info_log, "Table #%llu: dropped: %s",
252 (unsigned long long)t.meta.number, status.ToString().c_str());
253 return;
254 }
255
256 // Extract metadata by scanning through table.
257 int counter = 0;
258 Iterator* iter = NewTableIterator(t.meta);
259 bool empty = true;
260 ParsedInternalKey parsed;
261 t.max_sequence = 0;
262 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
263 Slice key = iter->key();
264 if (!ParseInternalKey(key, &parsed)) {
265 Log(options_.info_log, "Table #%llu: unparsable key %s",
266 (unsigned long long)t.meta.number, EscapeString(key).c_str());
267 continue;
268 }
269
270 counter++;
271 if (empty) {
272 empty = false;
273 t.meta.smallest.DecodeFrom(key);
274 }
275 t.meta.largest.DecodeFrom(key);
276 if (parsed.sequence > t.max_sequence) {
277 t.max_sequence = parsed.sequence;
278 }
279 }
280 if (!iter->status().ok()) {
281 status = iter->status();
282 }
283 delete iter;
284 Log(options_.info_log, "Table #%llu: %d entries %s",
285 (unsigned long long)t.meta.number, counter, status.ToString().c_str());
286
287 if (status.ok()) {
288 tables_.push_back(t);
289 } else {
290 RepairTable(fname, t); // RepairTable archives input file.
291 }
292 }
293
294 void RepairTable(const std::string& src, TableInfo t) {
295 // We will copy src contents to a new table and then rename the
296 // new table over the source.
297
298 // Create builder.
299 std::string copy = TableFileName(dbname_, next_file_number_++);
300 WritableFile* file;
301 Status s = env_->NewWritableFile(copy, &file);
302 if (!s.ok()) {
303 return;
304 }
305 TableBuilder* builder = new TableBuilder(options_, file);
306
307 // Copy data.
308 Iterator* iter = NewTableIterator(t.meta);
309 int counter = 0;
310 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
311 builder->Add(iter->key(), iter->value());
312 counter++;
313 }
314 delete iter;
315
316 ArchiveFile(src);
317 if (counter == 0) {
318 builder->Abandon(); // Nothing to save
319 } else {
320 s = builder->Finish();
321 if (s.ok()) {
322 t.meta.file_size = builder->FileSize();
323 }
324 }
325 delete builder;
326 builder = nullptr;
327
328 if (s.ok()) {
329 s = file->Close();
330 }
331 delete file;
332 file = nullptr;
333
334 if (counter > 0 && s.ok()) {
335 std::string orig = TableFileName(dbname_, t.meta.number);
336 s = env_->RenameFile(copy, orig);
337 if (s.ok()) {
338 Log(options_.info_log, "Table #%llu: %d entries repaired",
339 (unsigned long long)t.meta.number, counter);
340 tables_.push_back(t);
341 }
342 }
343 if (!s.ok()) {
344 env_->RemoveFile(copy);
345 }
346 }
347
348 Status WriteDescriptor() {
349 std::string tmp = TempFileName(dbname_, 1);
350 WritableFile* file;
351 Status status = env_->NewWritableFile(tmp, &file);
352 if (!status.ok()) {
353 return status;
354 }
355
356 SequenceNumber max_sequence = 0;
357 for (size_t i = 0; i < tables_.size(); i++) {
358 if (max_sequence < tables_[i].max_sequence) {
359 max_sequence = tables_[i].max_sequence;
360 }
361 }
362
363 edit_.SetComparatorName(icmp_.user_comparator()->Name());
364 edit_.SetLogNumber(0);
365 edit_.SetNextFile(next_file_number_);
366 edit_.SetLastSequence(max_sequence);
367
368 for (size_t i = 0; i < tables_.size(); i++) {
369 // TODO(opt): separate out into multiple levels
370 const TableInfo& t = tables_[i];
371 edit_.AddFile(0, t.meta.number, t.meta.file_size, t.meta.smallest,
372 t.meta.largest);
373 }
374
375 // std::fprintf(stderr,
376 // "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
377 {
378 log::Writer log(file);
379 std::string record;
380 edit_.EncodeTo(&record);
381 status = log.AddRecord(record);
382 }
383 if (status.ok()) {
384 status = file->Close();
385 }
386 delete file;
387 file = nullptr;
388
389 if (!status.ok()) {
390 env_->RemoveFile(tmp);
391 } else {
392 // Discard older manifests
393 for (size_t i = 0; i < manifests_.size(); i++) {
394 ArchiveFile(dbname_ + "/" + manifests_[i]);
395 }
396
397 // Install new manifest
398 status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
399 if (status.ok()) {
400 status = SetCurrentFile(env_, dbname_, 1);
401 } else {
402 env_->RemoveFile(tmp);
403 }
404 }
405 return status;
406 }
407
408 void ArchiveFile(const std::string& fname) {
409 // Move into another directory. E.g., for
410 // dir/foo
411 // rename to
412 // dir/lost/foo
413 const char* slash = strrchr(fname.c_str(), '/');
414 std::string new_dir;
415 if (slash != nullptr) {
416 new_dir.assign(fname.data(), slash - fname.data());
417 }
418 new_dir.append("/lost");
419 env_->CreateDir(new_dir); // Ignore error
420 std::string new_file = new_dir;
421 new_file.append("/");
422 new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
423 Status s = env_->RenameFile(fname, new_file);
424 Log(options_.info_log, "Archiving %s: %s\n", fname.c_str(),
425 s.ToString().c_str());
426 }
427
428 const std::string dbname_;
429 Env* const env_;
430 InternalKeyComparator const icmp_;
431 InternalFilterPolicy const ipolicy_;
432 const Options options_;
433 bool owns_info_log_;
434 bool owns_cache_;
435 TableCache* table_cache_;
436 VersionEdit edit_;
437
438 std::vector<std::string> manifests_;
439 std::vector<uint64_t> table_numbers_;
440 std::vector<uint64_t> logs_;
441 std::vector<TableInfo> tables_;
442 uint64_t next_file_number_;
443};
444} // namespace
445
446Status RepairDB(const std::string& dbname, const Options& options) {
447 Repairer repairer(dbname, options);
448 return repairer.Run();
449}
450
451} // namespace leveldb
452