/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Graph/Graph.h"
#include "glow/IR/IR.h"
#include "glow/Support/Support.h"

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Timer.h"

#include <glog/logging.h>

#include <algorithm>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

using namespace glow;
using llvm::format;
namespace {
llvm::cl::OptionCategory ptbCat("PTB Options");
llvm::cl::opt<std::string> executionBackend(
    "backend",
    llvm::cl::desc("Backend to use, e.g., Interpreter, CPU, OpenCL"),
    llvm::cl::Optional, llvm::cl::init("Interpreter"), llvm::cl::cat(ptbCat));

llvm::cl::opt<std::string> dumpInitialGraphDAGFileOpt(
    "dumpInitialGraphDAG",
    llvm::cl::desc(
        "Specify the file to export the initial Graph in DOT format"),
    llvm::cl::value_desc("file.dot"), llvm::cl::cat(ptbCat));

llvm::cl::opt<std::string> dumpTrainingGraphDAGFileOpt(
    "dumpTrainingGraphDAG",
    llvm::cl::desc(
        "Specify the file to export the training Graph in DOT format"),
    llvm::cl::value_desc("file.dot"), llvm::cl::cat(ptbCat));

} // namespace

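/// Loads up to \p maxNumWords words of the PTB training corpus into the
/// one-hot encoded inputs \p inputWords and next-word targets \p targetWords,
/// using a vocabulary capped at \p vocabSize. Each row holds \p numSteps
/// consecutive words; rows are grouped into minibatches of \p minibatchSize.
/// Returns the number of words loaded.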
unsigned loadPTB(Tensor &inputWords, Tensor &targetWords, dim_t numSteps,
                 dim_t vocabSize, dim_t minibatchSize, dim_t maxNumWords) {

  std::ifstream ptbInput("ptb/simple-examples/data/ptb.train.txt");
  CHECK(ptbInput.is_open()) << "Error loading ptb.train.txt";

  std::vector<std::string> words;
  std::string line;

  while (getline(ptbInput, line)) {
    std::istringstream ss(line);
    std::string token;
    while (getline(ss, token, ' ')) {
      if (!token.empty()) {
        words.push_back(token);
      }
    }
    words.push_back("<eos>");
  }
  ptbInput.close();

  // Cap the corpus at maxNumWords words; using more makes everything slower.
  words.resize(std::min<size_t>(words.size(), maxNumWords));
  size_t numWords = words.size();

  CHECK_GT(numWords, 0) << "No words were found.";

  // Count the occurrences of each word in the input text.
  std::map<std::string, int> counter;
  for (const auto &word : words) {
    counter[word] += 1;
  }

  // Sort the words by descending frequency, breaking ties alphabetically.
  std::vector<std::pair<std::string, int>> counters(counter.begin(),
                                                    counter.end());

  std::sort(counters.begin(), counters.end(),
            [](const std::pair<std::string, int> &lhs,
               const std::pair<std::string, int> &rhs) {
              if (lhs.second == rhs.second) {
                return rhs.first > lhs.first;
              }
              return lhs.second > rhs.second;
            });

  // Build the word-to-id map. Words ranked beyond the vocabulary limit all
  // map to the last id (vocabSize - 1), which serves as an out-of-vocabulary
  // bucket.
  std::map<std::string, int> wordToId;
  for (unsigned i = 0; i < counters.size(); i++) {
    auto const &word = counters[i].first;
    wordToId[word] = std::min<size_t>(i, vocabSize - 1);
  }

  // Load the PTB database into two 2-D tensors for word inputs and targets.
  dim_t batchLength = numWords / minibatchSize;
  dim_t numBatches = (batchLength - 1) / numSteps;
  dim_t numSequences = minibatchSize * numBatches;
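  // The corpus is split into minibatchSize parallel streams of batchLength
  // consecutive words each; every stream is then cut into numBatches windows
  // of numSteps words, giving numSequences training sequences in total.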

  // While we don't have an embedding layer, we use one-hot encoding to
  // represent the input words. To limit the size of the data we use an upper
  // bound on the vocabulary size.
  inputWords.reset(ElemKind::FloatTy, {numSequences, vocabSize * numSteps});
  targetWords.reset(ElemKind::Int64ITy, {numSequences, numSteps});
  auto IIH = inputWords.getHandle<>();
  auto TIH = targetWords.getHandle<int64_t>();
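  // For every (batch, iter, step) triple, set the one-hot bit of the current
  // word and record the id of the following word as the prediction target.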
  for (unsigned batch = 0; batch < minibatchSize; batch++) {
    for (unsigned iter = 0; iter < numBatches; iter++) {
      dim_t sequence = batch + iter * minibatchSize;
      for (unsigned step = 0; step < numSteps; step++) {
        int wordCounterId = step + iter * numSteps + batch * batchLength;
        const std::string word1 = words[wordCounterId];
        const std::string word2 = words[wordCounterId + 1];
        IIH.at({sequence, step * vocabSize + wordToId[word1]}) = 1;
        TIH.at({sequence, step}) = wordToId[word2];
      }
    }
  }
  return numWords;
}

/// This test builds an RNN language model on the Penn TreeBank dataset.
/// Results for RNN word-level perplexity are reported in
/// https://arxiv.org/pdf/1409.2329.pdf. Here we simplify the problem so that
/// it can run on a single CPU.
/// The results were cross-checked with an equivalent TensorFlow implementation
/// as well as a vanilla implementation inspired by Karpathy's Char-RNN code.
/// TensorFlow https://gist.github.com/mcaounfb/7ba05b0a62383c36e24a33defa3f11aa
/// Vanilla https://gist.github.com/mcaounfb/c4ee98bbddaa6f8505f283ac018f8c34
///
/// The perplexity results are expected to look as follows:
///
/// Iteration 1: 105.4579
/// Iteration 2: 82.3274
/// Iteration 4: 70.8094
/// Iteration 6: 63.8546
/// Iteration 8: 58.4330
/// Iteration 10: 53.7943
/// Iteration 12: 49.7214
/// Iteration 14: 46.1715
/// Iteration 16: 43.1474
/// Iteration 18: 40.5605
/// Iteration 20: 38.2837
///
/// For reference, we expect that using an LSTM instead of the current
/// simple RNN block would improve the perplexity to ~20.
void testPTB() {
  LOG(INFO) << "Loading the ptb database.";

  Tensor inputWords;
  Tensor targetWords;

  const dim_t minibatchSize = 10;
  const dim_t numSteps = 10;
  const dim_t numEpochs = 20;

  const dim_t hiddenSize = 20;
  const dim_t vocabSize = 500;
  const dim_t maxNumWords = 10000;

  float learningRate = .1;

  unsigned numWords = loadPTB(inputWords, targetWords, numSteps, vocabSize,
                              minibatchSize, maxNumWords);
  LOG(INFO) << "Loaded " << numWords << " words.";
  ExecutionEngine EE(executionBackend);
  PlaceholderBindings bindings;

  // Construct the network:
  TrainingConfig TC;
  TC.learningRate = learningRate;
  TC.momentum = 0;
  TC.batchSize = minibatchSize;

  auto &mod = EE.getModule();
  Function *F = mod.createFunction("main");
  LOG(INFO) << "Building";

  auto *X = mod.createPlaceholder(
      ElemKind::FloatTy, {minibatchSize, vocabSize * numSteps}, "input", false);
  bindings.allocate(X);
  auto *Y = mod.createPlaceholder(ElemKind::Int64ITy, {minibatchSize, numSteps},
                                  "selected", false);
  bindings.allocate(Y);

  std::vector<NodeValue> slicesX;

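  // Slice the flattened one-hot input into numSteps tensors of shape
  // {minibatchSize, vocabSize}, one per time step of the unrolled RNN.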
  for (unsigned t = 0; t < numSteps; t++) {
    auto XtName = "X." + std::to_string(t);
    auto *Xt = F->createSlice(XtName, X, {0, t * vocabSize},
                              {minibatchSize, (t + 1) * vocabSize});
    slicesX.push_back(Xt);
  }

  std::vector<NodeValue> outputNodes;
  F->createSimpleRNN(bindings, "rnn", slicesX, minibatchSize, hiddenSize,
                     vocabSize, outputNodes);

  // O has a shape of {numSteps * minibatchSize, vocabSize}.
  Node *O = F->createConcat("output", outputNodes, 0);
  // T has a shape of {numSteps * minibatchSize, 1}.
  Node *TN = F->createTranspose("Y.transpose", Y, {1, 0});
  Node *T = F->createReshape("Y.reshape", TN, {numSteps * minibatchSize, 1});
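  // The concat above stacks the per-step outputs along dimension 0, so rows
  // are ordered step-major; the {minibatchSize, numSteps} targets are
  // transposed and flattened to match that layout.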

  auto *SM = F->createSoftMax("softmax", O, T);
  auto *save = F->createSave("result", SM);
  auto *result = bindings.allocate(save->getPlaceholder());

  if (!dumpInitialGraphDAGFileOpt.empty()) {
    LOG(INFO) << "Dumping initial graph";
    F->dumpDAG(dumpInitialGraphDAGFileOpt.c_str());
  }

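  // Differentiate the inference function to obtain the training function,
  // which updates the weights with the learning rate and momentum from TC.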
  Function *TF = glow::differentiate(F, TC);
  auto tfName = TF->getName();

  EE.compile(CompilationMode::Train);
  bindings.allocate(mod.getPlaceholders());

  if (!dumpTrainingGraphDAGFileOpt.empty()) {
    LOG(INFO) << "Dumping training graph";
    TF->dumpDAG(dumpTrainingGraphDAGFileOpt.c_str());
  }

  size_t numBatches = (numWords / minibatchSize - 1) / numSteps;

  LOG(INFO) << "Training for " << numBatches << " rounds";

  float metricValues[numEpochs];

  for (size_t iter = 0; iter < numEpochs; iter++) {
    llvm::outs() << "Training - iteration #" << (iter + 1) << "\n";

    llvm::Timer timer("Training", "Training");
    timer.startTimer();

    // Compute the perplexity over a few minibatches.
    float perplexity = 0;
    size_t perplexityWordsCount = 0;
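    // Perplexity is the exponential of the average per-word cross-entropy:
    // exp((1/N) * sum(-log p(correct word))).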

    // This variable records the index of the next sample to be used for
    // training.
    size_t sampleCounter = 0;

    for (unsigned batch = 0; batch < numBatches; batch++) {
      Tensor inputWordsBatch(ElemKind::FloatTy,
                             {minibatchSize, vocabSize * numSteps});
      inputWordsBatch.copyConsecutiveSlices(&inputWords, minibatchSize * batch);

      Tensor targetWordsBatch(ElemKind::Int64ITy, {minibatchSize, numSteps});
      targetWordsBatch.copyConsecutiveSlices(&targetWords,
                                             minibatchSize * batch);

      runBatch(EE, bindings, 1, sampleCounter, {X, Y},
               {&inputWordsBatch, &targetWordsBatch}, tfName);
      for (dim_t step = 0; step < numSteps; step++) {
        for (unsigned int i = 0; i < minibatchSize; i++) {
          auto probs =
              result->getHandle<float>().extractSlice(step * minibatchSize + i);
          dim_t correct = targetWords.getHandle<int64_t>().at(
              {minibatchSize * batch + i, step});
          float softGuess = -std::log(probs.getHandle<float>().at({correct}));
          perplexity += softGuess;
          perplexityWordsCount += 1;
        }
      }
      if (batch % 10 == 1) {
        llvm::outs() << "perplexity: "
                     << format("%0.4f",
                               std::exp(perplexity / perplexityWordsCount))
                     << "\n";
      }
    }
    metricValues[iter] = std::exp(perplexity / perplexityWordsCount);
    llvm::outs() << "perplexity: " << format("%0.4f", metricValues[iter])
                 << "\n\n";

    timer.stopTimer();
  }

  llvm::outs() << "Perplexity scores in copy-pastable format:\n";
  for (size_t iter = 0; iter < numEpochs; iter++) {
    // Print iterations 1, 2, 4, 6, ..., 20 to match the expected results
    // listed in the comment above testPTB().
    if (iter != 0 && iter % 2 == 0)
      continue;
    llvm::outs() << "/// Iteration " << iter + 1 << ": "
                 << format("%0.4f", metricValues[iter]) << "\n";
  }
  llvm::outs() << "Note that a small 1E-4 error is considered acceptable and "
               << "may come from fast-math optimizations.\n";
}

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv, " The PTB test\n\n");
  testPTB();

  return 0;
}