// This file is MACHINE GENERATED! Do not edit.

#ifndef TENSORFLOW_CC_OPS_AUDIO_OPS_H_
#define TENSORFLOW_CC_OPS_AUDIO_OPS_H_

// This file is MACHINE GENERATED! Do not edit.

#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"

namespace tensorflow {
namespace ops {

/// @defgroup audio_ops Audio Ops
/// @{

/// Produces a visualization of audio data over time.
///
/// Spectrograms are a standard way of representing audio information as a series of
/// slices of frequency information, one slice for each window of time. By joining
/// these together into a sequence, they form a distinctive fingerprint of the sound
/// over time.
///
/// This op expects to receive audio data as an input, stored as floats in the range
/// -1 to 1, together with a window width in samples, and a stride specifying how
/// far to move the window between slices. From this it generates a three
/// dimensional output. The first dimension is for the channels in the input, so a
/// stereo audio input would have two here for example. The second dimension is time,
/// with successive frequency slices. The third dimension has an amplitude value for
/// each frequency during that time slice.
///
/// This means the layout when converted and saved as an image is rotated 90 degrees
/// clockwise from a typical spectrogram. Time is descending down the Y axis, and
/// the frequency decreases from left to right.
///
/// Each value in the result represents the square root of the sum of the squares of
/// the real and imaginary parts of an FFT on the current window of samples. In this
/// way, the lowest dimension represents the power of each frequency in the current
/// window, and adjacent windows are concatenated in the next dimension.
///
/// To get a more intuitive and visual look at what this operation does, you can run
/// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
/// resulting spectrogram as a PNG image.
///
/// Args:
/// * scope: A Scope object
/// * input: Float representation of audio data.
/// * window_size: How wide the input window is in samples. For the highest efficiency
/// this should be a power of two, but other values are accepted.
/// * stride: How widely apart the center of adjacent sample windows should be.
///
/// Optional attributes (see `Attrs`):
/// * magnitude_squared: Whether to return the squared magnitude or just the
/// magnitude. Using squared magnitude can avoid extra calculations.
///
/// Returns:
/// * `Output`: 3D representation of the audio frequencies as an image.
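///
/// Example usage (an illustrative graph-construction sketch only; `Placeholder`
/// comes from another generated header such as tensorflow/cc/ops/standard_ops.h,
/// and the window and stride values are arbitrary):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Audio samples in [-1, 1], shaped [samples, channels].
///     auto waveform = Placeholder(root, DT_FLOAT);
///     // 1024-sample windows, moved forward 512 samples per slice.
///     auto spectrogram = AudioSpectrogram(root, waveform, /*window_size=*/1024,
///                                         /*stride=*/512,
///                                         AudioSpectrogram::MagnitudeSquared(true));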
class AudioSpectrogram {
 public:
  /// Optional attribute setters for AudioSpectrogram
  struct Attrs {
    /// Whether to return the squared magnitude or just the
    /// magnitude. Using squared magnitude can avoid extra calculations.
    ///
    /// Defaults to false
    TF_MUST_USE_RESULT Attrs MagnitudeSquared(bool x) {
      Attrs ret = *this;
      ret.magnitude_squared_ = x;
      return ret;
    }

    bool magnitude_squared_ = false;
  };
  AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input,
                   int64 window_size, int64 stride);
  AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input,
                   int64 window_size, int64 stride,
                   const AudioSpectrogram::Attrs& attrs);
  operator ::tensorflow::Output() const { return spectrogram; }
  operator ::tensorflow::Input() const { return spectrogram; }
  ::tensorflow::Node* node() const { return spectrogram.node(); }

  static Attrs MagnitudeSquared(bool x) {
    return Attrs().MagnitudeSquared(x);
  }

  Operation operation;
  ::tensorflow::Output spectrogram;
};

/// Decode a 16-bit PCM WAV file to a float tensor.
///
/// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
///
/// When desired_channels is set and the input has fewer channels than requested,
/// the last channel is duplicated to reach the requested count; if the input has
/// more channels than requested, the extra channels are ignored.
///
/// If desired_samples is set, then the audio will be cropped or padded with zeroes
/// to the requested length.
///
/// The first output contains a Tensor with the content of the audio samples. The
/// lowest dimension will be the number of channels, and the second will be the
/// number of samples. For example, a ten-sample-long stereo WAV file should give an
/// output shape of [10, 2].
///
/// Args:
/// * scope: A Scope object
/// * contents: The WAV-encoded audio, usually from a file.
///
/// Optional attributes (see `Attrs`):
/// * desired_channels: Number of sample channels wanted.
/// * desired_samples: Length of audio requested.
///
/// Returns:
/// * `Output` audio: 2-D with shape `[length, channels]`.
/// * `Output` sample_rate: Scalar holding the sample rate found in the WAV header.
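///
/// Example usage (an illustrative sketch; `ReadFile` comes from another generated
/// header such as tensorflow/cc/ops/standard_ops.h, and the file path and sample
/// count are made-up placeholders):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Read the raw WAV bytes and decode them to mono, 16000 samples long.
///     auto wav_contents = ReadFile(root, std::string("speech.wav"));
///     auto decoded = DecodeWav(root, wav_contents,
///                              DecodeWav::DesiredChannels(1).DesiredSamples(16000));
///     // decoded.audio is a [16000, 1] float tensor; decoded.sample_rate is an
///     // int32 scalar taken from the WAV header.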
class DecodeWav {
 public:
  /// Optional attribute setters for DecodeWav
  struct Attrs {
    /// Number of sample channels wanted.
    ///
    /// Defaults to -1
    TF_MUST_USE_RESULT Attrs DesiredChannels(int64 x) {
      Attrs ret = *this;
      ret.desired_channels_ = x;
      return ret;
    }

    /// Length of audio requested.
    ///
    /// Defaults to -1
    TF_MUST_USE_RESULT Attrs DesiredSamples(int64 x) {
      Attrs ret = *this;
      ret.desired_samples_ = x;
      return ret;
    }

    int64 desired_channels_ = -1;
    int64 desired_samples_ = -1;
  };
  DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents);
  DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents,
            const DecodeWav::Attrs& attrs);

  static Attrs DesiredChannels(int64 x) {
    return Attrs().DesiredChannels(x);
  }
  static Attrs DesiredSamples(int64 x) {
    return Attrs().DesiredSamples(x);
  }

  Operation operation;
  ::tensorflow::Output audio;
  ::tensorflow::Output sample_rate;
};

/// Encode audio data using the WAV file format.
///
/// This operation will generate a string suitable to be saved out to create a .wav
/// audio file. It will be encoded in the 16-bit PCM format. It takes in float
/// values in the range -1.0f to 1.0f, and any values outside that range will be
/// clamped to it.
///
/// `audio` is a 2-D float Tensor of shape `[length, channels]`.
/// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
///
/// Args:
/// * scope: A Scope object
/// * audio: 2-D with shape `[length, channels]`.
/// * sample_rate: Scalar containing the sample frequency.
///
/// Returns:
/// * `Output`: 0-D. WAV-encoded file contents.
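///
/// Example usage (an illustrative sketch; `Const` and `WriteFile` come from other
/// generated headers such as tensorflow/cc/ops/standard_ops.h, and the literal
/// audio values are made up purely to show the expected shapes):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Two mono samples at a 16 kHz sample rate.
///     auto audio = Const(root, {{0.0f}, {0.5f}});    // float, shape [2, 1]
///     auto sample_rate = Const(root, 16000);         // int32 scalar
///     auto wav = EncodeWav(root, audio, sample_rate);
///     // `wav` is a scalar string tensor holding the encoded file contents,
///     // which could then be saved with WriteFile(root, std::string("out.wav"), wav).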
class EncodeWav {
 public:
  EncodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input audio,
            ::tensorflow::Input sample_rate);
  operator ::tensorflow::Output() const { return contents; }
  operator ::tensorflow::Input() const { return contents; }
  ::tensorflow::Node* node() const { return contents.node(); }

  Operation operation;
  ::tensorflow::Output contents;
};

/// Transforms a spectrogram into a form that's useful for speech recognition.
///
/// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
/// been effective as an input feature for machine learning. They are created by
/// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
/// higher frequencies that are less significant to the human ear. They have a long
/// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
/// is a good resource to learn more.
///
/// Args:
/// * scope: A Scope object
/// * spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
/// set to true.
/// * sample_rate: How many samples per second the source audio used.
///
/// Optional attributes (see `Attrs`):
/// * upper_frequency_limit: The highest frequency to use when calculating the
/// cepstrum.
/// * lower_frequency_limit: The lowest frequency to use when calculating the
/// cepstrum.
/// * filterbank_channel_count: Resolution of the Mel bank used internally.
/// * dct_coefficient_count: How many output channels to produce per time slice.
///
/// Returns:
/// * `Output`: The output tensor.
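///
/// Example usage (an illustrative sketch continuing the DecodeWav example above;
/// `ReadFile` comes from another generated header, and the file path, window,
/// stride, and coefficient count are arbitrary example values):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     auto wav_contents = ReadFile(root, std::string("speech.wav"));
///     auto decoded = DecodeWav(root, wav_contents, DecodeWav::DesiredChannels(1));
///     // Mfcc expects squared magnitudes, so set magnitude_squared to true.
///     auto spectrogram = AudioSpectrogram(root, decoded.audio, /*window_size=*/640,
///                                         /*stride=*/320,
///                                         AudioSpectrogram::MagnitudeSquared(true));
///     auto mfcc = Mfcc(root, spectrogram, decoded.sample_rate,
///                      Mfcc::DctCoefficientCount(13));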
class Mfcc {
 public:
  /// Optional attribute setters for Mfcc
  struct Attrs {
    /// The highest frequency to use when calculating the
    /// cepstrum.
    ///
    /// Defaults to 4000
    TF_MUST_USE_RESULT Attrs UpperFrequencyLimit(float x) {
      Attrs ret = *this;
      ret.upper_frequency_limit_ = x;
      return ret;
    }

    /// The lowest frequency to use when calculating the
    /// cepstrum.
    ///
    /// Defaults to 20
    TF_MUST_USE_RESULT Attrs LowerFrequencyLimit(float x) {
      Attrs ret = *this;
      ret.lower_frequency_limit_ = x;
      return ret;
    }

    /// Resolution of the Mel bank used internally.
    ///
    /// Defaults to 40
    TF_MUST_USE_RESULT Attrs FilterbankChannelCount(int64 x) {
      Attrs ret = *this;
      ret.filterbank_channel_count_ = x;
      return ret;
    }

    /// How many output channels to produce per time slice.
    ///
    /// Defaults to 13
    TF_MUST_USE_RESULT Attrs DctCoefficientCount(int64 x) {
      Attrs ret = *this;
      ret.dct_coefficient_count_ = x;
      return ret;
    }

    float upper_frequency_limit_ = 4000.0f;
    float lower_frequency_limit_ = 20.0f;
    int64 filterbank_channel_count_ = 40;
    int64 dct_coefficient_count_ = 13;
  };
  Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram,
       ::tensorflow::Input sample_rate);
  Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram,
       ::tensorflow::Input sample_rate, const Mfcc::Attrs& attrs);
  operator ::tensorflow::Output() const { return output; }
  operator ::tensorflow::Input() const { return output; }
  ::tensorflow::Node* node() const { return output.node(); }

  static Attrs UpperFrequencyLimit(float x) {
    return Attrs().UpperFrequencyLimit(x);
  }
  static Attrs LowerFrequencyLimit(float x) {
    return Attrs().LowerFrequencyLimit(x);
  }
  static Attrs FilterbankChannelCount(int64 x) {
    return Attrs().FilterbankChannelCount(x);
  }
  static Attrs DctCoefficientCount(int64 x) {
    return Attrs().DctCoefficientCount(x);
  }

  Operation operation;
  ::tensorflow::Output output;
};

/// @}

}  // namespace ops
}  // namespace tensorflow

#endif  // TENSORFLOW_CC_OPS_AUDIO_OPS_H_