// This file is MACHINE GENERATED! Do not edit.

#ifndef TENSORFLOW_CC_OPS_AUDIO_OPS_H_
#define TENSORFLOW_CC_OPS_AUDIO_OPS_H_

// This file is MACHINE GENERATED! Do not edit.

#include "tensorflow/cc/framework/ops.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"

namespace tensorflow {
namespace ops {

/// @defgroup audio_ops Audio Ops
/// @{

/// Produces a visualization of audio data over time.
///
/// Spectrograms are a standard way of representing audio information as a series of
/// slices of frequency information, one slice for each window of time. By joining
/// these together into a sequence, they form a distinctive fingerprint of the sound
/// over time.
///
/// This op expects to receive audio data as an input, stored as floats in the range
/// -1 to 1, together with a window width in samples, and a stride specifying how
/// far to move the window between slices. From this it generates a three
/// dimensional output. The first dimension is for the channels in the input, so a
/// stereo audio input would have two here for example. The second dimension is time,
/// with successive frequency slices. The third dimension has an amplitude value for
/// each frequency during that time slice.
///
/// This means the layout when converted and saved as an image is rotated 90 degrees
/// clockwise from a typical spectrogram. Time is descending down the Y axis, and
/// the frequency decreases from left to right.
///
/// Each value in the result represents the square root of the sum of the squares of
/// the real and imaginary parts of an FFT on the current window of samples. In this
/// way, the lowest dimension represents the power of each frequency in the current
/// window, and adjacent windows are concatenated in the next dimension.
///
/// To get a more intuitive and visual look at what this operation does, you can run
/// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
/// resulting spectrogram as a PNG image.
///
/// Args:
/// * scope: A Scope object
/// * input: Float representation of audio data.
/// * window_size: How wide the input window is in samples. For the highest efficiency
/// this should be a power of two, but other values are accepted.
/// * stride: How widely apart the center of adjacent sample windows should be.
///
/// Optional attributes (see `Attrs`):
/// * magnitude_squared: Whether to return the squared magnitude or just the
/// magnitude. Using squared magnitude can avoid extra calculations.
///
/// Returns:
/// * `Output`: 3D representation of the audio frequencies as an image.
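///
/// Example usage (an illustrative graph-construction sketch only; `Placeholder`
/// comes from another generated header such as tensorflow/cc/ops/standard_ops.h,
/// and the window and stride values are arbitrary):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Audio samples in [-1, 1], shaped [samples, channels].
///     auto waveform = Placeholder(root, DT_FLOAT);
///     // 1024-sample windows, moved forward 512 samples per slice.
///     auto spectrogram = AudioSpectrogram(root, waveform, /*window_size=*/1024,
///                                         /*stride=*/512,
///                                         AudioSpectrogram::MagnitudeSquared(true));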
class AudioSpectrogram {
 public:
  /// Optional attribute setters for AudioSpectrogram
  struct Attrs {
    /// Whether to return the squared magnitude or just the
    /// magnitude. Using squared magnitude can avoid extra calculations.
    ///
    /// Defaults to false
    TF_MUST_USE_RESULT Attrs MagnitudeSquared(bool x) {
      Attrs ret = *this;
      ret.magnitude_squared_ = x;
      return ret;
    }

    bool magnitude_squared_ = false;
  };
  AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input,
                   int64 window_size, int64 stride);
  AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input,
                   int64 window_size, int64 stride,
                   const AudioSpectrogram::Attrs& attrs);
  operator ::tensorflow::Output() const { return spectrogram; }
  operator ::tensorflow::Input() const { return spectrogram; }
  ::tensorflow::Node* node() const { return spectrogram.node(); }

  static Attrs MagnitudeSquared(bool x) {
    return Attrs().MagnitudeSquared(x);
  }

  Operation operation;
  ::tensorflow::Output spectrogram;
};

/// Decode a 16-bit PCM WAV file to a float tensor.
///
/// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
///
/// When desired_channels is set and the input has fewer channels than requested,
/// the last channel is duplicated to reach the requested count; if the input has
/// more channels than requested, the extra channels are ignored.
///
/// If desired_samples is set, then the audio will be cropped or padded with zeroes
/// to the requested length.
///
/// The first output contains a Tensor with the content of the audio samples. The
/// lowest dimension will be the number of channels, and the second will be the
/// number of samples. For example, a ten-sample-long stereo WAV file should give an
/// output shape of [10, 2].
///
/// Args:
/// * scope: A Scope object
/// * contents: The WAV-encoded audio, usually from a file.
///
/// Optional attributes (see `Attrs`):
/// * desired_channels: Number of sample channels wanted.
/// * desired_samples: Length of audio requested.
///
/// Returns:
/// * `Output` audio: 2-D with shape `[length, channels]`.
/// * `Output` sample_rate: Scalar holding the sample rate found in the WAV header.
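///
/// Example usage (an illustrative sketch; `ReadFile` comes from another generated
/// header such as tensorflow/cc/ops/standard_ops.h, and the file path and sample
/// count are made-up placeholders):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Read the raw WAV bytes and decode them to mono, 16000 samples long.
///     auto wav_contents = ReadFile(root, std::string("speech.wav"));
///     auto decoded = DecodeWav(root, wav_contents,
///                              DecodeWav::DesiredChannels(1).DesiredSamples(16000));
///     // decoded.audio is a [16000, 1] float tensor; decoded.sample_rate is an
///     // int32 scalar taken from the WAV header.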
class DecodeWav {
 public:
  /// Optional attribute setters for DecodeWav
  struct Attrs {
    /// Number of sample channels wanted.
    ///
    /// Defaults to -1
    TF_MUST_USE_RESULT Attrs DesiredChannels(int64 x) {
      Attrs ret = *this;
      ret.desired_channels_ = x;
      return ret;
    }

    /// Length of audio requested.
    ///
    /// Defaults to -1
    TF_MUST_USE_RESULT Attrs DesiredSamples(int64 x) {
      Attrs ret = *this;
      ret.desired_samples_ = x;
      return ret;
    }

    int64 desired_channels_ = -1;
    int64 desired_samples_ = -1;
  };
  DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents);
  DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents,
            const DecodeWav::Attrs& attrs);

  static Attrs DesiredChannels(int64 x) {
    return Attrs().DesiredChannels(x);
  }
  static Attrs DesiredSamples(int64 x) {
    return Attrs().DesiredSamples(x);
  }

  Operation operation;
  ::tensorflow::Output audio;
  ::tensorflow::Output sample_rate;
};

/// Encode audio data using the WAV file format.
///
/// This operation will generate a string suitable to be saved out to create a .wav
/// audio file. It will be encoded in the 16-bit PCM format. It takes in float
/// values in the range -1.0f to 1.0f, and any values outside that range will be
/// clamped to it.
///
/// `audio` is a 2-D float Tensor of shape `[length, channels]`.
/// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
///
/// Args:
/// * scope: A Scope object
/// * audio: 2-D with shape `[length, channels]`.
/// * sample_rate: Scalar containing the sample frequency.
///
/// Returns:
/// * `Output`: 0-D. WAV-encoded file contents.
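///
/// Example usage (an illustrative sketch; `Const` and `WriteFile` come from other
/// generated headers such as tensorflow/cc/ops/standard_ops.h, and the literal
/// audio values are made up purely to show the expected shapes):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     // Two mono samples at a 16 kHz sample rate.
///     auto audio = Const(root, {{0.0f}, {0.5f}});    // float, shape [2, 1]
///     auto sample_rate = Const(root, 16000);         // int32 scalar
///     auto wav = EncodeWav(root, audio, sample_rate);
///     // `wav` is a scalar string tensor holding the encoded file contents,
///     // which could then be saved with WriteFile(root, std::string("out.wav"), wav).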
class EncodeWav {
 public:
  EncodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input audio,
            ::tensorflow::Input sample_rate);
  operator ::tensorflow::Output() const { return contents; }
  operator ::tensorflow::Input() const { return contents; }
  ::tensorflow::Node* node() const { return contents.node(); }

  Operation operation;
  ::tensorflow::Output contents;
};

/// Transforms a spectrogram into a form that's useful for speech recognition.
///
/// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
/// been effective as an input feature for machine learning. They are created by
/// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
/// higher frequencies that are less significant to the human ear. They have a long
/// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
/// is a good resource to learn more.
///
/// Args:
/// * scope: A Scope object
/// * spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
/// set to true.
/// * sample_rate: How many samples per second the source audio used.
///
/// Optional attributes (see `Attrs`):
/// * upper_frequency_limit: The highest frequency to use when calculating the
/// cepstrum.
/// * lower_frequency_limit: The lowest frequency to use when calculating the
/// cepstrum.
/// * filterbank_channel_count: Resolution of the Mel bank used internally.
/// * dct_coefficient_count: How many output channels to produce per time slice.
///
/// Returns:
/// * `Output`: The output tensor.
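///
/// Example usage (an illustrative sketch continuing the DecodeWav example above;
/// `ReadFile` comes from another generated header, and the file path, window,
/// stride, and coefficient count are arbitrary example values):
///
///     using namespace ::tensorflow;
///     using namespace ::tensorflow::ops;
///
///     Scope root = Scope::NewRootScope();
///     auto wav_contents = ReadFile(root, std::string("speech.wav"));
///     auto decoded = DecodeWav(root, wav_contents, DecodeWav::DesiredChannels(1));
///     // Mfcc expects squared magnitudes, so set magnitude_squared to true.
///     auto spectrogram = AudioSpectrogram(root, decoded.audio, /*window_size=*/640,
///                                         /*stride=*/320,
///                                         AudioSpectrogram::MagnitudeSquared(true));
///     auto mfcc = Mfcc(root, spectrogram, decoded.sample_rate,
///                      Mfcc::DctCoefficientCount(13));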
class Mfcc {
 public:
  /// Optional attribute setters for Mfcc
  struct Attrs {
    /// The highest frequency to use when calculating the
    /// cepstrum.
    ///
    /// Defaults to 4000
    TF_MUST_USE_RESULT Attrs UpperFrequencyLimit(float x) {
      Attrs ret = *this;
      ret.upper_frequency_limit_ = x;
      return ret;
    }

    /// The lowest frequency to use when calculating the
    /// cepstrum.
    ///
    /// Defaults to 20
    TF_MUST_USE_RESULT Attrs LowerFrequencyLimit(float x) {
      Attrs ret = *this;
      ret.lower_frequency_limit_ = x;
      return ret;
    }

    /// Resolution of the Mel bank used internally.
    ///
    /// Defaults to 40
    TF_MUST_USE_RESULT Attrs FilterbankChannelCount(int64 x) {
      Attrs ret = *this;
      ret.filterbank_channel_count_ = x;
      return ret;
    }

    /// How many output channels to produce per time slice.
    ///
    /// Defaults to 13
    TF_MUST_USE_RESULT Attrs DctCoefficientCount(int64 x) {
      Attrs ret = *this;
      ret.dct_coefficient_count_ = x;
      return ret;
    }

    float upper_frequency_limit_ = 4000.0f;
    float lower_frequency_limit_ = 20.0f;
    int64 filterbank_channel_count_ = 40;
    int64 dct_coefficient_count_ = 13;
  };
  Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram,
       ::tensorflow::Input sample_rate);
  Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram,
       ::tensorflow::Input sample_rate, const Mfcc::Attrs& attrs);
  operator ::tensorflow::Output() const { return output; }
  operator ::tensorflow::Input() const { return output; }
  ::tensorflow::Node* node() const { return output.node(); }

  static Attrs UpperFrequencyLimit(float x) {
    return Attrs().UpperFrequencyLimit(x);
  }
  static Attrs LowerFrequencyLimit(float x) {
    return Attrs().LowerFrequencyLimit(x);
  }
  static Attrs FilterbankChannelCount(int64 x) {
    return Attrs().FilterbankChannelCount(x);
  }
  static Attrs DctCoefficientCount(int64 x) {
    return Attrs().DctCoefficientCount(x);
  }

  Operation operation;
  ::tensorflow::Output output;
};

/// @}

}  // namespace ops
}  // namespace tensorflow

#endif  // TENSORFLOW_CC_OPS_AUDIO_OPS_H_