1 | // This file is MACHINE GENERATED! Do not edit. |
2 | |
3 | #ifndef TENSORFLOW_CC_OPS_AUDIO_OPS_H_ |
4 | #define TENSORFLOW_CC_OPS_AUDIO_OPS_H_ |
5 | |
6 | // This file is MACHINE GENERATED! Do not edit. |
7 | |
8 | #include "tensorflow/cc/framework/ops.h" |
9 | #include "tensorflow/cc/framework/scope.h" |
10 | #include "tensorflow/core/framework/tensor.h" |
11 | #include "tensorflow/core/framework/tensor_shape.h" |
12 | #include "tensorflow/core/framework/types.h" |
13 | #include "tensorflow/core/lib/gtl/array_slice.h" |
14 | |
15 | namespace tensorflow { |
16 | namespace ops { |
17 | |
18 | /// @defgroup audio_ops Audio Ops |
19 | /// @{ |
20 | |
21 | /// Produces a visualization of audio data over time. |
22 | /// |
23 | /// Spectrograms are a standard way of representing audio information as a series of |
24 | /// slices of frequency information, one slice for each window of time. By joining |
25 | /// these together into a sequence, they form a distinctive fingerprint of the sound |
26 | /// over time. |
27 | /// |
28 | /// This op expects to receive audio data as an input, stored as floats in the range |
29 | /// -1 to 1, together with a window width in samples, and a stride specifying how |
30 | /// far to move the window between slices. From this it generates a three |
31 | /// dimensional output. The first dimension is for the channels in the input, so a |
32 | /// stereo audio input would have two here for example. The second dimension is time, |
33 | /// with successive frequency slices. The third dimension has an amplitude value for |
34 | /// each frequency during that time slice. |
35 | /// |
36 | /// This means the layout when converted and saved as an image is rotated 90 degrees |
37 | /// clockwise from a typical spectrogram. Time is descending down the Y axis, and |
38 | /// the frequency decreases from left to right. |
39 | /// |
40 | /// Each value in the result represents the square root of the sum of the real and |
41 | /// imaginary parts of an FFT on the current window of samples. In this way, the |
42 | /// lowest dimension represents the power of each frequency in the current window, |
43 | /// and adjacent windows are concatenated in the next dimension. |
44 | /// |
45 | /// To get a more intuitive and visual look at what this operation does, you can run |
46 | /// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the |
47 | /// resulting spectrogram as a PNG image. |
48 | /// |
49 | /// Args: |
50 | /// * scope: A Scope object |
51 | /// * input: Float representation of audio data. |
52 | /// * window_size: How wide the input window is in samples. For the highest efficiency |
53 | /// this should be a power of two, but other values are accepted. |
54 | /// * stride: How widely apart the center of adjacent sample windows should be. |
55 | /// |
56 | /// Optional attributes (see `Attrs`): |
57 | /// * magnitude_squared: Whether to return the squared magnitude or just the |
58 | /// magnitude. Using squared magnitude can avoid extra calculations. |
59 | /// |
60 | /// Returns: |
61 | /// * `Output`: 3D representation of the audio frequencies as an image. |
62 | class AudioSpectrogram { |
63 | public: |
64 | /// Optional attribute setters for AudioSpectrogram |
65 | struct Attrs { |
66 | /// Whether to return the squared magnitude or just the |
67 | /// magnitude. Using squared magnitude can avoid extra calculations. |
68 | /// |
69 | /// Defaults to false |
70 | TF_MUST_USE_RESULT Attrs MagnitudeSquared(bool x) { |
71 | Attrs ret = *this; |
72 | ret.magnitude_squared_ = x; |
73 | return ret; |
74 | } |
75 | |
76 | bool magnitude_squared_ = false; |
77 | }; |
78 | AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input, |
79 | int64 window_size, int64 stride); |
80 | AudioSpectrogram(const ::tensorflow::Scope& scope, ::tensorflow::Input input, |
81 | int64 window_size, int64 stride, const |
82 | AudioSpectrogram::Attrs& attrs); |
83 | operator ::tensorflow::Output() const { return spectrogram; } |
84 | operator ::tensorflow::Input() const { return spectrogram; } |
85 | ::tensorflow::Node* node() const { return spectrogram.node(); } |
86 | |
87 | static Attrs MagnitudeSquared(bool x) { |
88 | return Attrs().MagnitudeSquared(x); |
89 | } |
90 | |
91 | Operation operation; |
92 | ::tensorflow::Output spectrogram; |
93 | }; |
94 | |
95 | /// Decode a 16-bit PCM WAV file to a float tensor. |
96 | /// |
97 | /// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float. |
98 | /// |
99 | /// When desired_channels is set, if the input contains fewer channels than this |
100 | /// then the last channel will be duplicated to give the requested number, else if |
101 | /// the input has more channels than requested then the additional channels will be |
102 | /// ignored. |
103 | /// |
104 | /// If desired_samples is set, then the audio will be cropped or padded with zeroes |
105 | /// to the requested length. |
106 | /// |
107 | /// The first output contains a Tensor with the content of the audio samples. The |
108 | /// lowest dimension will be the number of channels, and the second will be the |
109 | /// number of samples. For example, a ten-sample-long stereo WAV file should give an |
110 | /// output shape of [10, 2]. |
111 | /// |
112 | /// Args: |
113 | /// * scope: A Scope object |
114 | /// * contents: The WAV-encoded audio, usually from a file. |
115 | /// |
116 | /// Optional attributes (see `Attrs`): |
117 | /// * desired_channels: Number of sample channels wanted. |
118 | /// * desired_samples: Length of audio requested. |
119 | /// |
120 | /// Returns: |
121 | /// * `Output` audio: 2-D with shape `[length, channels]`. |
122 | /// * `Output` sample_rate: Scalar holding the sample rate found in the WAV header. |
123 | class DecodeWav { |
124 | public: |
125 | /// Optional attribute setters for DecodeWav |
126 | struct Attrs { |
127 | /// Number of sample channels wanted. |
128 | /// |
129 | /// Defaults to -1 |
130 | TF_MUST_USE_RESULT Attrs DesiredChannels(int64 x) { |
131 | Attrs ret = *this; |
132 | ret.desired_channels_ = x; |
133 | return ret; |
134 | } |
135 | |
136 | /// Length of audio requested. |
137 | /// |
138 | /// Defaults to -1 |
139 | TF_MUST_USE_RESULT Attrs DesiredSamples(int64 x) { |
140 | Attrs ret = *this; |
141 | ret.desired_samples_ = x; |
142 | return ret; |
143 | } |
144 | |
145 | int64 desired_channels_ = -1; |
146 | int64 desired_samples_ = -1; |
147 | }; |
148 | DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents); |
149 | DecodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input contents, const |
150 | DecodeWav::Attrs& attrs); |
151 | |
152 | static Attrs DesiredChannels(int64 x) { |
153 | return Attrs().DesiredChannels(x); |
154 | } |
155 | static Attrs DesiredSamples(int64 x) { |
156 | return Attrs().DesiredSamples(x); |
157 | } |
158 | |
159 | Operation operation; |
160 | ::tensorflow::Output audio; |
161 | ::tensorflow::Output sample_rate; |
162 | }; |
163 | |
164 | /// Encode audio data using the WAV file format. |
165 | /// |
166 | /// This operation will generate a string suitable to be saved out to create a .wav |
167 | /// audio file. It will be encoded in the 16-bit PCM format. It takes in float |
168 | /// values in the range -1.0f to 1.0f, and any outside that value will be clamped to |
169 | /// that range. |
170 | /// |
171 | /// `audio` is a 2-D float Tensor of shape `[length, channels]`. |
172 | /// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100). |
173 | /// |
174 | /// Args: |
175 | /// * scope: A Scope object |
176 | /// * audio: 2-D with shape `[length, channels]`. |
177 | /// * sample_rate: Scalar containing the sample frequency. |
178 | /// |
179 | /// Returns: |
180 | /// * `Output`: 0-D. WAV-encoded file contents. |
181 | class EncodeWav { |
182 | public: |
183 | EncodeWav(const ::tensorflow::Scope& scope, ::tensorflow::Input audio, |
184 | ::tensorflow::Input sample_rate); |
185 | operator ::tensorflow::Output() const { return contents; } |
186 | operator ::tensorflow::Input() const { return contents; } |
187 | ::tensorflow::Node* node() const { return contents.node(); } |
188 | |
189 | Operation operation; |
190 | ::tensorflow::Output contents; |
191 | }; |
192 | |
193 | /// Transforms a spectrogram into a form that's useful for speech recognition. |
194 | /// |
195 | /// Mel Frequency Cepstral Coefficients are a way of representing audio data that's |
196 | /// been effective as an input feature for machine learning. They are created by |
197 | /// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the |
198 | /// higher frequencies that are less significant to the human ear. They have a long |
199 | /// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum |
200 | /// is a good resource to learn more. |
201 | /// |
202 | /// Args: |
203 | /// * scope: A Scope object |
204 | /// * spectrogram: Typically produced by the Spectrogram op, with magnitude_squared |
205 | /// set to true. |
206 | /// * sample_rate: How many samples per second the source audio used. |
207 | /// |
208 | /// Optional attributes (see `Attrs`): |
209 | /// * upper_frequency_limit: The highest frequency to use when calculating the |
210 | /// ceptstrum. |
211 | /// * lower_frequency_limit: The lowest frequency to use when calculating the |
212 | /// ceptstrum. |
213 | /// * filterbank_channel_count: Resolution of the Mel bank used internally. |
214 | /// * dct_coefficient_count: How many output channels to produce per time slice. |
215 | /// |
216 | /// Returns: |
217 | /// * `Output`: The output tensor. |
218 | class Mfcc { |
219 | public: |
220 | /// Optional attribute setters for Mfcc |
221 | struct Attrs { |
222 | /// The highest frequency to use when calculating the |
223 | /// ceptstrum. |
224 | /// |
225 | /// Defaults to 4000 |
226 | TF_MUST_USE_RESULT Attrs UpperFrequencyLimit(float x) { |
227 | Attrs ret = *this; |
228 | ret.upper_frequency_limit_ = x; |
229 | return ret; |
230 | } |
231 | |
232 | /// The lowest frequency to use when calculating the |
233 | /// ceptstrum. |
234 | /// |
235 | /// Defaults to 20 |
236 | TF_MUST_USE_RESULT Attrs LowerFrequencyLimit(float x) { |
237 | Attrs ret = *this; |
238 | ret.lower_frequency_limit_ = x; |
239 | return ret; |
240 | } |
241 | |
242 | /// Resolution of the Mel bank used internally. |
243 | /// |
244 | /// Defaults to 40 |
245 | TF_MUST_USE_RESULT Attrs FilterbankChannelCount(int64 x) { |
246 | Attrs ret = *this; |
247 | ret.filterbank_channel_count_ = x; |
248 | return ret; |
249 | } |
250 | |
251 | /// How many output channels to produce per time slice. |
252 | /// |
253 | /// Defaults to 13 |
254 | TF_MUST_USE_RESULT Attrs DctCoefficientCount(int64 x) { |
255 | Attrs ret = *this; |
256 | ret.dct_coefficient_count_ = x; |
257 | return ret; |
258 | } |
259 | |
260 | float upper_frequency_limit_ = 4000.0f; |
261 | float lower_frequency_limit_ = 20.0f; |
262 | int64 filterbank_channel_count_ = 40; |
263 | int64 dct_coefficient_count_ = 13; |
264 | }; |
265 | Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram, |
266 | ::tensorflow::Input sample_rate); |
267 | Mfcc(const ::tensorflow::Scope& scope, ::tensorflow::Input spectrogram, |
268 | ::tensorflow::Input sample_rate, const Mfcc::Attrs& attrs); |
269 | operator ::tensorflow::Output() const { return output; } |
270 | operator ::tensorflow::Input() const { return output; } |
271 | ::tensorflow::Node* node() const { return output.node(); } |
272 | |
273 | static Attrs UpperFrequencyLimit(float x) { |
274 | return Attrs().UpperFrequencyLimit(x); |
275 | } |
276 | static Attrs LowerFrequencyLimit(float x) { |
277 | return Attrs().LowerFrequencyLimit(x); |
278 | } |
279 | static Attrs FilterbankChannelCount(int64 x) { |
280 | return Attrs().FilterbankChannelCount(x); |
281 | } |
282 | static Attrs DctCoefficientCount(int64 x) { |
283 | return Attrs().DctCoefficientCount(x); |
284 | } |
285 | |
286 | Operation operation; |
287 | ::tensorflow::Output output; |
288 | }; |
289 | |
290 | /// @} |
291 | |
292 | } // namespace ops |
293 | } // namespace tensorflow |
294 | |
295 | #endif // TENSORFLOW_CC_OPS_AUDIO_OPS_H_ |
296 | |