1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // This code resamples the FFT bins, and smooths then with triangle-shaped |
17 | // weights to create a mel-frequency filter bank. For filter i centered at f_i, |
18 | // there is a triangular weighting of the FFT bins that extends from |
19 | // filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i |
20 | // (where the filter value is 1) to f_i+1 (where the filter values returns to |
21 | // zero). |
22 | |
23 | // Note: this code fails if you ask for too many channels. The algorithm used |
24 | // here assumes that each FFT bin contributes to at most two channels: the |
25 | // right side of a triangle for channel i, and the left side of the triangle |
26 | // for channel i+1. If you ask for so many channels that some of the |
27 | // resulting mel triangle filters are smaller than a single FFT bin, these |
28 | // channels may end up with no contributing FFT bins. The resulting mel |
29 | // spectrum output will have some channels that are always zero. |
30 | |
31 | #include "tensorflow/core/kernels/mfcc_mel_filterbank.h" |
32 | |
33 | #include <math.h> |
34 | |
35 | #include "tensorflow/core/platform/logging.h" |
36 | |
37 | namespace tensorflow { |
38 | |
39 | MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {} |
40 | |
41 | bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, |
42 | int output_channel_count, |
43 | double lower_frequency_limit, |
44 | double upper_frequency_limit) { |
45 | num_channels_ = output_channel_count; |
46 | sample_rate_ = input_sample_rate; |
47 | input_length_ = input_length; |
48 | |
49 | if (num_channels_ < 1) { |
50 | LOG(ERROR) << "Number of filterbank channels must be positive." ; |
51 | return false; |
52 | } |
53 | |
54 | if (sample_rate_ <= 0) { |
55 | LOG(ERROR) << "Sample rate must be positive." ; |
56 | return false; |
57 | } |
58 | |
59 | if (input_length < 2) { |
60 | LOG(ERROR) << "Input length must greater than 1." ; |
61 | return false; |
62 | } |
63 | |
64 | if (lower_frequency_limit < 0) { |
65 | LOG(ERROR) << "Lower frequency limit must be nonnegative." ; |
66 | return false; |
67 | } |
68 | |
69 | if (upper_frequency_limit <= lower_frequency_limit) { |
70 | LOG(ERROR) << "Upper frequency limit must be greater than " |
71 | << "lower frequency limit." ; |
72 | return false; |
73 | } |
74 | |
75 | // An extra center frequency is computed at the top to get the upper |
76 | // limit on the high side of the final triangular filter. |
77 | center_frequencies_.resize(num_channels_ + 1); |
78 | const double mel_low = FreqToMel(lower_frequency_limit); |
79 | const double mel_hi = FreqToMel(upper_frequency_limit); |
80 | const double mel_span = mel_hi - mel_low; |
81 | const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1); |
82 | for (int i = 0; i < num_channels_ + 1; ++i) { |
83 | center_frequencies_[i] = mel_low + (mel_spacing * (i + 1)); |
84 | } |
85 | |
86 | // Always exclude DC; emulate HTK. |
87 | const double hz_per_sbin = |
88 | 0.5 * sample_rate_ / static_cast<double>(input_length_ - 1); |
89 | start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin)); |
90 | end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin); |
91 | |
92 | // Maps the input spectrum bin indices to filter bank channels/indices. For |
93 | // each FFT bin, band_mapper tells us which channel this bin contributes to |
94 | // on the right side of the triangle. Thus this bin also contributes to the |
95 | // left side of the next channel's triangle response. |
96 | band_mapper_.resize(input_length_); |
97 | int channel = 0; |
98 | for (int i = 0; i < input_length_; ++i) { |
99 | double melf = FreqToMel(i * hz_per_sbin); |
100 | if ((i < start_index_) || (i > end_index_)) { |
101 | band_mapper_[i] = -2; // Indicate an unused Fourier coefficient. |
102 | } else { |
103 | while ((channel < num_channels_) && |
104 | (center_frequencies_[channel] < melf)) { |
105 | ++channel; |
106 | } |
107 | band_mapper_[i] = channel - 1; // Can be == -1 |
108 | } |
109 | } |
110 | |
111 | // Create the weighting functions to taper the band edges. The contribution |
112 | // of any one FFT bin is based on its distance along the continuum between two |
113 | // mel-channel center frequencies. This bin contributes weights_[i] to the |
114 | // current channel and 1-weights_[i] to the next channel. |
115 | weights_.resize(input_length_); |
116 | for (int i = 0; i < input_length_; ++i) { |
117 | channel = band_mapper_[i]; |
118 | if ((i < start_index_) || (i > end_index_)) { |
119 | weights_[i] = 0.0; |
120 | } else { |
121 | if (channel >= 0) { |
122 | weights_[i] = |
123 | (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) / |
124 | (center_frequencies_[channel + 1] - center_frequencies_[channel]); |
125 | } else { |
126 | weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) / |
127 | (center_frequencies_[0] - mel_low); |
128 | } |
129 | } |
130 | } |
131 | // Check the sum of FFT bin weights for every mel band to identify |
132 | // situations where the mel bands are so narrow that they don't get |
133 | // significant weight on enough (or any) FFT bins -- i.e., too many |
134 | // mel bands have been requested for the given FFT size. |
135 | std::vector<int> bad_channels; |
136 | for (int c = 0; c < num_channels_; ++c) { |
137 | float band_weights_sum = 0.0; |
138 | for (int i = 0; i < input_length_; ++i) { |
139 | if (band_mapper_[i] == c - 1) { |
140 | band_weights_sum += (1.0 - weights_[i]); |
141 | } else if (band_mapper_[i] == c) { |
142 | band_weights_sum += weights_[i]; |
143 | } |
144 | } |
145 | // The lowest mel channels have the fewest FFT bins and the lowest |
146 | // weights sum. But given that the target gain at the center frequency |
147 | // is 1.0, if the total sum of weights is 0.5, we're in bad shape. |
148 | if (band_weights_sum < 0.5) { |
149 | bad_channels.push_back(c); |
150 | } |
151 | } |
152 | if (!bad_channels.empty()) { |
153 | LOG(ERROR) << "Missing " << bad_channels.size() << " bands " |
154 | << " starting at " << bad_channels[0] |
155 | << " in mel-frequency design. " |
156 | << "Perhaps too many channels or " |
157 | << "not enough frequency resolution in spectrum. (" |
158 | << "input_length: " << input_length |
159 | << " input_sample_rate: " << input_sample_rate |
160 | << " output_channel_count: " << output_channel_count |
161 | << " lower_frequency_limit: " << lower_frequency_limit |
162 | << " upper_frequency_limit: " << upper_frequency_limit; |
163 | } |
164 | initialized_ = true; |
165 | return true; |
166 | } |
167 | |
168 | // Compute the mel spectrum from the squared-magnitude FFT input by taking the |
169 | // square root, then summing FFT magnitudes under triangular integration windows |
170 | // whose widths increase with frequency. |
171 | void MfccMelFilterbank::Compute(const std::vector<double> &input, |
172 | std::vector<double> *output) const { |
173 | if (!initialized_) { |
174 | LOG(ERROR) << "Mel Filterbank not initialized." ; |
175 | return; |
176 | } |
177 | |
178 | if (input.size() <= end_index_) { |
179 | LOG(ERROR) << "Input too short to compute filterbank" ; |
180 | return; |
181 | } |
182 | |
183 | // Ensure output is right length and reset all values. |
184 | output->assign(num_channels_, 0.0); |
185 | |
186 | for (int i = start_index_; i <= end_index_; i++) { // For each FFT bin |
187 | double spec_val = sqrt(input[i]); |
188 | double weighted = spec_val * weights_[i]; |
189 | int channel = band_mapper_[i]; |
190 | if (channel >= 0) |
191 | (*output)[channel] += weighted; // Right side of triangle, downward slope |
192 | channel++; |
193 | if (channel < num_channels_) |
194 | (*output)[channel] += spec_val - weighted; // Left side of triangle |
195 | } |
196 | } |
197 | |
198 | double MfccMelFilterbank::FreqToMel(double freq) const { |
199 | return 1127.0 * log1p(freq / 700.0); |
200 | } |
201 | |
202 | } // namespace tensorflow |
203 | |