1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16// This code resamples the FFT bins, and smooths then with triangle-shaped
17// weights to create a mel-frequency filter bank. For filter i centered at f_i,
18// there is a triangular weighting of the FFT bins that extends from
19// filter f_i-1 (with a value of zero at the left edge of the triangle) to f_i
20// (where the filter value is 1) to f_i+1 (where the filter values returns to
21// zero).
22
23// Note: this code fails if you ask for too many channels. The algorithm used
24// here assumes that each FFT bin contributes to at most two channels: the
25// right side of a triangle for channel i, and the left side of the triangle
26// for channel i+1. If you ask for so many channels that some of the
27// resulting mel triangle filters are smaller than a single FFT bin, these
28// channels may end up with no contributing FFT bins. The resulting mel
29// spectrum output will have some channels that are always zero.
30
31#include "tensorflow/core/kernels/mfcc_mel_filterbank.h"
32
33#include <math.h>
34
35#include "tensorflow/core/platform/logging.h"
36
37namespace tensorflow {
38
39MfccMelFilterbank::MfccMelFilterbank() : initialized_(false) {}
40
41bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate,
42 int output_channel_count,
43 double lower_frequency_limit,
44 double upper_frequency_limit) {
45 num_channels_ = output_channel_count;
46 sample_rate_ = input_sample_rate;
47 input_length_ = input_length;
48
49 if (num_channels_ < 1) {
50 LOG(ERROR) << "Number of filterbank channels must be positive.";
51 return false;
52 }
53
54 if (sample_rate_ <= 0) {
55 LOG(ERROR) << "Sample rate must be positive.";
56 return false;
57 }
58
59 if (input_length < 2) {
60 LOG(ERROR) << "Input length must greater than 1.";
61 return false;
62 }
63
64 if (lower_frequency_limit < 0) {
65 LOG(ERROR) << "Lower frequency limit must be nonnegative.";
66 return false;
67 }
68
69 if (upper_frequency_limit <= lower_frequency_limit) {
70 LOG(ERROR) << "Upper frequency limit must be greater than "
71 << "lower frequency limit.";
72 return false;
73 }
74
75 // An extra center frequency is computed at the top to get the upper
76 // limit on the high side of the final triangular filter.
77 center_frequencies_.resize(num_channels_ + 1);
78 const double mel_low = FreqToMel(lower_frequency_limit);
79 const double mel_hi = FreqToMel(upper_frequency_limit);
80 const double mel_span = mel_hi - mel_low;
81 const double mel_spacing = mel_span / static_cast<double>(num_channels_ + 1);
82 for (int i = 0; i < num_channels_ + 1; ++i) {
83 center_frequencies_[i] = mel_low + (mel_spacing * (i + 1));
84 }
85
86 // Always exclude DC; emulate HTK.
87 const double hz_per_sbin =
88 0.5 * sample_rate_ / static_cast<double>(input_length_ - 1);
89 start_index_ = static_cast<int>(1.5 + (lower_frequency_limit / hz_per_sbin));
90 end_index_ = static_cast<int>(upper_frequency_limit / hz_per_sbin);
91
92 // Maps the input spectrum bin indices to filter bank channels/indices. For
93 // each FFT bin, band_mapper tells us which channel this bin contributes to
94 // on the right side of the triangle. Thus this bin also contributes to the
95 // left side of the next channel's triangle response.
96 band_mapper_.resize(input_length_);
97 int channel = 0;
98 for (int i = 0; i < input_length_; ++i) {
99 double melf = FreqToMel(i * hz_per_sbin);
100 if ((i < start_index_) || (i > end_index_)) {
101 band_mapper_[i] = -2; // Indicate an unused Fourier coefficient.
102 } else {
103 while ((channel < num_channels_) &&
104 (center_frequencies_[channel] < melf)) {
105 ++channel;
106 }
107 band_mapper_[i] = channel - 1; // Can be == -1
108 }
109 }
110
111 // Create the weighting functions to taper the band edges. The contribution
112 // of any one FFT bin is based on its distance along the continuum between two
113 // mel-channel center frequencies. This bin contributes weights_[i] to the
114 // current channel and 1-weights_[i] to the next channel.
115 weights_.resize(input_length_);
116 for (int i = 0; i < input_length_; ++i) {
117 channel = band_mapper_[i];
118 if ((i < start_index_) || (i > end_index_)) {
119 weights_[i] = 0.0;
120 } else {
121 if (channel >= 0) {
122 weights_[i] =
123 (center_frequencies_[channel + 1] - FreqToMel(i * hz_per_sbin)) /
124 (center_frequencies_[channel + 1] - center_frequencies_[channel]);
125 } else {
126 weights_[i] = (center_frequencies_[0] - FreqToMel(i * hz_per_sbin)) /
127 (center_frequencies_[0] - mel_low);
128 }
129 }
130 }
131 // Check the sum of FFT bin weights for every mel band to identify
132 // situations where the mel bands are so narrow that they don't get
133 // significant weight on enough (or any) FFT bins -- i.e., too many
134 // mel bands have been requested for the given FFT size.
135 std::vector<int> bad_channels;
136 for (int c = 0; c < num_channels_; ++c) {
137 float band_weights_sum = 0.0;
138 for (int i = 0; i < input_length_; ++i) {
139 if (band_mapper_[i] == c - 1) {
140 band_weights_sum += (1.0 - weights_[i]);
141 } else if (band_mapper_[i] == c) {
142 band_weights_sum += weights_[i];
143 }
144 }
145 // The lowest mel channels have the fewest FFT bins and the lowest
146 // weights sum. But given that the target gain at the center frequency
147 // is 1.0, if the total sum of weights is 0.5, we're in bad shape.
148 if (band_weights_sum < 0.5) {
149 bad_channels.push_back(c);
150 }
151 }
152 if (!bad_channels.empty()) {
153 LOG(ERROR) << "Missing " << bad_channels.size() << " bands "
154 << " starting at " << bad_channels[0]
155 << " in mel-frequency design. "
156 << "Perhaps too many channels or "
157 << "not enough frequency resolution in spectrum. ("
158 << "input_length: " << input_length
159 << " input_sample_rate: " << input_sample_rate
160 << " output_channel_count: " << output_channel_count
161 << " lower_frequency_limit: " << lower_frequency_limit
162 << " upper_frequency_limit: " << upper_frequency_limit;
163 }
164 initialized_ = true;
165 return true;
166}
167
168// Compute the mel spectrum from the squared-magnitude FFT input by taking the
169// square root, then summing FFT magnitudes under triangular integration windows
170// whose widths increase with frequency.
171void MfccMelFilterbank::Compute(const std::vector<double> &input,
172 std::vector<double> *output) const {
173 if (!initialized_) {
174 LOG(ERROR) << "Mel Filterbank not initialized.";
175 return;
176 }
177
178 if (input.size() <= end_index_) {
179 LOG(ERROR) << "Input too short to compute filterbank";
180 return;
181 }
182
183 // Ensure output is right length and reset all values.
184 output->assign(num_channels_, 0.0);
185
186 for (int i = start_index_; i <= end_index_; i++) { // For each FFT bin
187 double spec_val = sqrt(input[i]);
188 double weighted = spec_val * weights_[i];
189 int channel = band_mapper_[i];
190 if (channel >= 0)
191 (*output)[channel] += weighted; // Right side of triangle, downward slope
192 channel++;
193 if (channel < num_channels_)
194 (*output)[channel] += spec_val - weighted; // Left side of triangle
195 }
196}
197
198double MfccMelFilterbank::FreqToMel(double freq) const {
199 return 1127.0 * log1p(freq / 700.0);
200}
201
202} // namespace tensorflow
203