# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import warnings

from ...configuration_utils import ConfigMixin, register_to_config
from ...schedulers.scheduling_utils import SchedulerMixin


warnings.filterwarnings("ignore")

import numpy as np  # noqa: E402

import librosa  # noqa: E402
from PIL import Image  # noqa: E402


class Mel(ConfigMixin, SchedulerMixin):
    """
    Converts audio into mel spectrogram images and back.

    Parameters:
        x_res (`int`): x resolution of spectrogram (time)
        y_res (`int`): y resolution of spectrogram (frequency bins)
        sample_rate (`int`): sample rate of audio
        n_fft (`int`): length of the FFT window
        hop_length (`int`): hop length (a higher number is recommended for lower than 256 y_res)
        top_db (`int`): loudest decibel value (dynamic range of the spectrogram)
        n_iter (`int`): number of iterations for Griffin-Lim mel inversion
    """

    config_name = "mel_config.json"

    @register_to_config
    def __init__(
        self,
        x_res: int = 256,
        y_res: int = 256,
        sample_rate: int = 22050,
        n_fft: int = 2048,
        hop_length: int = 512,
        top_db: int = 80,
        n_iter: int = 32,
    ):
        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        self.set_resolution(x_res, y_res)
        self.audio = None

    def set_resolution(self, x_res: int, y_res: int):
        """Set resolution.

        Args:
            x_res (`int`): x resolution of spectrogram (time)
            y_res (`int`): y resolution of spectrogram (frequency bins)
        """
        self.x_res = x_res
        self.y_res = y_res
        self.n_mels = self.y_res
        self.slice_size = self.x_res * self.hop_length - 1

    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
        """Load audio.

        Args:
            audio_file (`str`): must be a file on disk due to a Librosa limitation, or
            raw_audio (`np.ndarray`): audio as a numpy array
        """
        if audio_file is not None:
            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
        else:
            self.audio = raw_audio

        # Pad with silence if necessary.
        if len(self.audio) < self.x_res * self.hop_length:
            self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))])

    def get_number_of_slices(self) -> int:
        """Get number of slices in audio.

        Returns:
            `int`: number of spectrograms the audio can be sliced into
        """
        return len(self.audio) // self.slice_size

    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
        """Get slice of audio.

        Args:
            slice (`int`): slice number of audio (out of get_number_of_slices())

        Returns:
            `np.ndarray`: audio as a numpy array
        """
        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]

    def get_sample_rate(self) -> int:
        """Get sample rate.

        Returns:
            `int`: sample rate of audio
        """
        return self.sr

    def audio_slice_to_image(self, slice: int) -> Image.Image:
        """Convert slice of audio to spectrogram.

        Args:
            slice (`int`): slice number of audio to convert (out of get_number_of_slices())

        Returns:
            `PIL Image`: grayscale image of x_res x y_res
        """
        S = librosa.feature.melspectrogram(
            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
        # Rescale [-top_db, 0] dB to [0, 255] and round to the nearest integer.
        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
        image = Image.fromarray(bytedata)
        return image

    def image_to_audio(self, image: Image.Image) -> np.ndarray:
        """Converts spectrogram to audio.

        Args:
            image (`PIL Image`): x_res x y_res grayscale image

        Returns:
            audio (`np.ndarray`): raw audio
        """
        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        # Invert the mel spectrogram back to a waveform with Griffin-Lim.
        audio = librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )
        return audio
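

# A minimal usage sketch, not part of the original module: it assumes this class is
# imported from the package where the relative imports above resolve, and that a
# local file "example.wav" exists (the filename is hypothetical):
#
#     mel = Mel(x_res=256, y_res=256, sample_rate=22050, hop_length=512)
#     mel.load_audio(audio_file="example.wav")
#     n_slices = mel.get_number_of_slices()   # how many x_res-frame spectrograms fit in the audio
#     image = mel.audio_slice_to_image(0)     # 256x256 grayscale PIL image of the first slice
#     audio = mel.image_to_audio(image)       # waveform reconstructed via Griffin-Lim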