__init__.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import librosa
  2. import numpy as np
  3. import pyloudnorm as pyln
  4. import torch
  5. from scipy.signal import get_window
  6. from utils.audio.dct import dct
  7. from utils.audio.vad import trim_long_silences
  8. def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
  9. '''compute right padding (final frame) or both sides padding (first and final frames)
  10. '''
  11. assert pad_sides in (1, 2)
  12. # return int(fsize // 2)
  13. pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
  14. if pad_sides == 1:
  15. return 0, pad
  16. else:
  17. return pad // 2, pad // 2 + pad % 2
  18. def amp_to_db(x):
  19. return 20 * np.log10(np.maximum(1e-5, x))
  20. def db_to_amp(x):
  21. return 10.0 ** (x * 0.05)
  22. def normalize(S, min_level_db):
  23. return (S - min_level_db) / -min_level_db
  24. def denormalize(D, min_level_db):
  25. return (D * -min_level_db) + min_level_db
  26. def librosa_wav2spec(wav_path,
  27. fft_size=None,
  28. hop_size=256,
  29. win_length=1024,
  30. window="hann",
  31. num_mels=80,
  32. fmin=80,
  33. fmax=-1,
  34. eps=1e-6,
  35. sample_rate=22050,
  36. loud_norm=False,
  37. trim_long_sil=False,
  38. center=True):
  39. if isinstance(wav_path, str):
  40. if trim_long_sil:
  41. wav, _, _ = trim_long_silences(wav_path, sample_rate)
  42. else:
  43. wav, _ = librosa.core.load(wav_path, sr=sample_rate)
  44. else:
  45. wav = wav_path
  46. if fft_size is None:
  47. fft_size = win_length
  48. if loud_norm:
  49. meter = pyln.Meter(sample_rate) # create BS.1770 meter
  50. loudness = meter.integrated_loudness(wav)
  51. wav = pyln.normalize.loudness(wav, loudness, -16.0)
  52. if np.abs(wav).max() > 1:
  53. wav = wav / np.abs(wav).max()
  54. # get amplitude spectrogram
  55. x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
  56. win_length=win_length, window=window, center=center)
  57. linear_spc = np.abs(x_stft) # (n_bins, T)
  58. # get mel basis
  59. fmin = 0 if fmin == -1 else fmin
  60. fmax = sample_rate / 2 if fmax == -1 else fmax
  61. mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax)
  62. # calculate mel spec
  63. mel = mel_basis @ linear_spc
  64. mel = np.log10(np.maximum(eps, mel)) # (n_mel_bins, T)
  65. if center:
  66. l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
  67. wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
  68. wav = wav[:mel.shape[1] * hop_size]
  69. # log linear spec
  70. linear_spc = np.log10(np.maximum(eps, linear_spc))
  71. return {'wav': wav, 'mel': mel.T, 'linear': linear_spc.T, 'mel_basis': mel_basis}
  72. def librosa_wav2mfcc(wav_path,
  73. fft_size=None,
  74. hop_size=256,
  75. win_length=1024,
  76. window="hann",
  77. num_mels=80,
  78. fmin=80,
  79. fmax=-1,
  80. sample_rate=22050,
  81. center=True):
  82. if isinstance(wav_path, str):
  83. wav, _ = librosa.core.load(wav_path, sr=sample_rate)
  84. else:
  85. wav = wav_path
  86. mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13,
  87. n_fft=fft_size, n_mels=num_mels, fmin=fmin, fmax=fmax,
  88. hop_length=hop_size,
  89. win_length=win_length, window=window, center=center)
  90. return mfcc.T
  91. def torch_wav2spec(wav,
  92. mel_basis,
  93. fft_size=1024,
  94. hop_size=256,
  95. win_length=1024,
  96. eps=1e-6):
  97. fft_window = get_window('hann', win_length, fftbins=True)
  98. fft_window = torch.FloatTensor(fft_window).to(wav.device)
  99. mel_basis = torch.FloatTensor(mel_basis).to(wav.device)
  100. x_stft = torch.stft(wav, fft_size, hop_size, win_length, fft_window,
  101. center=False, pad_mode='constant', normalized=False, onesided=True, return_complex=True)
  102. linear_spc = torch.abs(x_stft)
  103. mel = mel_basis @ linear_spc
  104. mel = torch.log10(torch.clamp_min(mel, eps)) # (n_mel_bins, T)
  105. return mel.transpose(1, 2)
  106. def mel2mfcc_torch(mel, n_coef=13):
  107. return dct(mel, norm='ortho')[:, :, :n_coef]
  108. def librosa_wav2linearspec(wav_path,
  109. fft_size=None,
  110. hop_size=256,
  111. win_length=1024,
  112. window="hann",
  113. num_mels=80,
  114. fmin=80,
  115. fmax=-1,
  116. eps=1e-6,
  117. sample_rate=22050,
  118. loud_norm=False,
  119. trim_long_sil=False,
  120. center=True):
  121. if isinstance(wav_path, str):
  122. if trim_long_sil:
  123. wav, _, _ = trim_long_silences(wav_path, sample_rate)
  124. else:
  125. wav, _ = librosa.core.load(wav_path, sr=sample_rate)
  126. else:
  127. wav = wav_path
  128. if fft_size is None:
  129. fft_size = win_length
  130. if loud_norm:
  131. meter = pyln.Meter(sample_rate) # create BS.1770 meter
  132. loudness = meter.integrated_loudness(wav)
  133. wav = pyln.normalize.loudness(wav, loudness, -16.0)
  134. if np.abs(wav).max() > 1:
  135. wav = wav / np.abs(wav).max()
  136. # get amplitude spectrogram
  137. x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
  138. win_length=win_length, window=window, center=center)
  139. linear_spc = np.abs(x_stft) # (n_bins, T)
  140. # pad wav
  141. if center:
  142. l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
  143. wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
  144. wav = wav[:linear_spc.shape[1] * hop_size]
  145. # log linear spec
  146. linear_spc = np.log10(np.maximum(eps, linear_spc))
  147. return {'wav': wav, 'linear': linear_spc.T}
  148. def librosa_linear2mel(linear_spec, hparams, num_mels=160, eps=1e-6):
  149. fft_size=hparams['fft_size']
  150. hop_size=hparams['hop_size']
  151. win_length=hparams['win_size']
  152. fmin=hparams['fmin']
  153. fmax=hparams['fmax']
  154. sample_rate=hparams['audio_sample_rate']
  155. # get mel basis
  156. fmin = 0 if fmin == -1 else fmin
  157. fmax = sample_rate / 2 if fmax == -1 else fmax
  158. mel_basis = librosa.filters.mel(sample_rate, fft_size, num_mels, fmin, fmax)
  159. mel_basis = torch.FloatTensor(mel_basis).to(linear_spec.device)[None, :].repeat(linear_spec.shape[0], 1, 1)
  160. # perform linear spec to mel spec
  161. linear_spec = torch.pow(10, linear_spec)
  162. mel = torch.bmm(mel_basis, linear_spec.transpose(1, 2))
  163. mel = torch.log10(torch.clamp_min(mel, eps)) # (n_mel_bins, T)
  164. return mel.transpose(1, 2)