vad_iterator.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. import torch
  class VADIterator:
      """Streaming voice-activity detector that collects speech chunks into utterances.

      Mainly taken from https://github.com/snakers4/silero-vad

      Audio is fed chunk by chunk through ``__call__``. Chunks are accumulated in
      ``self.buffer`` while speech is active; once enough trailing silence has
      been observed the buffered chunks are returned as one utterance.
      """

      # Speech starts at ``threshold`` but only stops once the probability drops
      # below ``threshold - _EXIT_MARGIN``, so probabilities hovering around the
      # threshold do not toggle the state on every chunk.
      _EXIT_MARGIN = 0.15

      def __init__(
          self,
          model,
          threshold: float = 0.5,
          sampling_rate: int = 16000,
          min_silence_duration_ms: int = 100,
          speech_pad_ms: int = 30,
      ):
          """
          Class for stream imitation

          Parameters
          ----------
          model: preloaded .jit/.onnx silero VAD model
              Must be callable as ``model(x, sampling_rate)`` and expose
              ``reset_states()``.
          threshold: float (default - 0.5)
              Speech threshold. Silero VAD outputs speech probabilities for each
              audio chunk, probabilities ABOVE this value are considered as SPEECH.
              It is better to tune this parameter for each dataset separately,
              but "lazy" 0.5 is pretty good for most datasets.
          sampling_rate: int (default - 16000)
              Currently silero VAD models support 8000 and 16000 sample rates
          min_silence_duration_ms: int (default - 100 milliseconds)
              In the end of each speech chunk wait for min_silence_duration_ms
              before separating it
          speech_pad_ms: int (default - 30 milliseconds)
              Converted to samples and stored as ``speech_pad_samples``; this
              class itself does not apply any padding to the returned chunks.

          Raises
          ------
          ValueError
              If ``sampling_rate`` is not 8000 or 16000.
          """
          # Validate before touching any state so a bad rate never leaves a
          # half-initialised object behind.
          if sampling_rate not in [8000, 16000]:
              raise ValueError(
                  "VADIterator does not support sampling rates other than [8000, 16000]"
              )

          self.model = model
          self.threshold = threshold
          self.sampling_rate = sampling_rate
          # Kept for callers that read it; never updated inside this class.
          self.is_speaking = False
          self.buffer = []

          # Durations expressed in samples (floats: only used in comparisons).
          self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
          self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000

          self.reset_states()

      def reset_states(self) -> None:
          """Reset the model and all streaming state, including buffered audio."""
          self.model.reset_states()
          self.triggered = False
          self.temp_end = 0
          self.current_sample = 0
          # Drop any partially collected utterance so audio from before the
          # reset cannot leak into the next one.
          self.buffer = []

      @torch.no_grad()
      def __call__(self, x):
          """
          Feed one audio chunk and return a finished utterance, if any.

          Parameters
          ----------
          x: torch.Tensor
              audio chunk (see examples in repo). Non-tensor input is converted
              with ``torch.Tensor(x)``.

          Returns
          -------
          list of torch.Tensor or None
              The buffered chunks of the utterance that just ended, or ``None``
              while no utterance has been completed by this chunk.

          Raises
          ------
          TypeError
              If ``x`` is not a tensor and cannot be converted to one.
          """
          if not torch.is_tensor(x):
              try:
                  x = torch.Tensor(x)
              except Exception as err:
                  raise TypeError(
                      "Audio cannot be casted to tensor. Cast it manually"
                  ) from err

          # A 2-D input is measured along its second dimension.
          window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
          self.current_sample += window_size_samples

          speech_prob = self.model(x, self.sampling_rate).item()
          is_speech = speech_prob >= self.threshold

          # Speech resumed before the silence window elapsed: cancel the pending end.
          if is_speech and self.temp_end:
              self.temp_end = 0

          if is_speech and not self.triggered:
              # Start of speech. Keep the chunk that triggered detection so the
              # utterance does not lose its first window.
              self.triggered = True
              self.buffer.append(x)
              return None

          if self.triggered and speech_prob < self.threshold - self._EXIT_MARGIN:
              # 0 doubles as "no pending end"; current_sample has already been
              # advanced past this chunk when it is stored here.
              if not self.temp_end:
                  self.temp_end = self.current_sample
              if self.current_sample - self.temp_end < self.min_silence_samples:
                  # Not enough trailing silence yet; this chunk is not buffered.
                  return None

              # End of speech: hand over the collected chunks and start afresh.
              self.temp_end = 0
              self.triggered = False
              spoken_utterance = self.buffer
              self.buffer = []
              return spoken_utterance

          if self.triggered:
              self.buffer.append(x)

          return None