123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- import torch
class VADIterator:
    """Stream-style wrapper around a silero VAD model that collects speech chunks.

    Mainly taken from https://github.com/snakers4/silero-vad

    Audio is fed chunk by chunk through ``__call__``. While speech is being
    detected the chunks are accumulated in ``self.buffer``; once enough
    silence has followed, the accumulated chunks are returned as one utterance.
    """

    # Hysteresis margin: once speech has started, the probability has to fall
    # below ``threshold - _HYSTERESIS`` before a chunk counts towards silence.
    _HYSTERESIS = 0.15

    def __init__(
        self,
        model,
        threshold: float = 0.5,
        sampling_rate: int = 16000,
        min_silence_duration_ms: int = 100,
        speech_pad_ms: int = 30,
    ):
        """
        Class for stream imitation

        Parameters
        ----------
        model: preloaded .jit/.onnx silero VAD model
            Called as ``model(chunk, sampling_rate)``; must also provide ``reset_states()``.
        threshold: float (default - 0.5)
            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
            probabilities ABOVE this value are considered as SPEECH.
            It is better to tune this parameter for each dataset separately, but "lazy" 0.5
            is pretty good for most datasets.
        sampling_rate: int (default - 16000)
            Currently silero VAD models support 8000 and 16000 sample rates
        min_silence_duration_ms: int (default - 100 milliseconds)
            At the end of each speech chunk wait for min_silence_duration_ms before separating it
        speech_pad_ms: int (default - 30 milliseconds)
            Converted to samples and stored as ``speech_pad_samples``. Nothing in this
            class applies the padding itself.

        Raises
        ------
        ValueError
            If sampling_rate is neither 8000 nor 16000.
        """
        if sampling_rate not in (8000, 16000):
            raise ValueError(
                "VADIterator does not support sampling rates other than [8000, 16000]"
            )

        self.model = model
        self.threshold = threshold
        self.sampling_rate = sampling_rate
        self.is_speaking = False
        self.buffer = []

        # Durations converted from milliseconds to a (possibly fractional)
        # number of samples.
        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000

        self.reset_states()

    def reset_states(self):
        """Reset the model state and all per-stream tracking state.

        Chunks buffered for an utterance still in progress are dropped as well,
        so audio received before the reset cannot leak into the next utterance.
        """
        self.model.reset_states()
        self.triggered = False
        self.temp_end = 0  # 0 doubles as "no pending end of speech"
        self.current_sample = 0
        self.buffer = []

    @torch.no_grad()
    def __call__(self, x):
        """
        Feed one audio chunk and return the finished utterance, if there is one.

        Parameters
        ----------
        x: torch.Tensor
            audio chunk (see examples in repo). Non-tensor input is converted
            with ``torch.Tensor(x)``.

        Returns
        -------
        list of torch.Tensor or None
            The buffered chunks of the utterance that just ended, or None while
            no utterance has been completed by this chunk. The chunk that first
            crosses the threshold and the chunks seen while waiting out the
            silence window are not added to the buffer.

        Raises
        ------
        TypeError
            If x is not a tensor and cannot be converted to one.
        """
        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except Exception as err:
                # Keep the original failure attached for easier debugging.
                raise TypeError(
                    "Audio cannot be casted to tensor. Cast it manually"
                ) from err

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate).item()
        is_speech = speech_prob >= self.threshold

        # Speech resumed before the silence window elapsed: cancel the
        # pending end of utterance.
        if is_speech and self.temp_end:
            self.temp_end = 0

        # Start of a new utterance.
        if is_speech and not self.triggered:
            self.triggered = True
            return None

        if self.triggered and speech_prob < self.threshold - self._HYSTERESIS:
            if not self.temp_end:
                # First silent chunk: remember where the silence started.
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return None

            # End of speech: hand over the collected chunks and start afresh.
            self.temp_end = 0
            self.triggered = False
            spoken_utterance = self.buffer
            self.buffer = []
            return spoken_utterance

        if self.triggered:
            self.buffer.append(x)

        return None
|