# vad_arguments.py

  1. from dataclasses import dataclass, field
  2. @dataclass
  3. class VADHandlerArguments:
  4. thresh: float = field(
  5. default=0.3,
  6. metadata={
  7. "help": "The threshold value for voice activity detection (VAD). Values typically range from 0 to 1, with higher values requiring higher confidence in speech detection."
  8. },
  9. )
  10. sample_rate: int = field(
  11. default=16000,
  12. metadata={
  13. "help": "The sample rate of the audio in Hertz. Default is 16000 Hz, which is a common setting for voice audio."
  14. },
  15. )
  16. min_silence_ms: int = field(
  17. default=250,
  18. metadata={
  19. "help": "Minimum length of silence intervals to be used for segmenting speech. Measured in milliseconds. Default is 250 ms."
  20. },
  21. )
  22. min_speech_ms: int = field(
  23. default=500,
  24. metadata={
  25. "help": "Minimum length of speech segments to be considered valid speech. Measured in milliseconds. Default is 500 ms."
  26. },
  27. )
  28. max_speech_ms: float = field(
  29. default=float("inf"),
  30. metadata={
  31. "help": "Maximum length of continuous speech before forcing a split. Default is infinite, allowing for uninterrupted speech segments."
  32. },
  33. )
  34. speech_pad_ms: int = field(
  35. default=500,
  36. metadata={
  37. "help": "Amount of padding added to the beginning and end of detected speech segments. Measured in milliseconds. Default is 500 ms."
  38. },
  39. )
  40. audio_enhancement: bool = field(
  41. default=False,
  42. metadata={
  43. "help": "improves sound quality by applying techniques like noise reduction, equalization, and echo cancellation. Default is False."
  44. },
  45. )