# whisper_stt_arguments.py
from dataclasses import dataclass, field
from typing import Optional
  3. @dataclass
  4. class WhisperSTTHandlerArguments:
  5. stt_model_name: str = field(
  6. default="distil-whisper/distil-large-v3",
  7. metadata={
  8. "help": "The pretrained Whisper model to use. Default is 'distil-whisper/distil-large-v3'."
  9. },
  10. )
  11. stt_device: str = field(
  12. default="cuda",
  13. metadata={
  14. "help": "The device type on which the model will run. Default is 'cuda' for GPU acceleration."
  15. },
  16. )
  17. stt_torch_dtype: str = field(
  18. default="float16",
  19. metadata={
  20. "help": "The PyTorch data type for the model and input tensors. One of `float32` (full-precision), `float16` or `bfloat16` (both half-precision)."
  21. },
  22. )
  23. stt_compile_mode: str = field(
  24. default=None,
  25. metadata={
  26. "help": "Compile mode for torch compile. Either 'default', 'reduce-overhead' and 'max-autotune'. Default is None (no compilation)"
  27. },
  28. )
  29. stt_gen_max_new_tokens: int = field(
  30. default=128,
  31. metadata={
  32. "help": "The maximum number of new tokens to generate. Default is 128."
  33. },
  34. )
  35. stt_gen_num_beams: int = field(
  36. default=1,
  37. metadata={
  38. "help": "The number of beams for beam search. Default is 1, implying greedy decoding."
  39. },
  40. )
  41. stt_gen_return_timestamps: bool = field(
  42. default=False,
  43. metadata={
  44. "help": "Whether to return timestamps with transcriptions. Default is False."
  45. },
  46. )
  47. stt_gen_task: str = field(
  48. default="transcribe",
  49. metadata={
  50. "help": "The task to perform, typically 'transcribe' for transcription. Default is 'transcribe'."
  51. },
  52. )
  53. language: Optional[str] = field(
  54. default='en',
  55. metadata={
  56. "help": """The language for the conversation.
  57. Choose between 'en' (english), 'fr' (french), 'es' (spanish),
  58. 'zh' (chinese), 'ko' (korean), 'ja' (japanese), 'hi' (hindi) or 'None'.
  59. If using 'auto', the language is automatically detected and can
  60. change during the conversation. Default is 'en'."""
  61. },
  62. )