parler_tts_arguments.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. from dataclasses import dataclass, field
  2. @dataclass
  3. class ParlerTTSHandlerArguments:
  4. tts_model_name: str = field(
  5. default="ylacombe/parler-tts-mini-jenny-30H",
  6. metadata={
  7. "help": "The pretrained TTS model to use. Default is 'ylacombe/parler-tts-mini-jenny-30H'."
  8. },
  9. )
  10. tts_device: str = field(
  11. default="cuda",
  12. metadata={
  13. "help": "The device type on which the model will run. Default is 'cuda' for GPU acceleration."
  14. },
  15. )
  16. tts_torch_dtype: str = field(
  17. default="float16",
  18. metadata={
  19. "help": "The PyTorch data type for the model and input tensors. One of `float32` (full-precision), `float16` or `bfloat16` (both half-precision)."
  20. },
  21. )
  22. tts_compile_mode: str = field(
  23. default=None,
  24. metadata={
  25. "help": "Compile mode for torch compile. Either 'default', 'reduce-overhead' and 'max-autotune'. Default is None (no compilation)"
  26. },
  27. )
  28. tts_gen_min_new_tokens: int = field(
  29. default=64,
  30. metadata={
  31. "help": "Maximum number of new tokens to generate in a single completion. Default is 64, which corresponds to ~0.64 secs"
  32. },
  33. )
  34. tts_gen_max_new_tokens: int = field(
  35. default=512,
  36. metadata={
  37. "help": "Maximum number of new tokens to generate in a single completion. Default is 512, which corresponds to ~12 secs"
  38. },
  39. )
  40. description: str = field(
  41. default=(
  42. "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. "
  43. "She speaks very fast."
  44. ),
  45. metadata={
  46. "help": "Description of the speaker's voice and speaking style to guide the TTS model."
  47. },
  48. )
  49. play_steps_s: float = field(
  50. default=1.0,
  51. metadata={
  52. "help": "The time interval in seconds for playing back the generated speech in steps. Default is 1.0 seconds."
  53. },
  54. )
  55. max_prompt_pad_length: int = field(
  56. default=8,
  57. metadata={
  58. "help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximun power of 2 possible."
  59. },
  60. )