# coding=utf-8
# webui.py: Gradio demo for the SenseVoice-Small voice understanding model.

import gradio as gr
import numpy as np
import torch
import torchaudio
from funasr import AutoModel

model_dir = "iic/SenseVoiceSmall"
model = AutoModel(
    model=model_dir,
    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    vad_kwargs={"max_single_segment_time": 30000},
    trust_remote_code=True,
)
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
def format_str(s):
    """Replace every special token with its display emoji (or drop it)."""
    for sptk in emoji_dict:
        s = s.replace(sptk, emoji_dict[sptk])
    return s
def format_str_v2(s):
    """Strip all tags, prepend detected event emoji, and append the dominant emotion."""
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]
    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()
def format_str_v3(s):
    """Merge per-language segments, deduplicating repeated event/emotion emoji."""

    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
            # Drop the leading event emoji if it repeats the previous segment's.
            s_list[i] = s_list[i][1:]
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
            # The segment ends with the same emotion emoji as the text so far; keep one.
            new_s = new_s[:-1]
        new_s += s_list[i].strip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()
def model_inference(input_wav, language, fs=16000):
    language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue",
                     "ja": "ja", "ko": "ko", "nospeech": "nospeech"}
    language = "auto" if len(language) < 1 else language
    selected_language = language_abbr[language]

    # Gradio's audio component yields (sample_rate, int16 ndarray); convert to
    # mono float32 at 16 kHz before inference.
    if isinstance(input_wav, tuple):
        fs, input_wav = input_wav
        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
        if len(input_wav.shape) > 1:
            input_wav = input_wav.mean(-1)
        if fs != 16000:
            print(f"audio_fs: {fs}")
            resampler = torchaudio.transforms.Resample(fs, 16000)
            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()

    merge_vad = True
    print(f"language: {language}, merge_vad: {merge_vad}")
    text = model.generate(input=input_wav,
                          cache={},
                          language=selected_language,
                          use_itn=True,
                          batch_size_s=0,
                          merge_vad=merge_vad)
    print(text)
    text = text[0]["text"]
    text = format_str_v3(text)
    print(text)
    return text
audio_examples = [
    ["example/zh.mp3", "zh"],
    ["example/yue.mp3", "yue"],
    ["example/en.mp3", "en"],
    ["example/ja.mp3", "ja"],
    ["example/ko.mp3", "ko"],
    ["example/emo_1.wav", "auto"],
    ["example/emo_2.wav", "auto"],
    ["example/emo_3.wav", "auto"],
    # ["example/emo_4.wav", "auto"],
    # ["example/event_1.wav", "auto"],
    # ["example/event_2.wav", "auto"],
    # ["example/event_3.wav", "auto"],
    ["example/rich_1.wav", "auto"],
    ["example/rich_2.wav", "auto"],
    # ["example/rich_3.wav", "auto"],
    ["example/longwav_1.wav", "auto"],
    ["example/longwav_2.wav", "auto"],
    ["example/longwav_3.wav", "auto"],
    # ["example/longwav_4.wav", "auto"],
]
html_content = """
<div>
    <h2 style="font-size: 22px;margin-left: 0px;">Voice Understanding Model: SenseVoice-Small</h2>
    <p style="font-size: 18px;margin-left: 20px;">SenseVoice-Small is an encoder-only speech foundation model designed for rapid voice understanding. It offers a range of capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event detection (AED). SenseVoice-Small supports multilingual recognition for Chinese, English, Cantonese, Japanese, and Korean. It also delivers exceptionally low inference latency, running 7 times faster than Whisper-small and 17 times faster than Whisper-large.</p>
    <h2 style="font-size: 22px;margin-left: 0px;">Usage</h2>
    <p style="font-size: 18px;margin-left: 20px;">Upload an audio file or record through the microphone, then select the language. The audio is transcribed into text along with the detected emotion (😊 happy, 😡 angry/excited, 😔 sad) and sound events (😀 laughter, 🎼 music, 👏 applause, 🤧 cough &amp; sneeze, 😭 cry). Event labels are placed at the front of the text and emotion labels at the end.</p>
    <p style="font-size: 18px;margin-left: 20px;">The recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>
    <h2 style="font-size: 22px;margin-left: 0px;">Repo</h2>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">SenseVoice</a>: multilingual speech understanding model</p>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/FunASR" target="_blank">FunASR</a>: fundamental speech recognition toolkit</p>
    <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/CosyVoice" target="_blank">CosyVoice</a>: high-quality multilingual TTS model</p>
</div>
"""
def launch():
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.HTML(html_content)
        with gr.Row():
            with gr.Column():
                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
                with gr.Accordion("Configuration"):
                    language_inputs = gr.Dropdown(
                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
                        value="auto",
                        label="Language")
                fn_button = gr.Button("Start", variant="primary")
                text_outputs = gr.Textbox(label="Results")
            gr.Examples(examples=audio_examples, inputs=[audio_inputs, language_inputs],
                        examples_per_page=20)
        fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)
    demo.launch()
if __name__ == "__main__":
    launch()
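
# Note: the bare demo.launch() above uses Gradio's defaults (local access only).
# If the demo needs to be reachable from other machines, a hedged variant is to
# pass standard gr.Blocks.launch() keyword arguments inside launch(), e.g.:
#
#     demo.launch(server_name="0.0.0.0", server_port=7860)
#
# The values shown here are assumptions, not part of the original script.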