#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
from http import server
import os
import logging
import argparse
import gradio as gr
from funasr import AutoModel
from videoclipper import VideoClipper
from llm.openai_api import openai_call
from llm.qwen_api import call_qwen_model
from llm.g4f_openai_api import g4f_openai_call
from utils.trans_utils import extract_timestamps
from introduction import top_md_1, top_md_3, top_md_4
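
# FunClip's gradio launcher: it loads a FunASR AutoModel pipeline, wraps it in
# VideoClipper, and exposes speech recognition, text/speaker-based clipping and
# LLM-assisted clipping through a Gradio Blocks UI. The videoclipper, llm.*,
# utils and introduction modules are local to the FunClip repository.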
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='FunClip gradio service launcher')
    parser.add_argument('--lang', '-l', type=str, default="zh", help="language of the ASR models, 'zh' or 'en'")
    parser.add_argument('--share', '-s', action='store_true', help="whether to establish a gradio share link")
    parser.add_argument('--port', '-p', type=int, default=7860, help='port number')
    parser.add_argument('--listen', action='store_true', help="whether to listen on all hosts")
    args = parser.parse_args()
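    # Example invocations (illustrative):
    #   python launch.py --lang zh                # Chinese models, served locally on port 7860
    #   python launch.py -l en --listen --share   # English models, bind to 0.0.0.0 and create a share link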
    # Load the FunASR pipeline: the ASR model plus VAD, punctuation and speaker models.
    if args.lang == 'zh':
        # Chinese: SeACo-Paraformer, which supports hotword customization.
        funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                                 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                                 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                                 spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                                 )
    else:
        # English: Paraformer English model with the same VAD/punctuation/speaker components.
        funasr_model = AutoModel(model="iic/speech_paraformer_asr-en-16k-vocab4199-pytorch",
                                 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                                 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                                 spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                                 )
    audio_clipper = VideoClipper(funasr_model)
    audio_clipper.lang = args.lang

    # Bind to localhost by default; --listen exposes the service on all interfaces.
    server_name = '127.0.0.1'
    if args.listen:
        server_name = '0.0.0.0'
    def audio_recog(audio_input, sd_switch, hotwords, output_dir):
        return audio_clipper.recog(audio_input, sd_switch, None, hotwords, output_dir=output_dir)

    def video_recog(video_input, sd_switch, hotwords, output_dir):
        return audio_clipper.video_recog(video_input, sd_switch, hotwords, output_dir=output_dir)

    def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
        return audio_clipper.video_clip(
            dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
        )
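
    # The mix_* callbacks below back the ASR buttons: they normalize the output
    # directory, run recognition on whichever input is provided (video takes
    # priority over audio), and return the text, SRT subtitles and the state
    # object that the clipping callbacks consume later.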
    def mix_recog(video_input, audio_input, hotwords, output_dir):
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        audio_state, video_state = None, None
        if video_input is not None:
            res_text, res_srt, video_state = video_recog(
                video_input, 'No', hotwords, output_dir=output_dir)
            return res_text, res_srt, video_state, None
        if audio_input is not None:
            res_text, res_srt, audio_state = audio_recog(
                audio_input, 'No', hotwords, output_dir=output_dir)
            return res_text, res_srt, None, audio_state

    def mix_recog_speaker(video_input, audio_input, hotwords, output_dir):
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        audio_state, video_state = None, None
        if video_input is not None:
            res_text, res_srt, video_state = video_recog(
                video_input, 'Yes', hotwords, output_dir=output_dir)
            return res_text, res_srt, video_state, None
        if audio_input is not None:
            res_text, res_srt, audio_state = audio_recog(
                audio_input, 'Yes', hotwords, output_dir=output_dir)
            return res_text, res_srt, None, audio_state
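
    # Clipping callbacks: mix_clip cuts segments by target text/speaker from the
    # stored recognition state; video_clip_addsub additionally burns subtitles in
    # with the selected font size and color.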
    def mix_clip(dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        if video_state is not None:
            clip_video_file, message, clip_srt = audio_clipper.video_clip(
                dest_text, start_ost, end_ost, video_state, dest_spk=video_spk_input, output_dir=output_dir)
            return clip_video_file, None, message, clip_srt
        if audio_state is not None:
            (sr, res_audio), message, clip_srt = audio_clipper.clip(
                dest_text, start_ost, end_ost, audio_state, dest_spk=video_spk_input, output_dir=output_dir)
            return None, (sr, res_audio), message, clip_srt
    def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, output_dir, font_size, font_color):
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        return audio_clipper.video_clip(
            dest_text, start_ost, end_ost, state,
            font_size=font_size, font_color=font_color,
            add_sub=True, dest_spk=video_spk_input, output_dir=output_dir
        )
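
    # llm_inference dispatches on the model-name prefix: 'qwen' models go through
    # call_qwen_model, 'gpt'/'moonshot' through openai_call, and 'g4f-*' has the
    # prefix stripped before calling g4f_openai_call; other names only log an error.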
    def llm_inference(system_content, user_content, srt_text, model, apikey):
        SUPPORT_LLM_PREFIX = ['qwen', 'gpt', 'g4f', 'moonshot']
        if model.startswith('qwen'):
            return call_qwen_model(apikey, model, user_content + '\n' + srt_text, system_content)
        elif model.startswith('gpt') or model.startswith('moonshot'):
            return openai_call(apikey, model, system_content, user_content + '\n' + srt_text)
        elif model.startswith('g4f'):
            model = "-".join(model.split('-')[1:])
            return g4f_openai_call(model, system_content, user_content + '\n' + srt_text)
        else:
            logging.error("LLM name error, only {} are supported as LLM name prefix."
                          .format(SUPPORT_LLM_PREFIX))
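
    # The AI_clip variants parse "[start-end]" timestamps from the LLM response via
    # extract_timestamps and clip those segments, without (AI_clip) or with
    # (AI_clip_subti) burned-in subtitles.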
    def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
        timestamp_list = extract_timestamps(LLM_res)
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        if video_state is not None:
            clip_video_file, message, clip_srt = audio_clipper.video_clip(
                dest_text, start_ost, end_ost, video_state,
                dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
            return clip_video_file, None, message, clip_srt
        if audio_state is not None:
            (sr, res_audio), message, clip_srt = audio_clipper.clip(
                dest_text, start_ost, end_ost, audio_state,
                dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
            return None, (sr, res_audio), message, clip_srt
    def AI_clip_subti(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
        timestamp_list = extract_timestamps(LLM_res)
        output_dir = output_dir.strip()
        if not len(output_dir):
            output_dir = None
        else:
            output_dir = os.path.abspath(output_dir)
        if video_state is not None:
            clip_video_file, message, clip_srt = audio_clipper.video_clip(
                dest_text, start_ost, end_ost, video_state,
                dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=True)
            return clip_video_file, None, message, clip_srt
        if audio_state is not None:
            (sr, res_audio), message, clip_srt = audio_clipper.clip(
                dest_text, start_ost, end_ost, audio_state,
                dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=True)
            return None, (sr, res_audio), message, clip_srt
    # gradio interface
    theme = gr.Theme.load("funclip/utils/theme.json")
    with gr.Blocks(theme=theme) as funclip_service:
        gr.Markdown(top_md_1)
        # gr.Markdown(top_md_2)
        gr.Markdown(top_md_3)
        gr.Markdown(top_md_4)
        video_state, audio_state = gr.State(), gr.State()
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    video_input = gr.Video(label="视频输入 | Video Input")
                    audio_input = gr.Audio(label="音频输入 | Audio Input")
                with gr.Column():
                    gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
                                 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
                                 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
                                [video_input],
                                label='示例视频 | Demo Video')
                    gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
                                [video_input],
                                label='多说话人示例视频 | Multi-speaker Demo Video')
                    gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
                                [audio_input],
                                label="示例音频 | Demo Audio")
                with gr.Column():
                    # with gr.Row():
                    #     video_sd_switch = gr.Radio(["No", "Yes"], label="👥区分说话人 Get Speakers", value='No')
                    hotwords_input = gr.Textbox(label="🚒 热词 | Hotwords(可以为空,多个热词使用空格分隔,仅支持中文热词)")
                    output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)", value=" ")
                    with gr.Row():
                        recog_button = gr.Button("👂 识别 | ASR", variant="primary")
                        recog_button2 = gr.Button("👂👫 识别+区分说话人 | ASR+SD")
                    video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
                    video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
            with gr.Column():
                with gr.Tab("🧠 LLM智能裁剪 | LLM Clipping"):
                    with gr.Column():
                        prompt_head = gr.Textbox(label="Prompt System (按需更改,最好不要变动主体和要求)", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕,"
                                                 "分析其中的精彩且尽可能连续的片段并裁剪出来,输出四条以内的片段,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,"
                                                 "注意确保文字与时间戳的正确匹配。输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
                        prompt_head2 = gr.Textbox(label="Prompt User(不需要修改,会自动拼接左下角的srt字幕)", value=("这是待裁剪的视频srt字幕:"))
                    with gr.Column():
                        with gr.Row():
                            llm_model = gr.Dropdown(
                                choices=["qwen-plus",
                                         "gpt-3.5-turbo",
                                         "gpt-3.5-turbo-0125",
                                         "gpt-4-turbo",
                                         "g4f-gpt-3.5-turbo"],
                                value="qwen-plus",
                                label="LLM Model Name",
                                allow_custom_value=True)
                            apikey_input = gr.Textbox(label="APIKEY")
                        llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
                        llm_result = gr.Textbox(label="LLM Clipper Result")
                    with gr.Row():
                        llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
                        llm_clip_subti_button = gr.Button("🧠 LLM智能裁剪+字幕 | AI Clip+Subtitles")
                with gr.Tab("✂️ 根据文本/说话人裁剪 | Text/Speaker Clipping"):
                    video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)")
                    video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)")
                    with gr.Row():
                        clip_button = gr.Button("✂️ 裁剪 | Clip", variant="primary")
                        clip_subti_button = gr.Button("✂️ 裁剪+字幕 | Clip+Subtitles")
                with gr.Row():
                    video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪ 开始位置偏移 | Start Offset (ms)")
                    video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩ 结束位置偏移 | End Offset (ms)")
                with gr.Row():
                    font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠 字幕字体大小 | Subtitle Font Size")
                    font_color = gr.Radio(["black", "white", "green", "red"], label="🌈 字幕颜色 | Subtitle Color", value='white')
                    # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                video_output = gr.Video(label="裁剪结果 | Video Clipped")
                audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
                clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
                srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
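
        # Wire the buttons to their callbacks; the recognition states returned by
        # the ASR buttons flow into the clipping callbacks via video_state/audio_state.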
        recog_button.click(mix_recog,
                           inputs=[video_input,
                                   audio_input,
                                   hotwords_input,
                                   output_dir,
                                   ],
                           outputs=[video_text_output, video_srt_output, video_state, audio_state])
        recog_button2.click(mix_recog_speaker,
                            inputs=[video_input,
                                    audio_input,
                                    hotwords_input,
                                    output_dir,
                                    ],
                            outputs=[video_text_output, video_srt_output, video_state, audio_state])
        clip_button.click(mix_clip,
                          inputs=[video_text_input,
                                  video_spk_input,
                                  video_start_ost,
                                  video_end_ost,
                                  video_state,
                                  audio_state,
                                  output_dir
                                  ],
                          outputs=[video_output, audio_output, clip_message, srt_clipped])
        clip_subti_button.click(video_clip_addsub,
                                inputs=[video_text_input,
                                        video_spk_input,
                                        video_start_ost,
                                        video_end_ost,
                                        video_state,
                                        output_dir,
                                        font_size,
                                        font_color,
                                        ],
                                outputs=[video_output, clip_message, srt_clipped])
        llm_button.click(llm_inference,
                         inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
                         outputs=[llm_result])
        llm_clip_button.click(AI_clip,
                              inputs=[llm_result,
                                      video_text_input,
                                      video_spk_input,
                                      video_start_ost,
                                      video_end_ost,
                                      video_state,
                                      audio_state,
                                      output_dir,
                                      ],
                              outputs=[video_output, audio_output, clip_message, srt_clipped])
        llm_clip_subti_button.click(AI_clip_subti,
                                    inputs=[llm_result,
                                            video_text_input,
                                            video_spk_input,
                                            video_start_ost,
                                            video_end_ost,
                                            video_state,
                                            audio_state,
                                            output_dir,
                                            ],
                                    outputs=[video_output, audio_output, clip_message, srt_clipped])
    # start the gradio service locally or with a share link
    if args.listen:
        funclip_service.launch(share=args.share, server_port=args.port, server_name=server_name, inbrowser=False)
    else:
        funclip_service.launch(share=args.share, server_port=args.port, server_name=server_name)
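    # With the defaults above, the UI is reachable at http://127.0.0.1:7860; --listen
    # binds to 0.0.0.0 and --share additionally requests a public gradio share link.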