# crazy_utils.py
import logging
import os
import threading

from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
  5. def input_clipping(inputs, history, max_token_limit):
  6. import numpy as np
  7. from request_llms.bridge_all import model_info
  8. enc = model_info["gpt-3.5-turbo"]['tokenizer']
  9. def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
  10. mode = 'input-and-history'
  11. # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
  12. input_token_num = get_token_num(inputs)
  13. if input_token_num < max_token_limit//2:
  14. mode = 'only-history'
  15. max_token_limit = max_token_limit - input_token_num
  16. everything = [inputs] if mode == 'input-and-history' else ['']
  17. everything.extend(history)
  18. n_token = get_token_num('\n'.join(everything))
  19. everything_token = [get_token_num(e) for e in everything]
  20. delta = max(everything_token) // 16 # 截断时的颗粒度
  21. while n_token > max_token_limit:
  22. where = np.argmax(everything_token)
  23. encoded = enc.encode(everything[where], disallowed_special=())
  24. clipped_encoded = encoded[:len(encoded)-delta]
  25. everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
  26. everything_token[where] = get_token_num(everything[where])
  27. n_token = get_token_num('\n'.join(everything))
  28. if mode == 'input-and-history':
  29. inputs = everything[0]
  30. else:
  31. pass
  32. history = everything[1:]
  33. return inputs, history
  34. def request_gpt_model_in_new_thread_with_ui_alive(
  35. inputs, inputs_show_user, llm_kwargs,
  36. chatbot, history, sys_prompt, refresh_interval=0.2,
  37. handle_token_exceed=True,
  38. retry_times_at_unknown_error=2,
  39. ):
  40. """
  41. Request GPT model,请求GPT模型同时维持用户界面活跃。
  42. 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行):
  43. inputs (string): List of inputs (输入)
  44. inputs_show_user (string): List of inputs to show user(展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性)
  45. top_p (float): Top p value for sampling from model distribution (GPT参数,浮点数)
  46. temperature (float): Temperature value for sampling from model distribution(GPT参数,浮点数)
  47. chatbot: chatbot inputs and outputs (用户界面对话窗口句柄,用于数据流可视化)
  48. history (list): List of chat history (历史,对话历史列表)
  49. sys_prompt (string): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样)
  50. refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果)
  51. handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启
  52. retry_times_at_unknown_error:失败时的重试次数
  53. 输出 Returns:
  54. future: 输出,GPT返回的结果
  55. """
  56. import time
  57. from concurrent.futures import ThreadPoolExecutor
  58. from request_llms.bridge_all import predict_no_ui_long_connection
  59. # 用户反馈
  60. chatbot.append([inputs_show_user, ""])
  61. yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
  62. executor = ThreadPoolExecutor(max_workers=16)
  63. mutable = ["", time.time(), ""]
  64. # 看门狗耐心
  65. watch_dog_patience = 5
  66. # 请求任务
  67. def _req_gpt(inputs, history, sys_prompt):
  68. retry_op = retry_times_at_unknown_error
  69. exceeded_cnt = 0
  70. while True:
  71. # watchdog error
  72. if len(mutable) >= 2 and (time.time()-mutable[1]) > watch_dog_patience:
  73. raise RuntimeError("检测到程序终止。")
  74. try:
  75. # 【第一种情况】:顺利完成
  76. result = predict_no_ui_long_connection(
  77. inputs=inputs, llm_kwargs=llm_kwargs,
  78. history=history, sys_prompt=sys_prompt, observe_window=mutable)
  79. return result
  80. except ConnectionAbortedError as token_exceeded_error:
  81. # 【第二种情况】:Token溢出
  82. if handle_token_exceed:
  83. exceeded_cnt += 1
  84. # 【选择处理】 尝试计算比例,尽可能多地保留文本
  85. from toolbox import get_reduce_token_percent
  86. p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
  87. MAX_TOKEN = get_max_token(llm_kwargs)
  88. EXCEED_ALLO = 512 + 512 * exceeded_cnt
  89. inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
  90. mutable[0] += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
  91. continue # 返回重试
  92. else:
  93. # 【选择放弃】
  94. tb_str = '```\n' + trimmed_format_exc() + '```'
  95. mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
  96. return mutable[0] # 放弃
  97. except:
  98. # 【第三种情况】:其他错误:重试几次
  99. tb_str = '```\n' + trimmed_format_exc() + '```'
  100. print(tb_str)
  101. mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
  102. if retry_op > 0:
  103. retry_op -= 1
  104. mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
  105. if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
  106. time.sleep(30)
  107. time.sleep(5)
  108. continue # 返回重试
  109. else:
  110. time.sleep(5)
  111. return mutable[0] # 放弃
  112. # 提交任务
  113. future = executor.submit(_req_gpt, inputs, history, sys_prompt)
  114. while True:
  115. # yield一次以刷新前端页面
  116. time.sleep(refresh_interval)
  117. # “喂狗”(看门狗)
  118. mutable[1] = time.time()
  119. if future.done():
  120. break
  121. chatbot[-1] = [chatbot[-1][0], mutable[0]]
  122. yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
  123. final_result = future.result()
  124. chatbot[-1] = [chatbot[-1][0], final_result]
  125. yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息
  126. return final_result
  127. def can_multi_process(llm):
  128. if llm.startswith('gpt-'): return True
  129. if llm.startswith('api2d-'): return True
  130. if llm.startswith('azure-'): return True
  131. if llm.startswith('spark'): return True
  132. if llm.startswith('zhipuai'): return True
  133. return False
  134. def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
  135. inputs_array, inputs_show_user_array, llm_kwargs,
  136. chatbot, history_array, sys_prompt_array,
  137. refresh_interval=0.2, max_workers=-1, scroller_max_len=30,
  138. handle_token_exceed=True, show_user_at_complete=False,
  139. retry_times_at_unknown_error=2,
  140. ):
  141. """
  142. Request GPT model using multiple threads with UI and high efficiency
  143. 请求GPT模型的[多线程]版。
  144. 具备以下功能:
  145. 实时在UI上反馈远程数据流
  146. 使用线程池,可调节线程池的大小避免openai的流量限制错误
  147. 处理中途中止的情况
  148. 网络等出问题时,会把traceback和已经接收的数据转入输出
  149. 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行):
  150. inputs_array (list): List of inputs (每个子任务的输入)
  151. inputs_show_user_array (list): List of inputs to show user(每个子任务展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性)
  152. llm_kwargs: llm_kwargs参数
  153. chatbot: chatbot (用户界面对话窗口句柄,用于数据流可视化)
  154. history_array (list): List of chat history (历史对话输入,双层列表,第一层列表是子任务分解,第二层列表是对话历史)
  155. sys_prompt_array (list): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样)
  156. refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果)
  157. max_workers (int, optional): Maximum number of threads (default: see config.py) (最大线程数,如果子任务非常多,需要用此选项防止高频地请求openai导致错误)
  158. scroller_max_len (int, optional): Maximum length for scroller (default: 30)(数据流的显示最后收到的多少个字符,仅仅服务于视觉效果)
  159. handle_token_exceed (bool, optional): (是否在输入过长时,自动缩减文本)
  160. handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启
  161. show_user_at_complete (bool, optional): (在结束时,把完整输入-输出结果显示在聊天框)
  162. retry_times_at_unknown_error:子任务失败时的重试次数
  163. 输出 Returns:
  164. list: List of GPT model responses (每个子任务的输出汇总,如果某个子任务出错,response中会携带traceback报错信息,方便调试和定位问题。)
  165. """
  166. import time, random
  167. from concurrent.futures import ThreadPoolExecutor
  168. from request_llms.bridge_all import predict_no_ui_long_connection
  169. assert len(inputs_array) == len(history_array)
  170. assert len(inputs_array) == len(sys_prompt_array)
  171. if max_workers == -1: # 读取配置文件
  172. try: max_workers = get_conf('DEFAULT_WORKER_NUM')
  173. except: max_workers = 8
  174. if max_workers <= 0: max_workers = 3
  175. # 屏蔽掉 chatglm的多线程,可能会导致严重卡顿
  176. if not can_multi_process(llm_kwargs['llm_model']):
  177. max_workers = 1
  178. executor = ThreadPoolExecutor(max_workers=max_workers)
  179. n_frag = len(inputs_array)
  180. # 用户反馈
  181. chatbot.append(["请开始多线程操作。", ""])
  182. yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
  183. # 跨线程传递
  184. mutable = [["", time.time(), "等待中"] for _ in range(n_frag)]
  185. # 看门狗耐心
  186. watch_dog_patience = 5
  187. # 子线程任务
  188. def _req_gpt(index, inputs, history, sys_prompt):
  189. gpt_say = ""
  190. retry_op = retry_times_at_unknown_error
  191. exceeded_cnt = 0
  192. mutable[index][2] = "执行中"
  193. detect_timeout = lambda: len(mutable[index]) >= 2 and (time.time()-mutable[index][1]) > watch_dog_patience
  194. while True:
  195. # watchdog error
  196. if detect_timeout(): raise RuntimeError("检测到程序终止。")
  197. try:
  198. # 【第一种情况】:顺利完成
  199. gpt_say = predict_no_ui_long_connection(
  200. inputs=inputs, llm_kwargs=llm_kwargs, history=history,
  201. sys_prompt=sys_prompt, observe_window=mutable[index], console_slience=True
  202. )
  203. mutable[index][2] = "已成功"
  204. return gpt_say
  205. except ConnectionAbortedError as token_exceeded_error:
  206. # 【第二种情况】:Token溢出
  207. if handle_token_exceed:
  208. exceeded_cnt += 1
  209. # 【选择处理】 尝试计算比例,尽可能多地保留文本
  210. from toolbox import get_reduce_token_percent
  211. p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
  212. MAX_TOKEN = get_max_token(llm_kwargs)
  213. EXCEED_ALLO = 512 + 512 * exceeded_cnt
  214. inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
  215. gpt_say += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
  216. mutable[index][2] = f"截断重试"
  217. continue # 返回重试
  218. else:
  219. # 【选择放弃】
  220. tb_str = '```\n' + trimmed_format_exc() + '```'
  221. gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
  222. if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
  223. mutable[index][2] = "输入过长已放弃"
  224. return gpt_say # 放弃
  225. except:
  226. # 【第三种情况】:其他错误
  227. if detect_timeout(): raise RuntimeError("检测到程序终止。")
  228. tb_str = '```\n' + trimmed_format_exc() + '```'
  229. print(tb_str)
  230. gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
  231. if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
  232. if retry_op > 0:
  233. retry_op -= 1
  234. wait = random.randint(5, 20)
  235. if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
  236. wait = wait * 3
  237. fail_info = "OpenAI绑定信用卡可解除频率限制 "
  238. else:
  239. fail_info = ""
  240. # 也许等待十几秒后,情况会好转
  241. for i in range(wait):
  242. mutable[index][2] = f"{fail_info}等待重试 {wait-i}"; time.sleep(1)
  243. # 开始重试
  244. if detect_timeout(): raise RuntimeError("检测到程序终止。")
  245. mutable[index][2] = f"重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}"
  246. continue # 返回重试
  247. else:
  248. mutable[index][2] = "已失败"
  249. wait = 5
  250. time.sleep(5)
  251. return gpt_say # 放弃
  252. # 异步任务开始
  253. futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip(
  254. range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
  255. cnt = 0
  256. while True:
  257. # yield一次以刷新前端页面
  258. time.sleep(refresh_interval)
  259. cnt += 1
  260. worker_done = [h.done() for h in futures]
  261. # 更好的UI视觉效果
  262. observe_win = []
  263. # 每个线程都要“喂狗”(看门狗)
  264. for thread_index, _ in enumerate(worker_done):
  265. mutable[thread_index][1] = time.time()
  266. # 在前端打印些好玩的东西
  267. for thread_index, _ in enumerate(worker_done):
  268. print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\
  269. replace('\n', '').replace('`', '.').replace(
  270. ' ', '.').replace('<br/>', '.....').replace('$', '.')+"`... ]"
  271. observe_win.append(print_something_really_funny)
  272. # 在前端打印些好玩的东西
  273. stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n'
  274. if not done else f'`{mutable[thread_index][2]}`\n\n'
  275. for thread_index, done, obs in zip(range(len(worker_done)), worker_done, observe_win)])
  276. # 在前端打印些好玩的东西
  277. chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt % 10+1))]
  278. yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
  279. if all(worker_done):
  280. executor.shutdown()
  281. break
  282. # 异步任务结束
  283. gpt_response_collection = []
  284. for inputs_show_user, f in zip(inputs_show_user_array, futures):
  285. gpt_res = f.result()
  286. gpt_response_collection.extend([inputs_show_user, gpt_res])
  287. # 是否在结束时,在界面上显示结果
  288. if show_user_at_complete:
  289. for inputs_show_user, f in zip(inputs_show_user_array, futures):
  290. gpt_res = f.result()
  291. chatbot.append([inputs_show_user, gpt_res])
  292. yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
  293. time.sleep(0.5)
  294. return gpt_response_collection
  295. def read_and_clean_pdf_text(fp):
  296. """
  297. 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好
  298. **输入参数说明**
  299. - `fp`:需要读取和清理文本的pdf文件路径
  300. **输出参数说明**
  301. - `meta_txt`:清理后的文本内容字符串
  302. - `page_one_meta`:第一页清理后的文本内容列表
  303. **函数功能**
  304. 读取pdf文件并清理其中的文本内容,清理规则包括:
  305. - 提取所有块元的文本信息,并合并为一个字符串
  306. - 去除短块(字符数小于100)并替换为回车符
  307. - 清理多余的空行
  308. - 合并小写字母开头的段落块并替换为空格
  309. - 清除重复的换行
  310. - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
  311. """
  312. import fitz, copy
  313. import re
  314. import numpy as np
  315. from colorful import print亮黄, print亮绿
  316. fc = 0 # Index 0 文本
  317. fs = 1 # Index 1 字体
  318. fb = 2 # Index 2 框框
  319. REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 (比正文字体小,如参考文献、脚注、图注等)
  320. REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的?时,判定为不是正文(有些文章的正文部分字体大小不是100%统一的,有肉眼不可见的小变化)
  321. def primary_ffsize(l):
  322. """
  323. 提取文本块主字体
  324. """
  325. fsize_statiscs = {}
  326. for wtf in l['spans']:
  327. if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
  328. fsize_statiscs[wtf['size']] += len(wtf['text'])
  329. return max(fsize_statiscs, key=fsize_statiscs.get)
  330. def ffsize_same(a,b):
  331. """
  332. 提取字体大小是否近似相等
  333. """
  334. return abs((a-b)/max(a,b)) < 0.02
  335. with fitz.open(fp) as doc:
  336. meta_txt = []
  337. meta_font = []
  338. meta_line = []
  339. meta_span = []
  340. ############################## <第 1 步,搜集初始信息> ##################################
  341. for index, page in enumerate(doc):
  342. # file_content += page.get_text()
  343. text_areas = page.get_text("dict") # 获取页面上的文本信息
  344. for t in text_areas['blocks']:
  345. if 'lines' in t:
  346. pf = 998
  347. for l in t['lines']:
  348. txt_line = "".join([wtf['text'] for wtf in l['spans']])
  349. if len(txt_line) == 0: continue
  350. pf = primary_ffsize(l)
  351. meta_line.append([txt_line, pf, l['bbox'], l])
  352. for wtf in l['spans']: # for l in t['lines']:
  353. meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
  354. # meta_line.append(["NEW_BLOCK", pf])
  355. # 块元提取 for each word segment with in line for each line cross-line words for each block
  356. meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
  357. '- ', '') for t in text_areas['blocks'] if 'lines' in t])
  358. meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
  359. for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
  360. if index == 0:
  361. page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
  362. '- ', '') for t in text_areas['blocks'] if 'lines' in t]
  363. ############################## <第 2 步,获取正文主字体> ##################################
  364. try:
  365. fsize_statiscs = {}
  366. for span in meta_span:
  367. if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
  368. fsize_statiscs[span[1]] += span[2]
  369. main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
  370. if REMOVE_FOOT_NOTE:
  371. give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
  372. except:
  373. raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。')
  374. ############################## <第 3 步,切分和重新整合> ##################################
  375. mega_sec = []
  376. sec = []
  377. for index, line in enumerate(meta_line):
  378. if index == 0:
  379. sec.append(line[fc])
  380. continue
  381. if REMOVE_FOOT_NOTE:
  382. if meta_line[index][fs] <= give_up_fize_threshold:
  383. continue
  384. if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
  385. # 尝试识别段落
  386. if meta_line[index][fc].endswith('.') and\
  387. (meta_line[index-1][fc] != 'NEW_BLOCK') and \
  388. (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
  389. sec[-1] += line[fc]
  390. sec[-1] += "\n\n"
  391. else:
  392. sec[-1] += " "
  393. sec[-1] += line[fc]
  394. else:
  395. if (index+1 < len(meta_line)) and \
  396. meta_line[index][fs] > main_fsize:
  397. # 单行 + 字体大
  398. mega_sec.append(copy.deepcopy(sec))
  399. sec = []
  400. sec.append("# " + line[fc])
  401. else:
  402. # 尝试识别section
  403. if meta_line[index-1][fs] > meta_line[index][fs]:
  404. sec.append("\n" + line[fc])
  405. else:
  406. sec.append(line[fc])
  407. mega_sec.append(copy.deepcopy(sec))
  408. finals = []
  409. for ms in mega_sec:
  410. final = " ".join(ms)
  411. final = final.replace('- ', ' ')
  412. finals.append(final)
  413. meta_txt = finals
  414. ############################## <第 4 步,乱七八糟的后处理> ##################################
  415. def 把字符太少的块清除为回车(meta_txt):
  416. for index, block_txt in enumerate(meta_txt):
  417. if len(block_txt) < 100:
  418. meta_txt[index] = '\n'
  419. return meta_txt
  420. meta_txt = 把字符太少的块清除为回车(meta_txt)
  421. def 清理多余的空行(meta_txt):
  422. for index in reversed(range(1, len(meta_txt))):
  423. if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
  424. meta_txt.pop(index)
  425. return meta_txt
  426. meta_txt = 清理多余的空行(meta_txt)
  427. def 合并小写开头的段落块(meta_txt):
  428. def starts_with_lowercase_word(s):
  429. pattern = r"^[a-z]+"
  430. match = re.match(pattern, s)
  431. if match:
  432. return True
  433. else:
  434. return False
  435. # 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
  436. if starts_with_lowercase_word(meta_txt[0]):
  437. meta_txt[0] = meta_txt[0].capitalize()
  438. for _ in range(100):
  439. for index, block_txt in enumerate(meta_txt):
  440. if starts_with_lowercase_word(block_txt):
  441. if meta_txt[index-1] != '\n':
  442. meta_txt[index-1] += ' '
  443. else:
  444. meta_txt[index-1] = ''
  445. meta_txt[index-1] += meta_txt[index]
  446. meta_txt[index] = '\n'
  447. return meta_txt
  448. meta_txt = 合并小写开头的段落块(meta_txt)
  449. meta_txt = 清理多余的空行(meta_txt)
  450. meta_txt = '\n'.join(meta_txt)
  451. # 清除重复的换行
  452. for _ in range(5):
  453. meta_txt = meta_txt.replace('\n\n', '\n')
  454. # 换行 -> 双换行
  455. meta_txt = meta_txt.replace('\n', '\n\n')
  456. ############################## <第 5 步,展示分割效果> ##################################
  457. # for f in finals:
  458. # print亮黄(f)
  459. # print亮绿('***************************')
  460. return meta_txt, page_one_meta
  461. def get_files_from_everything(txt, type): # type='.md'
  462. """
  463. 这个函数是用来获取指定目录下所有指定类型(如.md)的文件,并且对于网络上的文件,也可以获取它。
  464. 下面是对每个参数和返回值的说明:
  465. 参数
  466. - txt: 路径或网址,表示要搜索的文件或者文件夹路径或网络上的文件。
  467. - type: 字符串,表示要搜索的文件类型。默认是.md。
  468. 返回值
  469. - success: 布尔值,表示函数是否成功执行。
  470. - file_manifest: 文件路径列表,里面包含以指定类型为后缀名的所有文件的绝对路径。
  471. - project_folder: 字符串,表示文件所在的文件夹路径。如果是网络上的文件,就是临时文件夹的路径。
  472. 该函数详细注释已添加,请确认是否满足您的需要。
  473. """
  474. import glob, os
  475. success = True
  476. if txt.startswith('http'):
  477. # 网络的远程文件
  478. import requests
  479. from toolbox import get_conf
  480. from toolbox import get_log_folder, gen_time_str
  481. proxies = get_conf('proxies')
  482. try:
  483. r = requests.get(txt, proxies=proxies)
  484. except:
  485. raise ConnectionRefusedError(f"无法下载资源{txt},请检查。")
  486. path = os.path.join(get_log_folder(plugin_name='web_download'), gen_time_str()+type)
  487. with open(path, 'wb+') as f: f.write(r.content)
  488. project_folder = get_log_folder(plugin_name='web_download')
  489. file_manifest = [path]
  490. elif txt.endswith(type):
  491. # 直接给定文件
  492. file_manifest = [txt]
  493. project_folder = os.path.dirname(txt)
  494. elif os.path.exists(txt):
  495. # 本地路径,递归搜索
  496. project_folder = txt
  497. file_manifest = [f for f in glob.glob(f'{project_folder}/**/*'+type, recursive=True)]
  498. if len(file_manifest) == 0:
  499. success = False
  500. else:
  501. project_folder = None
  502. file_manifest = []
  503. success = False
  504. return success, file_manifest, project_folder
  505. @Singleton
  506. class nougat_interface():
  507. def __init__(self):
  508. self.threadLock = threading.Lock()
  509. def nougat_with_timeout(self, command, cwd, timeout=3600):
  510. import subprocess
  511. from toolbox import ProxyNetworkActivate
  512. logging.info(f'正在执行命令 {command}')
  513. with ProxyNetworkActivate("Nougat_Download"):
  514. process = subprocess.Popen(command, shell=True, cwd=cwd, env=os.environ)
  515. try:
  516. stdout, stderr = process.communicate(timeout=timeout)
  517. except subprocess.TimeoutExpired:
  518. process.kill()
  519. stdout, stderr = process.communicate()
  520. print("Process timed out!")
  521. return False
  522. return True
  523. def NOUGAT_parse_pdf(self, fp, chatbot, history):
  524. from toolbox import update_ui_lastest_msg
  525. yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...",
  526. chatbot=chatbot, history=history, delay=0)
  527. self.threadLock.acquire()
  528. import glob, threading, os
  529. from toolbox import get_log_folder, gen_time_str
  530. dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
  531. os.makedirs(dst)
  532. yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)",
  533. chatbot=chatbot, history=history, delay=0)
  534. self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
  535. res = glob.glob(os.path.join(dst,'*.mmd'))
  536. if len(res) == 0:
  537. self.threadLock.release()
  538. raise RuntimeError("Nougat解析论文失败。")
  539. self.threadLock.release()
  540. return res[0]
  541. def try_install_deps(deps, reload_m=[]):
  542. import subprocess, sys, importlib
  543. for dep in deps:
  544. subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep])
  545. import site
  546. importlib.reload(site)
  547. for m in reload_m:
  548. importlib.reload(__import__(m))
  549. def get_plugin_arg(plugin_kwargs, key, default):
  550. # 如果参数是空的
  551. if (key in plugin_kwargs) and (plugin_kwargs[key] == ""): plugin_kwargs.pop(key)
  552. # 正常情况
  553. return plugin_kwargs.get(key, default)