谷歌检索小助手.py

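# Google Scholar search assistant plugin (谷歌检索小助手):
# scrape a user-supplied Google Scholar results page, try to match each entry
# against arXiv to recover the full abstract, then ask the LLM to summarize the
# papers batch by batch as a Markdown table and offer the result as a download.
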
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from toolbox import CatchException, report_exception, promote_file_to_downloadzone
from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
import logging
import requests
import time
import random
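
# When True, a paper that fails a direct arXiv title match is looked up again
# through Google Scholar's "All versions" (cluster) page before giving up.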
ENABLE_ALL_VERSION_SEARCH = True

def get_meta_information(url, chatbot, history):
    import arxiv
    import difflib
    import re
    from bs4 import BeautifulSoup
    from toolbox import get_conf
    from urllib.parse import urlparse
    session = requests.session()
    proxies = get_conf('proxies')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Connection': 'keep-alive'
    }
    try:
        session.proxies.update(proxies)
    except:
        report_exception(chatbot, history,
                         a="获取代理失败 无代理状态下很可能无法访问OpenAI家族的模型及谷歌学术 建议:检查USE_PROXY选项是否修改。",
                         b="尝试直接连接")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
    session.headers.update(headers)

    response = session.get(url)
    # parse the Google Scholar results page
    soup = BeautifulSoup(response.text, "html.parser")

    def string_similar(s1, s2):
        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

    if ENABLE_ALL_VERSION_SEARCH:
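        # Open a Google Scholar "All versions" (cluster) page and return the first
        # result that resolves to an arXiv paper, or None if none of them do.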
        def search_all_version(url):
            time.sleep(random.randint(1, 5))  # pause a little to avoid triggering Google's anti-crawler checks
            response = session.get(url)
            soup = BeautifulSoup(response.text, "html.parser")

            for result in soup.select(".gs_ri"):
                try:
                    url = result.select_one(".gs_rt").a['href']
                except:
                    continue
                arxiv_id = extract_arxiv_id(url)
                if not arxiv_id:
                    continue
                search = arxiv.Search(
                    id_list=[arxiv_id],
                    max_results=1,
                    sort_by=arxiv.SortCriterion.Relevance,
                )
                try: paper = next(search.results())
                except: paper = None
                return paper

            return None

        def extract_arxiv_id(url):
            # return the arxiv_id parsed from the given url, or None if nothing matches
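            # e.g. 'https://arxiv.org/abs/1706.03762' -> '1706.03762'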
            pattern = r'arxiv.org/abs/([^/]+)'
            match = re.search(pattern, url)
            if match:
                return match.group(1)
            else:
                return None

    profile = []
    # collect the title and authors of every article on the page
    for result in soup.select(".gs_ri"):
        title = result.a.text.replace('\n', ' ').replace('  ', ' ')
        author = result.select_one(".gs_a").text
        try:
            citation = result.select_one(".gs_fl > a[href*='cites']").text  # the citation count is the text of the "cited by" link
        except:
            citation = 'cited by 0'
        abstract = result.select_one(".gs_rs").text.strip()  # the abstract lives in .gs_rs; strip surrounding whitespace

        # first search arxiv by title, to try to fetch the full abstract
        search = arxiv.Search(
            query=title,
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        try: paper = next(search.results())
        except: paper = None
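
        # treat the Scholar entry and the arXiv hit as the same paper only when
        # their titles are more than 90% similar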
        is_match = paper is not None and string_similar(title, paper.title) > 0.90

        # if the direct arXiv match failed, also try the titles of the article's other versions
        if not is_match and ENABLE_ALL_VERSION_SEARCH:
            other_versions_page_url = [tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']]
            if len(other_versions_page_url) > 0:
                other_versions_page_url = other_versions_page_url[0]
                paper = search_all_version('http://' + urlparse(url).netloc + other_versions_page_url)
                is_match = paper is not None and string_similar(title, paper.title) > 0.90

        if is_match:
            # same paper: use the full abstract from arXiv
            abstract = paper.summary.replace('\n', ' ')
            is_paper_in_arxiv = True
        else:
            # different paper: keep the (possibly truncated) abstract scraped from Google Scholar
            is_paper_in_arxiv = False

        logging.info('[title]:' + title)
        logging.info('[author]:' + author)
        logging.info('[citation]:' + citation)

        profile.append({
            'title': title,
            'author': author,
            'citation': citation,
            'abstract': abstract,
            'is_paper_in_arxiv': is_paper_in_arxiv,
        })
        chatbot[-1] = [chatbot[-1][0], title + f'\n\n是否在arxiv中(不在arxiv中无法获取完整摘要):{is_paper_in_arxiv}\n\n' + abstract]
        yield from update_ui(chatbot=chatbot, history=[])  # refresh the UI
    return profile
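

# get_meta_information is a generator: it yields UI refreshes while it scrapes,
# and hands back the list of paper dicts as its return value, so callers drive
# it with `yield from`, as the plugin entry point below does.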
@CatchException
def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    disable_auto_promotion(chatbot=chatbot)
    # basic information: what the plugin does and who contributed it
    chatbot.append([
        "函数插件功能?",
        "分析用户提供的谷歌学术(google scholar)搜索页面中,出现的所有文章: binary-husky,插件初始化中..."])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # try to import the dependencies; if any are missing, suggest how to install them
    try:
        import arxiv
        import math
        from bs4 import BeautifulSoup
    except:
        report_exception(chatbot, history,
                         a=f"解析项目: {txt}",
                         b="导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade beautifulsoup4 arxiv```。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # clear the history to avoid overflowing the model's input
    history = []
    meta_paper_info_list = yield from get_meta_information(txt, chatbot, history)
    if len(meta_paper_info_list) == 0:
        yield from update_ui_lastest_msg(lastmsg='获取文献失败,可能触发了google反爬虫机制。', chatbot=chatbot, history=history, delay=0)
        return
    batchsize = 5
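    # feed the papers to the LLM five at a time so each prompt stays within the
    # model's context window; each batch's table is appended to the history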
    for batch in range(math.ceil(len(meta_paper_info_list)/batchsize)):
        if len(meta_paper_info_list[:batchsize]) > 0:
            i_say = "下面是一些学术文献的数据,提取出以下内容:" + \
                    "1、英文题目;2、中文题目翻译;3、作者;4、arxiv公开(is_paper_in_arxiv);5、引用数量(cite);6、中文摘要翻译。" + \
                    f"以下是信息源:{str(meta_paper_info_list[:batchsize])}"
            inputs_show_user = f"请分析此页面中出现的所有文章:{txt},这是第{batch+1}批"
            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
                inputs=i_say, inputs_show_user=inputs_show_user,
                llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
                sys_prompt="你是一个学术翻译,请从数据中提取信息。你必须使用Markdown表格。你必须逐个文献进行处理。"
            )
            history.extend([f"第{batch+1}批", gpt_say])
            meta_paper_info_list = meta_paper_info_list[batchsize:]

    chatbot.append(["状态?",
                    "已经全部完成,您可以试试让AI写一个Related Works,例如您可以继续输入Write a \"Related Works\" section about \"你搜索的研究领域\" for me."])
    msg = '正常'
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)  # refresh the UI
    path = write_history_to_file(history)
    promote_file_to_downloadzone(path, chatbot=chatbot)
    chatbot.append(("完成了吗?", path))
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)  # refresh the UI