# browser.py
  1. """ 网页浏览器,获取网页内容"""
  2. from urllib.parse import urljoin
  3. import requests
  4. from webdriver_manager.microsoft import EdgeChromiumDriverManager
  5. from selenium import webdriver
  6. from selenium.webdriver.edge.service import Service as EdgeService
  7. from selenium.webdriver.edge.options import Options as EdgeOptions
  8. from selenium.webdriver.common.by import By
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.support import expected_conditions as EC
  11. from bs4 import BeautifulSoup, NavigableString
  12. class Browser:
  13. """ 网页浏览器 """
  14. def __init__(self, proxy: str = None) -> None:
  15. self.proxy = proxy
  16. self.driver = self.create_driver()
  17. def create_driver(self):
  18. """ 创建 Selenium Edge web drive"""
  19. edge_options = EdgeOptions() # 创建Edge选项
  20. edge_options.add_argument("--headless") # 启用无头模式
  21. edge_options.add_argument('--disable-gpu') # 禁用GPU加速
  22. edge_options.add_argument('--disable-software-rasterizer') # 禁用软件光栅化器
  23. edge_options.add_argument('--no-sandbox') # 解决DevToolsActivePort文件不存在的报错
  24. edge_options.add_argument('--disable-dev-shm-usage') # 解决资源有限的问题
  25. edge_options.add_argument('log-level=3') # INFO = 0 WARNING = 1 LOG_ERROR = 2 LOG_FATAL = 3 default is 0
  26. edge_options.add_experimental_option('excludeSwitches', ['enable-automation']) # 实现了规避监测
  27. edge_options.add_experimental_option('excludeSwitches', ['enable-logging']) # 省略log
  28. edge_options.add_argument('--mute-audio') # 禁用音频
  29. edge_options.add_argument('--disable-extensions') # 禁用扩展
  30. edge_options.add_argument('--disable-popup-blocking') # 禁用弹出窗口拦截
  31. edge_options.add_argument('--disable-plugins') # 禁用插件
  32. if self.proxy: # 添加代理
  33. edge_options.add_argument(f'--proxy-server={self.proxy}')
  34. # 配置浏览器设置以忽略图片
  35. prefs = {"profile.managed_default_content_settings.images": 2}
  36. edge_options.add_experimental_option("prefs", prefs)
  37. driver = webdriver.Edge(
  38. service=EdgeService(EdgeChromiumDriverManager().install()),
  39. options=edge_options)
  40. return driver
  41. def wait_for_page_load(self, timeout=5):
  42. WebDriverWait(self.driver, timeout).until(
  43. EC.presence_of_element_located((By.TAG_NAME, "body"))
  44. )
  45. def full_text(self, soup: BeautifulSoup) -> str:
  46. fulltext = soup.get_text(separator='\n', strip=True)
  47. return fulltext
  48. def get_image_file_size(self, img_url: str) -> int:
  49. """ 获取图片文件大小"""
  50. try:
  51. response = requests.head(img_url, allow_redirects=True, timeout=3)
  52. if 'Content-Length' in response.headers:
  53. return int(response.headers['Content-Length'])
  54. except Exception as e:
  55. print(f"Error fetching image size {img_url}: {e}")
  56. return 0
  57. def get_the_image(self, soup: BeautifulSoup, base_url: str) -> str:
  58. """ 获得网页中最大最具代表性图片 """
  59. # Define keywords that are usually associated with unwanted images
  60. unwanted_keywords = ['background', 'banner', 'ad', 'advertisement', 'footer', 'header', 'logo']
  61. images = soup.find_all('img')
  62. # 获取每张图片的实际尺寸和URL
  63. image_data = []
  64. for img in images:
  65. src = img.get('src')
  66. if not src:
  67. continue
  68. # Convert relative URL to absolute URL
  69. full_src = urljoin(base_url, src)
  70. # Filter out unwanted images based on keywords in their class or id attributes
  71. if any(keyword in img.get('class', []) for keyword in unwanted_keywords):
  72. continue
  73. if any(keyword in img.get('id', '') for keyword in unwanted_keywords):
  74. continue
  75. if any(keyword in full_src for keyword in unwanted_keywords):
  76. continue
  77. file_size = self.get_image_file_size(full_src)
  78. if file_size > 0:
  79. image_data.append((file_size, full_src))
  80. # 按文件大小排序,并获取最大的图片
  81. image_data.sort(reverse=True, key=lambda x: x[0])
  82. if image_data:
  83. return image_data[0][1]
  84. return None
  85. def get_headline_image(self, soup: BeautifulSoup) -> str:
  86. """ 获取网页meta中的头图URL"""
  87. meta_tags = soup.find_all('meta', property='og:image')
  88. if meta_tags:
  89. return meta_tags[0]['content']
  90. return None
  91. def webpage_content(self, url: str, get_image: bool = True) -> tuple:
  92. """ 访问网页,读取内容返回文本和一张图片 """
  93. self.driver.get(url)
  94. # Wait for the page to load completely
  95. self.wait_for_page_load()
  96. html_content = self.driver.page_source
  97. # 使用BeautifulSoup解析HTML
  98. soup = BeautifulSoup(html_content, 'html.parser')
  99. text = self.full_text(soup)
  100. image_url = None
  101. if get_image:
  102. image_url = self.get_the_image(soup, url)
  103. self.driver.close()
  104. return text, image_url
  105. if __name__ == "__main__":
  106. # 使用示例测试
  107. browser = Browser()
  108. text, image_url = browser.webpage_content('https://mjcopilot.com')
  109. print(f"Text len = {len(text)}")
  110. print("Image URL:", image_url)