dashboard.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232
  1. import argparse
  2. import asyncio
  3. import logging
  4. import logging.handlers
  5. import platform
  6. import traceback
  7. import ray.dashboard.consts as dashboard_consts
  8. import ray.dashboard.head as dashboard_head
  9. import ray.dashboard.utils as dashboard_utils
  10. import ray.ray_constants as ray_constants
  11. import ray._private.gcs_utils as gcs_utils
  12. import ray._private.services
  13. import ray._private.utils
  14. from ray._private.gcs_pubsub import gcs_pubsub_enabled, GcsPublisher
  15. from ray._private.ray_logging import setup_component_logger
# Logger for this module. It should be configured at the entry point
# into the program using Ray; Ray provides a default configuration at
# its entry/init points.
logger = logging.getLogger(__name__)
  20. class Dashboard:
  21. """A dashboard process for monitoring Ray nodes.
  22. This dashboard is made up of a REST API which collates data published by
  23. Reporter processes on nodes into a json structure, and a webserver
  24. which polls said API for display purposes.
  25. Args:
  26. host(str): Host address of dashboard aiohttp server.
  27. port(int): Port number of dashboard aiohttp server.
  28. port_retries(int): The retry times to select a valid port.
  29. gcs_address(str): GCS address of the cluster
  30. redis_address(str): Redis address of a Ray cluster
  31. redis_password(str): Redis password to access GCS
  32. log_dir(str): Log directory of dashboard.
  33. """
  34. def __init__(
  35. self,
  36. host,
  37. port,
  38. port_retries,
  39. gcs_address,
  40. redis_address,
  41. redis_password=None,
  42. log_dir=None,
  43. temp_dir=None,
  44. minimal=False,
  45. ):
  46. self.dashboard_head = dashboard_head.DashboardHead(
  47. http_host=host,
  48. http_port=port,
  49. http_port_retries=port_retries,
  50. gcs_address=gcs_address,
  51. redis_address=redis_address,
  52. redis_password=redis_password,
  53. log_dir=log_dir,
  54. temp_dir=temp_dir,
  55. minimal=minimal,
  56. )
  57. async def run(self):
  58. await self.dashboard_head.run()
  59. if __name__ == "__main__":
  60. parser = argparse.ArgumentParser(description="Ray dashboard.")
  61. parser.add_argument(
  62. "--host", required=True, type=str, help="The host to use for the HTTP server."
  63. )
  64. parser.add_argument(
  65. "--port", required=True, type=int, help="The port to use for the HTTP server."
  66. )
  67. parser.add_argument(
  68. "--port-retries",
  69. required=False,
  70. type=int,
  71. default=0,
  72. help="The retry times to select a valid port.",
  73. )
  74. parser.add_argument(
  75. "--gcs-address", required=False, type=str, help="The address (ip:port) of GCS."
  76. )
  77. parser.add_argument(
  78. "--redis-address", required=True, type=str, help="The address to use for Redis."
  79. )
  80. parser.add_argument(
  81. "--redis-password",
  82. required=False,
  83. type=str,
  84. default=None,
  85. help="The password to use for Redis",
  86. )
  87. parser.add_argument(
  88. "--logging-level",
  89. required=False,
  90. type=lambda s: logging.getLevelName(s.upper()),
  91. default=ray_constants.LOGGER_LEVEL,
  92. choices=ray_constants.LOGGER_LEVEL_CHOICES,
  93. help=ray_constants.LOGGER_LEVEL_HELP,
  94. )
  95. parser.add_argument(
  96. "--logging-format",
  97. required=False,
  98. type=str,
  99. default=ray_constants.LOGGER_FORMAT,
  100. help=ray_constants.LOGGER_FORMAT_HELP,
  101. )
  102. parser.add_argument(
  103. "--logging-filename",
  104. required=False,
  105. type=str,
  106. default=dashboard_consts.DASHBOARD_LOG_FILENAME,
  107. help="Specify the name of log file, "
  108. 'log to stdout if set empty, default is "{}"'.format(
  109. dashboard_consts.DASHBOARD_LOG_FILENAME
  110. ),
  111. )
  112. parser.add_argument(
  113. "--logging-rotate-bytes",
  114. required=False,
  115. type=int,
  116. default=ray_constants.LOGGING_ROTATE_BYTES,
  117. help="Specify the max bytes for rotating "
  118. "log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
  119. )
  120. parser.add_argument(
  121. "--logging-rotate-backup-count",
  122. required=False,
  123. type=int,
  124. default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
  125. help="Specify the backup count of rotated log file, default is {}.".format(
  126. ray_constants.LOGGING_ROTATE_BACKUP_COUNT
  127. ),
  128. )
  129. parser.add_argument(
  130. "--log-dir",
  131. required=True,
  132. type=str,
  133. default=None,
  134. help="Specify the path of log directory.",
  135. )
  136. parser.add_argument(
  137. "--temp-dir",
  138. required=True,
  139. type=str,
  140. default=None,
  141. help="Specify the path of the temporary directory use by Ray process.",
  142. )
  143. parser.add_argument(
  144. "--minimal",
  145. action="store_true",
  146. help=(
  147. "Minimal dashboard only contains a subset of features that don't "
  148. "require additional dependencies installed when ray is installed "
  149. "by `pip install ray[default]`."
  150. ),
  151. )
  152. args = parser.parse_args()
  153. if gcs_utils.use_gcs_for_bootstrap():
  154. args.redis_address = None
  155. args.redis_password = None
  156. else:
  157. args.gcs_address = None
  158. try:
  159. setup_component_logger(
  160. logging_level=args.logging_level,
  161. logging_format=args.logging_format,
  162. log_dir=args.log_dir,
  163. filename=args.logging_filename,
  164. max_bytes=args.logging_rotate_bytes,
  165. backup_count=args.logging_rotate_backup_count,
  166. )
  167. dashboard = Dashboard(
  168. args.host,
  169. args.port,
  170. args.port_retries,
  171. args.gcs_address,
  172. args.redis_address,
  173. redis_password=args.redis_password,
  174. log_dir=args.log_dir,
  175. temp_dir=args.temp_dir,
  176. minimal=args.minimal,
  177. )
  178. loop = asyncio.get_event_loop()
  179. loop.run_until_complete(dashboard.run())
  180. except Exception as e:
  181. traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
  182. message = (
  183. f"The dashboard on node {platform.uname()[1]} "
  184. f"failed with the following "
  185. f"error:\n{traceback_str}"
  186. )
  187. if isinstance(e, dashboard_utils.FrontendNotFoundError):
  188. logger.warning(message)
  189. else:
  190. logger.error(message)
  191. raise e
  192. # Something went wrong, so push an error to all drivers.
  193. redis_client = None
  194. gcs_publisher = None
  195. if gcs_pubsub_enabled():
  196. if gcs_utils.use_gcs_for_bootstrap():
  197. gcs_publisher = GcsPublisher(args.gcs_address)
  198. else:
  199. redis_client = ray._private.services.create_redis_client(
  200. args.redis_address, password=args.redis_password
  201. )
  202. gcs_publisher = GcsPublisher(
  203. address=gcs_utils.get_gcs_address_from_redis(redis_client)
  204. )
  205. redis_client = None
  206. else:
  207. redis_client = ray._private.services.create_redis_client(
  208. args.redis_address, password=args.redis_password
  209. )
  210. ray._private.utils.publish_error_to_driver(
  211. redis_client,
  212. ray_constants.DASHBOARD_DIED_ERROR,
  213. message,
  214. redis_client=redis_client,
  215. gcs_publisher=gcs_publisher,
  216. )