csv_monitor.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. '''Copyright The Microsoft DeepSpeed Team'''
  2. from .monitor import Monitor
  3. import os
  4. import deepspeed.comm as dist
  5. class csvMonitor(Monitor):
  6. def __init__(self, csv_config):
  7. super().__init__(csv_config)
  8. self.filenames = []
  9. self.enabled = csv_config.enabled
  10. self.output_path = csv_config.output_path
  11. self.job_name = csv_config.job_name
  12. self.log_dir = self.setup_log_dir()
  13. def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "csv_monitor")):
  14. if self.enabled and dist.get_rank() == 0:
  15. if self.output_path is not None:
  16. log_dir = os.path.join(self.output_path, self.job_name)
  17. # NOTE: This code path currently is never used since the default tensorboard_output_path is an empty string and not None. Saving it in case we want this functionality in the future.
  18. else:
  19. if "DLWS_JOB_ID" in os.environ:
  20. infra_job_id = os.environ["DLWS_JOB_ID"]
  21. elif "DLTS_JOB_ID" in os.environ:
  22. infra_job_id = os.environ["DLTS_JOB_ID"]
  23. else:
  24. infra_job_id = "unknown-job-id"
  25. csv_monitor_dir_name = os.path.join(infra_job_id, "logs")
  26. log_dir = os.path.join(base, csv_monitor_dir_name, self.job_name)
  27. os.makedirs(log_dir, exist_ok=True)
  28. return log_dir
  29. def write_events(self, event_list):
  30. if self.enabled and dist.get_rank() == 0:
  31. import csv
  32. # We assume each event_list element is a tensorboard-style tuple in the format: (log_name: String, value, step: Int)
  33. for event in event_list:
  34. log_name = event[0]
  35. value = event[1]
  36. step = event[2]
  37. # Set the header to the log_name
  38. # Need this check because the deepspeed engine currently formats log strings to separate with '/'
  39. if '/' in log_name:
  40. record_splits = log_name.split('/')
  41. header = record_splits[len(record_splits) - 1]
  42. else:
  43. header = log_name
  44. # sanitize common naming conventions into filename
  45. filename = log_name.replace('/', '_').replace(' ', '_')
  46. fname = self.log_dir + '/' + filename + '.csv'
  47. # Open file and record event. Insert header if this is the first time writing
  48. with open(fname, 'a+') as csv_monitor_file:
  49. csv_monitor_writer = csv.writer(csv_monitor_file)
  50. if filename not in self.filenames:
  51. self.filenames.append(filename)
  52. csv_monitor_writer.writerow(['step', header])
  53. csv_monitor_writer.writerow([step, value])