'''Copyright The Microsoft DeepSpeed Team'''
""" debug utils """

# For lazy import with printflock()
fcntl = None

# for debug purposes map module and param objects to their fully qualified names
module_names = {}
param_names = {}


def debug_extract_module_and_param_names(model):
    # extract the fully qualified names as soon as the model is acquired
    global module_names
    global param_names
    # XXX: can probably make a map of param2module and vice-versa
    module_names = {module: name for name, module in model.named_modules()}
    param_names = {param: name for name, param in model.named_parameters()}
def debug_module2name(module):
    if module in module_names:
        return module_names[module]
    else:
        return "unknown"


def debug_module2name_id(module):
    return f"name={debug_module2name(module)} id={module.id}"


def debug_module2name_class(module):
    return f"name={debug_module2name(module)} {module.__class__.__name__}"


def debug_param2name(param):
    if param in param_names:
        return param_names[param]
    else:
        return "unknown"


# The id/shape/status variants below rely on the ds_id and ds_status attributes
# that DeepSpeed's ZeRO-3 parameter partitioning attaches to each parameter.
def debug_param2name_id(param):
    return f"name={debug_param2name(param)} id={param.ds_id}"


def debug_param2name_id_shape(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape}"


def debug_param2name_id_shape_device(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape} device={param.device}"


def debug_param2name_id_numel(param):
    return f"name={debug_param2name(param)} id={param.ds_id} numel={param.numel()}"


def debug_param2name_id_shape_status(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape} status={param.ds_status}"
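

# --- Illustrative usage sketch (not part of the original utilities) ---
# A minimal example of how the name maps above are meant to be used: call
# debug_extract_module_and_param_names() once, right after the model is built,
# then the lookup helpers can translate module/param objects back to their
# fully qualified names. Assumes only that torch is installed; the tiny model
# here is hypothetical.
def _example_extract_names():
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
    debug_extract_module_and_param_names(model)

    # e.g. prints "0" for the first Linear and "0.weight" for its weight parameter
    first_layer = model[0]
    print(debug_module2name(first_layer))
    print(debug_param2name(first_layer.weight))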
def printflock(*msgs):
    """
    Print messages from all concurrent gpus without getting interleaved text.

    This is useful when debugging issues where multi-gpus don't sync.

    1. Enable the force debug in say partitioning and zero3 files
    2. Override the usual versions with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            printflock(f"[{rank}] {message}")

    3. run the program and you get all the logs non-interleaved

    But this makes it very difficult to make sense of the output, so the ``log_rank_file`` helper
    function might be more useful, as it's easier to send each log stream into a separate file and
    then compare those.
    """
    global fcntl
    if fcntl is None:
        import fcntl

    # serialize stdout across processes via an exclusive lock on this file
    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)


# file handle used by log_rank_file(); opened lazily on first call
fh = None
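

# --- Illustrative usage sketch (not part of the original utilities) ---
# A hedged stand-alone way to try printflock() without a distributed launcher:
# tag the message with the current pid instead of a rank. Under deepspeed one
# would use deepspeed.comm.get_rank() as shown in the docstring above.
def _example_printflock():
    import os

    printflock(f"[pid {os.getpid()}] hello from printflock")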
def log_rank_file(rank, *msgs):
    """
    Print to a log file of the given rank.

    This is useful for debugging hangs in processes that are supposed to stay in sync.
    Here is a possible workflow:

    1. Enable the force debug in say partitioning and zero3 files
    2. Override the usual versions of print_rank_0 in those files with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            log_rank_file(rank, message)

    3. run the program
    4. fix up the expected differences, e.g. different cuda numbers ::

        perl -pi -e 's|cuda:1|cuda:0|' log_rank_*

    5. now diff and see where names and ids diverge - you will find where the gpus don't do the
       same work (e.g. when some layers get conditionally skipped on one gpu but not all) ::

        diff -u log_rank_0.txt log_rank_1.txt | less
    """
    global fh
    if fh is None:
        fh = open(f"log_rank_{rank}.txt", "w")
    for m in msgs:
        fh.write(f"{m}\n")
        fh.flush()
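

# --- Illustrative usage sketch (not part of the original utilities) ---
# A hedged stand-alone example of the per-rank logging workflow described in
# the docstring above: outside of a deepspeed run there is no communicator, so
# the rank here is read from the RANK environment variable (an assumption,
# defaulting to 0) instead of deepspeed.comm.get_rank().
def _example_log_rank_file():
    import os

    rank = int(os.environ.get("RANK", "0"))
    log_rank_file(rank, "entering partitioning step", "param ids: 0 1 2")
    # after running on each rank, compare the streams with e.g.:
    #   diff -u log_rank_0.txt log_rank_1.txt | less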
def print_backward_tensors(tensor):
    def _print_bwd_tensors(grad_fn):
        # walk the autograd graph: leaf nodes (AccumulateGrad) expose the tensor
        # via .variable, other grad_fn nodes are recursed into
        print(f"Backward tensors in {grad_fn}")
        for funcs in grad_fn.next_functions:
            if funcs[0]:
                try:
                    tensor = getattr(funcs[0], 'variable')
                    print(funcs[0])
                    print(
                        f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}"
                    )
                except AttributeError:
                    _print_bwd_tensors(funcs[0])

    if hasattr(tensor, 'grad_fn'):
        _print_bwd_tensors(tensor.grad_fn)
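

# --- Illustrative usage sketch (not part of the original utilities) ---
# A minimal sketch of walking the autograd graph with print_backward_tensors():
# build a tiny computation on requires_grad tensors and inspect the graph behind
# its output. Assumes only that torch is installed; the tensors and shapes here
# are hypothetical.
def _example_print_backward_tensors():
    import torch

    w = torch.randn(3, 3, requires_grad=True)
    x = torch.randn(3)
    loss = (w @ x).sum()
    # prints each grad_fn node and the leaf tensors (e.g. w) it connects to
    print_backward_tensors(loss)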