debug.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# For lazy import with printflock()
fcntl = None

# for debug purposes map module and param objects to their fully qualified names
module_names = {}
param_names = {}


def debug_extract_module_and_param_names(model):
    # extract the fully qualified names as soon as the model is acquired
    global module_names
    global param_names
    # XXX: can probably make a map of param2module and vice-versa
    module_names = {module: name for name, module in model.named_modules()}
    param_names = {param: name for name, param in model.named_parameters()}


def debug_module2name(module):
    if module in module_names:
        return module_names[module]
    else:
        return "unknown"


def debug_module2name_id(module):
    return f"name={debug_module2name(module)} id={module.id}"


def debug_module2name_class(module):
    return f"name={debug_module2name(module)} {module.__class__.__name__}"


def debug_param2name(param):
    if param in param_names:
        return param_names[param]
    else:
        return "unknown"


def debug_param2name_id(param):
    return f"name={debug_param2name(param)} id={param.ds_id}"


def debug_param2name_id_shape(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape}"


def debug_param2name_id_shape_device(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape} device={param.device}"


def debug_param2name_id_numel(param):
    return f"name={debug_param2name(param)} id={param.ds_id} numel={param.numel()}"


def debug_param2name_id_shape_status(param):
    return f"name={debug_param2name(param)} id={param.ds_id} shape={param.data.shape} status={param.ds_status}"


def printflock(*msgs):
    """
    Print messages from all concurrent gpus without getting interleaved text.

    This is useful when debugging issues where multiple gpus don't sync.

    1. Enable the force debug in, say, the partitioning and zero3 files
    2. Override the usual versions with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            printflock(f"[{rank}] {message}")

    3. Run the program and the logs from all ranks come out non-interleaved

    But this makes it very difficult to make sense of the output, so the ``log_rank_file`` helper
    function might be more useful, as it's easier to send each rank's log stream to a separate
    file and then compare those.
    """
    global fcntl
    if fcntl is None:
        import fcntl

    with open(__file__, "r") as fh:
        fcntl.flock(fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(fh, fcntl.LOCK_UN)


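# Example usage (a sketch; the surrounding rank logic is an assumption about how
# one might call printflock from a distributed run, not prescribed by this file):
# emit a tagged, non-interleaved line from every rank around a suspect op.
def _example_printflock():  # hypothetical demo helper, never called by DeepSpeed
    import deepspeed

    rank = deepspeed.comm.get_rank()
    printflock(f"[{rank}] about to enter the collective that appears to hang")

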
# global file handle shared across log_rank_file() calls (each rank/process opens its own file)
fh = None


def log_rank_file(rank, *msgs):
    """
    Print to a log file of the given rank.

    This is useful for debugging hangs in processes that are supposed to stay in sync. Here is a
    possible workflow:

    1. Enable the force debug in, say, the partitioning and zero3 files
    2. Override the usual versions of print_rank_0 in those files with ::

        def print_rank_0(message, debug=False, force=False):
            rank = deepspeed.comm.get_rank()
            log_rank_file(rank, message)

    3. Run the program
    4. Fix up the expected differences, e.g. different cuda numbers ::

        perl -pi -e 's|cuda:1|cuda:0|' log_rank_*

    5. Now diff and see where names and ids diverge - you will find where the gpus don't do the
       same work (e.g. when some layers get conditionally skipped on one gpu but not all) ::

        diff -u log_rank_0.txt log_rank_1.txt | less
    """
    global fh
    if fh is None:
        fh = open(f"log_rank_{rank}.txt", "w")
    for m in msgs:
        fh.write(f"{m}\n")
    fh.flush()


def print_backward_tensors(tensor):

    def _print_bwd_tensors(grad_fn):
        print(f"Backward tensors in {grad_fn}")
        for funcs in grad_fn.next_functions:
            if funcs[0]:
                try:
                    # leaf nodes (AccumulateGrad) expose their tensor via .variable
                    tensor = getattr(funcs[0], 'variable')
                    print(funcs[0])
                    print(f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}")
                except AttributeError:
                    # not a leaf - recurse further up the graph
                    _print_bwd_tensors(funcs[0])

    if getattr(tensor, 'grad_fn', None) is not None:
        _print_bwd_tensors(tensor.grad_fn)


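# Example usage (a minimal sketch, assuming plain torch; the model and helper
# name are made up): run a forward pass and walk the autograd graph of the
# result to see every grad_fn and the leaf tensors (weight, bias) feeding it.
def _example_print_backward_tensors():  # hypothetical demo helper, never called by DeepSpeed
    import torch

    model = torch.nn.Linear(3, 2)
    loss = model(torch.randn(1, 3)).sum()
    print_backward_tensors(loss)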