# nv-megatron.yml
#
# CI workflow: installs PyTorch, DeepSpeed (from this checkout) and NVIDIA apex
# on a self-hosted V100 runner, then runs the Megatron-DeepSpeed test suite.

name: nv-megatron

on:
  pull_request:
    # Skip the workflow when a PR only touches documentation or blog content.
    paths-ignore:
      - 'docs/**'
      - 'blogs/**'
  merge_group:
    branches: [master]
  schedule:
    # Once a day at minute 0 of hour 0 (GitHub evaluates cron in UTC).
    - cron: "0 0 * * *"

# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v3

      # Local composite action from this repository.
      # NOTE(review): presumably this is what exports TORCH_CACHE and
      # PYTEST_OPTS used by later steps - confirm in ./.github/workflows/setup-venv.
      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          # Quoted so bash never treats [dev] as a glob character class.
          pip install ".[dev]"
          ds_report

      # apex is built into /blob/apex/ and only rebuilt when upstream HEAD
      # differs from the commit recorded in .venv_installed_version.
      - name: Install apex
        run: |
          git clone https://github.com/NVIDIA/apex.git
          cd apex
          CURRENT_VER=$(git rev-parse HEAD)
          # The step shell runs with errexit, so a missing marker file must not
          # abort the step: treat it as "nothing installed yet" and rebuild.
          INSTALLED_VER=$(cat /blob/apex/.venv_installed_version 2>/dev/null || true)
          if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
            pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
              --config-settings "--global-option=--cpp_ext" \
              --config-settings "--global-option=--cuda_ext" \
              --target=/blob/apex/ --upgrade .
            git rev-parse HEAD > /blob/apex/.venv_installed_version
          fi
          # Make the --target install importable in all subsequent steps.
          echo "PYTHONPATH=$PYTHONPATH:/blob/apex/" >> "$GITHUB_ENV"

      - name: Python environment
        run: |
          pip list

      - name: Megatron unit tests
        run: |
          git clone https://github.com/microsoft/Megatron-DeepSpeed.git
          cd Megatron-DeepSpeed
          pip install .
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          # PYTEST_OPTS is left unquoted on purpose so it splits into separate options.
          pytest $PYTEST_OPTS ./