# nv-h100.yml

name: nv-h100
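
# Runs nightly at 00:00 UTC and can also be launched manually from the Actions tab.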
on:
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:
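
# One run per workflow/ref pair; a newer run cancels an older in-flight one.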
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
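
# Read-only access to repository contents; issue write access is needed by the
# nightly failure-report step at the end of the job.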
permissions:
  contents: read
  issues: write
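
# Single job on a self-hosted H100 runner, executed inside the NGC PyTorch 23.03
# container with all GPUs exposed and 8G of shared memory.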
jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, h100]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3
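
      # Sanity check: confirm the GPUs and the container's bundled torch/CUDA stack are visible.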
      - name: Check container state
        run: |
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
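
      # Build Hugging Face transformers from source at the current main HEAD;
      # the short hash is printed so the tested commit is recorded in the logs.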
      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
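
      # Install the checked-out DeepSpeed tree with the dev, 1bit, and autotuning extras.
      # Each run step starts back in the workspace root, so `pip install .` here targets this
      # repository, not the transformers clone above. ds_report summarizes op compatibility.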
      - name: Install deepspeed
        run: |
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning]
          ds_report
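
      # Record the full package list so the exact environment is preserved in the logs.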
      - name: Python environment
        run: |
          python -m pip list
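
      # Run the unit suite in parallel with 4 pytest-xdist workers, then the tests marked
      # 'sequential'. $PYTEST_OPTS is expected from the runner environment (may be empty);
      # --torch_ver/--cuda_ver are options defined by DeepSpeed's test suite, assumed here
      # to gate version-specific tests.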
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.0" --cuda_ver="12"
          python -m pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.0" --cuda_ver="12"
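
      # Only failures of scheduled (nightly) runs file a CI-failure issue from the template;
      # update_existing reuses an already-open report instead of creating duplicates.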
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
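
# A minimal sketch for launching this workflow by hand, assuming the GitHub CLI (gh)
# is installed and authenticated for this repository:
#
#   gh workflow run nv-h100.yml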