test_human_eval.py

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
import pytest
import os
import torch
from deepspeed.accelerator import get_accelerator
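
# This test compares HumanEval pass@k numbers from a plain HuggingFace
# `transformers` pipeline against a DeepSpeed-MII pipeline for the same model.
# It is gated behind the `evaluation` pytest marker; assuming the human-eval
# package and its dataset are available locally, a typical invocation is:
#   pytest -m evaluation test_human_eval.py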


@pytest.mark.evaluation
@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
def test_human_eval(model_name):
    # Dependencies needed only for this evaluation are imported inside the
    # test body.
    import mii
    import numpy
    from transformers import pipeline
    from human_eval.data import write_jsonl, read_problems
    from human_eval.evaluation import evaluate_functional_correctness

    def generate_base_completion(pipe, problem_prompt: str) -> str:
        # The HuggingFace pipeline returns a list of dicts keyed by "generated_text".
        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]

    def generate_mii_completion(pipe, problem_prompt: str) -> str:
        # The MII pipeline returns response objects with a `generated_text` attribute.
        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text

    def generate_samples(pipe, generation_function):
        # Draw num_samples_per_task completions for every HumanEval task.
        samples = [
            dict(task_id=task_id,
                 completion=generation_function(pipe, problems[task_id]["prompt"]))
            for task_id in problems for _ in range(num_samples_per_task)
        ]
        return samples

    # Loading Problems (the path expects a local checkout of the human-eval repo)
    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
    num_samples_per_task = 20
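
    # HumanEval contains 164 problems, so each pipeline below generates
    # 164 x 20 completions; expect the full run to be expensive.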

    # Initializing HuggingFace Pipeline (LOCAL_RANK is set by distributed
    # launchers such as torchrun or the deepspeed launcher; it defaults to 0)
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(get_accelerator().device_name(local_rank))
    base_pipe = pipeline(model=model_name,
                         device=device,
                         max_length=512,
                         return_full_text=False)

    # Generating Base Samples
    base_samples = generate_samples(base_pipe, generate_base_completion)

    # Base Pipeline Teardown: release accelerator memory before the MII
    # engine loads its own copy of the model.
    del base_pipe
    get_accelerator().empty_cache()

    # Initializing DeepSpeed-MII Pipeline
    mii_pipe = mii.pipeline(model_name)

    # Generating MII Samples
    mii_samples = generate_samples(mii_pipe, generate_mii_completion)

    # MII Pipeline Teardown
    mii_pipe.destroy()

    # Writing Samples
    write_jsonl("base_samples.jsonl", base_samples)
    write_jsonl("mii_samples.jsonl", mii_samples)

    # Evaluating Samples
    base_results = evaluate_functional_correctness("base_samples.jsonl")
    mii_results = evaluate_functional_correctness("mii_samples.jsonl")
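
    # evaluate_functional_correctness returns a dict of pass@k scores
    # (e.g. {"pass@1": ...}); the assertions check that both pipelines agree
    # on every reported metric within a 10% relative tolerance.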

    # Executing Assertions
    for key in base_results.keys():
        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."