# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import torch
from deepspeed.accelerator import get_accelerator


@pytest.mark.evaluation
@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
def test_human_eval(model_name):
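    """Generate HumanEval completions with a HuggingFace pipeline and a
    DeepSpeed-MII pipeline, then check that their functional-correctness
    results (pass@k) agree within a relative tolerance."""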
    import mii
    import numpy
    from transformers import pipeline
    from human_eval.data import write_jsonl, read_problems
    from human_eval.evaluation import evaluate_functional_correctness
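
    # The two backends return differently shaped results: the HuggingFace
    # pipeline yields a list of dicts, while the MII pipeline yields response
    # objects, so each needs its own accessor.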
    def generate_base_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]

    def generate_mii_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text

    def generate_samples(pipe, generation_function):
        samples = [
            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
            for task_id in problems for _ in range(num_samples_per_task)
        ]
        return samples
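
    # Each task gets num_samples_per_task completions so that
    # evaluate_functional_correctness can estimate pass@k from several samples.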
    # Loading Problems (assumes a human-eval checkout two directories up)
    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
    num_samples_per_task = 20

    # Initializing HuggingFace Pipeline
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(get_accelerator().device_name(local_rank))
    base_pipe = pipeline(model=model_name,
                         device=device,
                         max_length=512,
                         return_full_text=False)

    # Generating Base Samples
    base_samples = generate_samples(base_pipe, generate_base_completion)

    # Base Pipeline Teardown
    del base_pipe
    get_accelerator().empty_cache()

    # Initializing DeepSpeed-MII Pipeline
    mii_pipe = mii.pipeline(model_name)

    # Generating MII Samples
    mii_samples = generate_samples(mii_pipe, generate_mii_completion)

    # MII Pipeline Teardown
    mii_pipe.destroy()

    # Writing Samples
    write_jsonl("base_samples.jsonl", base_samples)
    write_jsonl("mii_samples.jsonl", mii_samples)

    # Evaluating Samples
    base_results = evaluate_functional_correctness("base_samples.jsonl")
    mii_results = evaluate_functional_correctness("mii_samples.jsonl")

    # Sampling-based generation is stochastic, so require the pass@k metrics
    # to agree within a 10% relative tolerance rather than exactly.
    for key in base_results:
        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."