"""
Code BLEU metric implementation
"""
import datasets
import evaluate
from codebleu import calc_codebleu
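# Equal weights for the four CodeBLEU components: n-gram match,
# weighted n-gram match, syntactic AST match, and semantic data-flow match.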
CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)
_CITATION = """\
@misc{ren2020codebleu,
title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
year={2020},
eprint={2009.10297},
archivePrefix={arXiv},
primaryClass={cs.SE}
}
"""
_DESCRIPTION = """
An ideal evaluation metric for code should consider both grammatical and logical correctness.
CodeBLEU uses weighted n-gram match and syntactic AST match to measure grammatical correctness, and introduces semantic data-flow match to capture logical correctness.
Source: https://pypi.org/project/codebleu/
"""
_KWARGS_DESCRIPTION = """
Computes the CodeBLEU score of generated code segments against one or more references.
Args:
predictions: list of code generations to score.
references: list of reference code solutions (or list of lists of references) for each code generation task.
Returns:
'codebleu_score': the CodeBLEU score averaged over all prediction/reference pairs.
Examples:
>>> predictions = ["def add ( a , b ) :\n return a + b"]
>>> references = ["def sum ( first , second ) :\n return second + first"]
>>> codebleu = evaluate.load("codebleu_score")
>>> results = codebleu.compute(predictions=predictions, references=references)
>>> print(results["codebleu_score"])
0.5537
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
reference_urls=[
"https://pypi.org/project/codebleu/",
],
)
    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between a single ground-truth reference and a generated answer.
        Supports C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust via the ``lang`` keyword.
        """
        result = calc_codebleu([ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None)
        # Only the combined "codebleu" value from the result dict is used here.
        return result["codebleu"]
    def _compute(self, references, predictions):
        """Average the per-example CodeBLEU score over all reference/prediction pairs."""
        scores = [self.compute_codebleu_score(ref, pred) for ref, pred in zip(references, predictions)]
        return {"codebleu_score": sum(scores) / len(scores)}
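
# Minimal usage sketch (not part of the original metric module): it assumes the
# `codebleu` package and its tree-sitter language parsers are installed and that this
# script is saved locally as code_bleu/code_bleu.py; the path below is illustrative.
if __name__ == "__main__":
    codebleu_metric = evaluate.load("code_bleu/code_bleu.py")
    demo_predictions = ["def add ( a , b ) :\n return a + b"]
    demo_references = ["def sum ( first , second ) :\n return second + first"]
    print(codebleu_metric.compute(predictions=demo_predictions, references=demo_references))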