"""
CodeBLEU metric implementation.
"""

import datasets
import evaluate
from codebleu import calc_codebleu

CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)
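# With these equal weights, CodeBLEU is the even-weighted sum of its four components
# (this tuple is passed as the `weights` argument of `codebleu.calc_codebleu`):
#   codebleu = 0.25 * ngram_match + 0.25 * weighted_ngram_match
#            + 0.25 * syntax_ast_match + 0.25 * dataflow_match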

_CITATION = """\
@misc{ren2020codebleu,
    title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
    author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
    year={2020},
    eprint={2009.10297},
    archivePrefix={arXiv},
    primaryClass={cs.SE}
}
"""

_DESCRIPTION = """
An ideal evaluation metric for code synthesis should consider both grammatical correctness and logical correctness.
CodeBLEU uses a weighted n-gram match and a syntactic AST match to measure grammatical correctness, and a semantic data-flow match to measure logical correctness.
Source: https://pypi.org/project/codebleu/
"""

_KWARGS_DESCRIPTION = """
Computes the CodeBLEU score of code segments against one or more references.
Args:
    predictions: list of generated code strings to score.
    references: list of reference code strings, or list of lists of references, one per prediction.
Returns:
    'codebleu_score': the CodeBLEU score averaged over all predictions.
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between a ground-truth reference and a generated answer.

        The `lang` keyword accepts the languages supported by the `codebleu` package,
        including C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.
        """
        # calc_codebleu expects references first, then predictions; tokenizer=None
        # falls back to the package's default tokenization.
        result = calc_codebleu(
            [ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None
        )
        return result["codebleu"]

    def _compute(self, references, predictions):
        # Score each (reference, prediction) pair individually and report the mean.
        average_codebleu_score = sum(
            self.compute_codebleu_score(reference, prediction)
            for reference, prediction in zip(references, predictions)
        ) / len(references)
        return {"codebleu_score": average_codebleu_score}
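

# Minimal usage sketch (illustrative assumption, not part of the metric itself):
# instantiate the metric class directly and score a single prediction. Loading it
# with `evaluate.load(...)` instead would require saving this module as a loading
# script (e.g. `codebleu_score.py`) on a local path or the Hugging Face Hub.
if __name__ == "__main__":
    codebleu_metric = CodeBleu()
    results = codebleu_metric.compute(
        predictions=["def add ( a , b ) :\n return a + b"],
        references=["def sum ( first , second ) :\n return second + first"],
    )
    print(results)  # {'codebleu_score': <float between 0 and 1>}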