""" Code BLEU metric implementation """

import datasets
import evaluate
from codebleu import calc_codebleu

# Default weights of the four CodeBLEU components, passed straight through
# to ``calc_codebleu`` as its ``weights`` argument.
CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)

_CITATION = """\
@misc{ren2020codebleu,
      title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
      author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
      year={2020},
      eprint={2009.10297},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}
"""

_DESCRIPTION = """
An ideal evaluation metric should consider the grammatical correctness and the logic correctness.
We propose weighted n-gram match and syntactic AST match to measure grammatical correctness, and introduce semantic data-flow match to calculate logic correctness.
Source: https://pypi.org/project/codebleu/
"""

_KWARGS_DESCRIPTION = """
Computes CodeBLEU score of code segments against a reference.
Args:
    predictions: list of code generations to score.
    references: list of lists of or just a list of references for each code generation task.
    lang: programming language of the code segments (defaults to "python").
    weights: tuple of four weights for the CodeBLEU components (defaults to (0.25, 0.25, 0.25, 0.25)).
Returns:
    'codebleu_score': code bleu score
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    """Metric that averages per-sample CodeBLEU scores over a batch."""

    def _info(self):
        """Return the metric metadata and the two accepted input schemas."""
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                # Schema 1: several reference strings per prediction.
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                # Schema 2: exactly one reference string per prediction.
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python", weights=CODEBLEU_WEIGHTS):
        """
        Function to compute CodeBLEU score between ground truth code and generated code
        Has keywords for C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.

        Args:
            ground_truth: reference code for one sample (a string, or a list
                of strings when the sample has several references).
            generated_answer: generated code for the same sample.
            lang: language name forwarded to ``calc_codebleu``.
            weights: four component weights forwarded to ``calc_codebleu``.

        Returns:
            The value stored under the ``"codebleu"`` key of the
            ``calc_codebleu`` result.
        """
        # ``calc_codebleu`` works on batches, so the single sample is wrapped
        # into one-element lists.
        result = calc_codebleu(
            [ground_truth],
            [generated_answer],
            lang=lang,
            weights=weights,
            tokenizer=None,
        )
        return result["codebleu"]

    def _compute(self, references, predictions, lang="python", weights=CODEBLEU_WEIGHTS):
        """
        Average the per-sample CodeBLEU score over all reference/prediction pairs.

        Args:
            references: one entry per sample, matching ``predictions`` by position.
            predictions: generated code strings to score.
            lang: language name used for every sample (defaults to "python").
            weights: four CodeBLEU component weights.

        Returns:
            ``{"codebleu_score": <mean score>}``; the score is 0.0 when there
            are no samples.

        Raises:
            ValueError: if ``references`` and ``predictions`` differ in length.
        """
        num_samples = len(references)
        # ``zip`` would silently drop the unmatched tail and skew the mean,
        # so a length mismatch is rejected up front.
        if num_samples != len(predictions):
            raise ValueError(
                f"Mismatch in the number of predictions ({len(predictions)}) "
                f"and references ({num_samples})"
            )
        # Nothing to score: avoid dividing by zero.
        if num_samples == 0:
            return {"codebleu_score": 0.0}

        total_score = sum(
            self.compute_codebleu_score(reference, prediction, lang=lang, weights=weights)
            for reference, prediction in zip(references, predictions)
        )
        return {"codebleu_score": total_score / num_samples}