"""
CodeBLEU metric implementation.
"""

import datasets
import evaluate
from codebleu import calc_codebleu

CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)
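# With these equal weights, CodeBLEU is the even-weighted sum of its four components
# (this tuple is passed as the `weights` argument of `codebleu.calc_codebleu`):
#   codebleu = 0.25 * ngram_match + 0.25 * weighted_ngram_match
#            + 0.25 * syntax_ast_match + 0.25 * dataflow_match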

_CITATION = """\
@misc{ren2020codebleu,
    title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
    author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
    year={2020},
    eprint={2009.10297},
    archivePrefix={arXiv},
    primaryClass={cs.SE}
}
"""

_DESCRIPTION = """
An ideal evaluation metric for code synthesis should consider both grammatical correctness and logical correctness.
CodeBLEU uses a weighted n-gram match and a syntactic AST match to measure grammatical correctness, and a semantic data-flow match to measure logical correctness.
Source: https://pypi.org/project/codebleu/
"""

_KWARGS_DESCRIPTION = """
Computes the CodeBLEU score of code segments against one or more references.
Args:
    predictions: list of generated code strings to score.
    references: list of reference code strings, or list of lists of references, one per prediction.
Returns:
    'codebleu_score': the CodeBLEU score averaged over all predictions.
Examples:

    >>> predictions = ["def add ( a , b ) :\n return a + b"]
    >>> references = ["def sum ( first , second ) :\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(results["codebleu_score"])
    0.5537
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between a ground-truth reference and a generated answer.

        The `lang` keyword accepts the languages supported by the `codebleu` package,
        including C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust.
        """
        # calc_codebleu expects references first, then predictions; tokenizer=None
        # falls back to the package's default tokenization.
        result = calc_codebleu(
            [ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None
        )
        return result["codebleu"]

    def _compute(self, references, predictions):
        # Score each (reference, prediction) pair individually and report the mean.
        average_codebleu_score = sum(
            self.compute_codebleu_score(reference, prediction)
            for reference, prediction in zip(references, predictions)
        ) / len(references)
        return {"codebleu_score": average_codebleu_score}
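

# Minimal usage sketch (illustrative assumption, not part of the metric itself):
# instantiate the metric class directly and score a single prediction. Loading it
# with `evaluate.load(...)` instead would require saving this module as a loading
# script (e.g. `codebleu_score.py`) on a local path or the Hugging Face Hub.
if __name__ == "__main__":
    codebleu_metric = CodeBleu()
    results = codebleu_metric.compute(
        predictions=["def add ( a , b ) :\n return a + b"],
        references=["def sum ( first , second ) :\n return second + first"],
    )
    print(results)  # {'codebleu_score': <float between 0 and 1>}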