# post_editing_evaluator/evaluator/run_experiment.py
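"""Compare MBR decoding against beam search translations, using the average
ChrF score against a set of reference translations as the quality metric."""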
import json
from pathlib import Path
from assignment5.chrf import calculate_chrf
from assignment5.mbr import select_best_hypothesis
# Load data
data_dir = Path(__file__).parent / "mt_data"
source_sentences = (data_dir / "source_sentences.txt").read_text().splitlines()
reference_translations = (data_dir / "reference_translations.txt").read_text().splitlines()
beam_search_translations = (data_dir / "beam_search_translations.txt").read_text().splitlines()
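# Each line of samples.jsonl is expected to be a JSON object whose "samples"
# key holds the list of sampled candidate translations for one source
# sentence, e.g. {"samples": ["translation A", "translation B", ...]}.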
with open(data_dir / "samples.jsonl") as f:
    samples = [json.loads(line)["samples"] for line in f]
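# Sanity check: the four data files are assumed to be parallel, i.e. line i of
# each file refers to the same source sentence.
assert (
    len(source_sentences)
    == len(reference_translations)
    == len(beam_search_translations)
    == len(samples)
), "Data files must have the same number of lines."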
# Step 1: Select the best hypothesis for each source sentence using MBR decoding
mbr_translations = [select_best_hypothesis(sample_set) for sample_set in samples]
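# For reference: MBR decoding picks the hypothesis with the highest expected
# utility over the sample set. A minimal sketch of what select_best_hypothesis
# might do, assuming ChrF is used as the utility function (the real
# implementation lives in assignment5.mbr and may differ):
#
#   def select_best_hypothesis(hypotheses: list[str]) -> str:
#       return max(
#           hypotheses,
#           key=lambda hyp: sum(
#               calculate_chrf(hyp, other) for other in hypotheses if other is not hyp
#           ),
#       )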
# Step 2: Calculate ChrF scores for MBR translations
mbr_chrf_scores = [
    calculate_chrf(mbr_translation, reference)
    for mbr_translation, reference in zip(mbr_translations, reference_translations)
]
average_mbr_chrf = sum(mbr_chrf_scores) / len(mbr_chrf_scores)
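# For reference: ChrF (Popović, 2015) is an F-score over character n-grams,
#
#   ChrF_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
#
# where P and R are average character n-gram precision and recall. The common
# ChrF2 variant uses beta = 2 (recall weighted twice as much as precision);
# the exact n-gram order and beta here depend on what assignment5.chrf implements.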
# Step 3: Calculate ChrF scores for beam search translations
beam_chrf_scores = [
    calculate_chrf(beam_translation, reference)
    for beam_translation, reference in zip(beam_search_translations, reference_translations)
]
average_beam_chrf = sum(beam_chrf_scores) / len(beam_chrf_scores)
# Step 4: Print the results
print(f"Average ChrF score for MBR decoding: {average_mbr_chrf:.2f}")
print(f"Average ChrF score for beam search: {average_beam_chrf:.2f}")
if average_mbr_chrf > average_beam_chrf:
    print("MBR decoding produced better translations.")
elif average_beam_chrf > average_mbr_chrf:
    print("Beam search produced better translations.")
else:
    print("Both methods achieved the same average ChrF score.")