Skip to content

Eval Math

eval_math(fname: str) -> Tuple[int, int, int, int, int]

Evaluate math answers from a CSV file using different sampling methods.

Reads a CSV file containing answers from different sampling strategies and grades them against the correct answers.

Parameters:

Name Type Description Default
fname str

Path to the CSV file containing answers and correct solutions.

required

Returns:

Type Description
Tuple[int, int, int, int, int]

A tuple containing (naive_sampling_correct, low_temp_sampling_correct,
power_sampling_sliding_window_correct, power_sampling_correct, total),
where each value represents the count of correct answers for that method
and total is the number of questions evaluated.

Source code in pita/utils/grading_utils/math/eval_math.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def eval_math(fname: str) -> Tuple[int, int, int, int, int]:
    """
    Evaluate math answers from a CSV file using different sampling methods.

    Reads a CSV file containing answers from different sampling strategies
    and grades them against the correct answers.

    Args:
        fname: Path to the CSV file containing answers and correct solutions.

    Returns:
        A tuple containing (naive_sampling_correct, low_temp_sampling_correct,
        power_sampling_sliding_window_correct, power_sampling_correct, total),
        where each value represents the count of correct answers for that method
        and total is the number of questions evaluated.
    """
    print(fname)
    df = pd.read_csv(fname)
    total = len(df)

    # Reference answers shared by every sampling strategy.
    correct = df["correct_answer"]

    def _count_correct(column: str) -> int:
        # Sum of 0/1 grades for one strategy column. Iterating the columns
        # directly (instead of positional `df[col][i]` label lookups) is
        # index-agnostic, so it still works if the frame ever carries a
        # non-default index.
        return sum(safe_grade(ans, ref) for ans, ref in zip(df[column], correct))

    naive_sampling_correct = _count_correct("naive_sampling_answer")
    low_temp_sampling_correct = _count_correct("low_temp_sampling_answer")
    power_sampling_sliding_window_correct = _count_correct("power_sampling_windowed_answer")
    power_sampling_correct = _count_correct("power_sampling_answer")

    return naive_sampling_correct, low_temp_sampling_correct, power_sampling_sliding_window_correct, power_sampling_correct, total

math_results(fnames: List[str]) -> Dict[str, float]

Compute and display aggregate math results across multiple CSV files.

Evaluates answers from multiple CSV files using different sampling strategies and computes accuracy metrics for each strategy.

Parameters:

Name Type Description Default
fnames List[str]

List of paths to CSV files containing answers and correct solutions.

required

Returns:

Type Description
Dict[str, float]

A dictionary containing accuracy metrics for each sampling strategy:

- naive_sampling_acc: Accuracy of naive sampling method.
- low_temp_sampling_acc: Accuracy of low temperature sampling method.
- power_sampling_sliding_window_acc: Accuracy of power sampling with sliding window.
- power_sampling_acc: Accuracy of power sampling method.
Source code in pita/utils/grading_utils/math/eval_math.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def math_results(fnames: List[str]) -> Dict[str, float]:
    """
    Compute and display aggregate math results across multiple CSV files.

    Each file is graded via eval_math and the per-strategy correct counts
    are accumulated; accuracies are reported per strategy over all files.

    Args:
        fnames: List of paths to CSV files containing answers and correct solutions.

    Returns:
        A dictionary containing accuracy metrics for each sampling strategy:
        - naive_sampling_acc: Accuracy of naive sampling method.
        - low_temp_sampling_acc: Accuracy of low temperature sampling method.
        - power_sampling_sliding_window_acc: Accuracy of power sampling with sliding window.
        - power_sampling_acc: Accuracy of power sampling method.
    """
    # Running correct-counts in eval_math's order:
    # (naive, low_temp, power_sliding_window, power).
    running = [0, 0, 0, 0]
    total = 0

    for fname in fnames:
        *per_method, n = eval_math(fname)
        running = [acc + count for acc, count in zip(running, per_method)]
        total += n

    # Guard against division by zero when no questions were evaluated.
    denom = max(total, 1)
    (
        naive_sampling_acc,
        low_temp_sampling_acc,
        power_sampling_sliding_window_acc,
        power_sampling_acc,
    ) = (count / denom for count in running)

    print(f"Files evaluated: {len(fnames)}")
    print(f"Total questions: {total}")
    print(f"Naive sampling accuracy:  {naive_sampling_acc:.3f}")
    print(f"Low temperature sampling accuracy:  {low_temp_sampling_acc:.3f}")
    print(f"Power sampling sliding window accuracy:  {power_sampling_sliding_window_acc:.3f}")
    print(f"Power Sampling accuracy:  {power_sampling_acc:.3f}")

    return {
        "naive_sampling_acc": naive_sampling_acc,
        "low_temp_sampling_acc": low_temp_sampling_acc,
        "power_sampling_sliding_window_acc": power_sampling_sliding_window_acc,
        "power_sampling_acc": power_sampling_acc,
    }

safe_grade(ans: str, correct_ans: str) -> int

Safely grade an answer against the correct answer.

Attempts to grade the given answer using the grade_answer function. Returns 0 if any exception occurs during grading.

Parameters:

Name Type Description Default
ans str

The student's answer to grade.

required
correct_ans str

The correct answer to compare against.

required

Returns:

Type Description
int

1 if the answer is correct, 0 if incorrect or if an exception occurred.

Source code in pita/utils/grading_utils/math/eval_math.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def safe_grade(ans: str, correct_ans: str) -> int:
    """
    Safely grade an answer against the correct answer.

    Delegates to grade_answer and converts its result to an int score;
    any exception raised during grading is treated as an incorrect answer.

    Args:
        ans: The student's answer to grade.
        correct_ans: The correct answer to compare against.

    Returns:
        1 if the answer is correct, 0 if incorrect or if an exception occurred.
    """
    try:
        score = int(grade_answer(ans, correct_ans))
    except Exception:
        # Grading failures (malformed answers, parser errors) count as wrong.
        score = 0
    return score