# Source code for hbllmutils.testing.base

"""
Binary testing utilities for language model evaluation.

This module provides a small framework for executing *binary* tests on large
language models, where each test yields a pass/fail result. It offers simple
data structures for representing the results of individual tests and aggregated
statistics for repeated runs. A base class is also provided to simplify the
implementation of concrete tests.

The module contains the following main components:

* :class:`BinaryTestResult` - Stores the outcome of a single binary test
* :class:`MultiBinaryTestResult` - Aggregates multiple test results and statistics
* :class:`BinaryTest` - Base class for implementing binary tests

Typical usage involves subclassing :class:`BinaryTest` and implementing
:meth:`BinaryTest._single_test` to define the test logic. The :meth:`BinaryTest.test`
method can then execute the test once or multiple times to produce statistics.

Example::

    >>> class AlwaysPassTest(BinaryTest):
    ...     def _single_test(self, model, **params):
    ...         return BinaryTestResult(passed=True, content="ok")
    ...
    >>> test = AlwaysPassTest()
    >>> result = test.test(model="my-llm", n=3, silent=True)
    >>> result.passed_ratio
    1.0

.. note::
   This module expects a non-empty list of tests when computing aggregate
   statistics. Passing an empty list to :class:`MultiBinaryTestResult` will
   raise a ``ZeroDivisionError`` due to division by zero.

"""

from dataclasses import dataclass
from typing import Any, List, Optional, Union

from tqdm import tqdm

from ..model import LLMModel, load_llm_model, LLMModelTyping


@dataclass
class BinaryTestResult:
    """
    Outcome of one binary (pass/fail) test run.

    :param passed: ``True`` when the test passed, ``False`` when it failed.
    :type passed: bool
    :param content: The content or output produced during the test.
    :type content: str

    Example::

        >>> BinaryTestResult(passed=False, content="model reply")
        BinaryTestResult(passed=False, content='model reply')
    """

    # Pass/fail verdict of the run.
    passed: bool
    # Content or output produced while running the test.
    content: str
@dataclass
class MultiBinaryTestResult:
    """
    Aggregated outcome of several binary test runs.

    Only ``tests`` is meaningful as input. Every other field is recomputed
    from ``tests`` in :meth:`__post_init__`, so any value passed for those
    fields at construction time is overwritten.

    :param tests: The individual binary test results to aggregate.
    :type tests: List[BinaryTestResult]
    :param total_count: Number of results in ``tests`` (computed).
    :type total_count: int
    :param passed_count: Number of results whose ``passed`` is truthy (computed).
    :type passed_count: int
    :param passed_ratio: ``passed_count / total_count`` (computed).
    :type passed_ratio: float
    :param failed_count: Number of results whose ``passed`` is falsy (computed).
    :type failed_count: int
    :param failed_ratio: ``failed_count / total_count`` (computed).
    :type failed_ratio: float
    :raises ZeroDivisionError: If ``tests`` is an empty list.

    Example::

        >>> runs = [
        ...     BinaryTestResult(passed=True, content="first"),
        ...     BinaryTestResult(passed=False, content="second"),
        ... ]
        >>> MultiBinaryTestResult(tests=runs).failed_ratio
        0.5
    """

    tests: List[BinaryTestResult]
    total_count: int = 0
    passed_count: int = 0
    passed_ratio: float = 0
    failed_count: int = 0
    failed_ratio: float = 0

    def __post_init__(self) -> None:
        """
        Derive the counts and ratios from ``tests``.

        Called automatically by the dataclass machinery once the generated
        ``__init__`` has assigned the fields.

        :raises ZeroDivisionError: If ``tests`` is an empty list.
        """
        run_total = len(self.tests)
        pass_total = sum(1 for outcome in self.tests if outcome.passed)

        self.total_count = run_total
        self.passed_count = pass_total
        # Each result is either a pass or a fail, so the failures are the rest.
        self.failed_count = run_total - pass_total

        # An empty ``tests`` list makes ``run_total`` zero and raises here.
        self.passed_ratio = self.passed_count / run_total
        self.failed_ratio = self.failed_count / run_total
class BinaryTest:
    """
    Base class for implementing binary (pass/fail) tests on language models.

    Subclasses implement :meth:`_single_test` with the actual test logic.
    :meth:`test` then runs that logic once, or repeatedly to gather statistics.

    :ivar __desc_name__: Optional descriptive name for the test, passed to the
        progress bar as its description.
    :vartype __desc_name__: Optional[str]

    Example::

        >>> class MyBinaryTest(BinaryTest):
        ...     def _single_test(self, model, **params):
        ...         return BinaryTestResult(passed=True, content="ok")
        ...
        >>> test = MyBinaryTest()
        >>> result = test.test(model="my-llm", n=1, silent=True)
        >>> result.passed
        True
    """

    __desc_name__: Optional[str] = None

    def _single_test(self, model: LLMModel, **params: Any) -> BinaryTestResult:
        """
        Execute a single binary test on the given model.

        Subclasses must override this method; the base implementation only
        raises.

        :param model: The language model to test.
        :type model: LLMModel
        :param params: Additional parameters for the test.
        :type params: dict

        :return: The result of the single test.
        :rtype: BinaryTestResult
        :raises NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError  # pragma: no cover

    def test(
            self,
            model: LLMModelTyping,
            n: int = 1,
            silent: bool = False,
            **params: Any,
    ) -> Union[BinaryTestResult, MultiBinaryTestResult]:
        """
        Run the binary test one or multiple times on the given model.

        If ``n == 1``, runs a single test and returns a
        :class:`BinaryTestResult`. If ``n > 1``, runs the test ``n`` times and
        returns a :class:`MultiBinaryTestResult` with aggregated statistics.

        :param model: The language model to test. Can be a model instance or a
            model identifier; it is passed through ``load_llm_model`` before
            the test runs.
        :type model: LLMModelTyping
        :param n: Number of times to run the test, defaults to 1. Must be at
            least 1.
        :type n: int
        :param silent: If True, suppresses the progress bar, defaults to False.
        :type silent: bool
        :param params: Additional parameters forwarded to every
            :meth:`_single_test` call.
        :type params: dict

        :return: Single test result if ``n == 1``, otherwise aggregated results.
        :rtype: Union[BinaryTestResult, MultiBinaryTestResult]
        :raises ValueError: If ``n`` is less than 1.

        Example::

            >>> test = MyBinaryTest()  # Assuming MyBinaryTest is a subclass
            >>> result = test.test(model="my-llm", n=10, silent=True)
            >>> print(f"Pass rate: {result.passed_ratio}")
            Pass rate: 0.8
        """
        # Reject a non-positive run count before loading the model. Zero runs
        # would produce an empty result list, which MultiBinaryTestResult
        # cannot aggregate because it divides by the number of tests.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")

        model = load_llm_model(model)
        if n == 1:
            return self._single_test(model=model, **params)

        runs = tqdm(range(n), disable=silent, desc=self.__desc_name__)
        return MultiBinaryTestResult(
            tests=[self._single_test(model=model, **params) for _ in runs]
        )