# Source code for hbllmutils.testing.base

"""
Binary testing utilities for language model evaluation.

This module provides a small framework for executing *binary* tests on large
language models, where each test yields a pass/fail result. It offers simple
data structures for representing the results of individual tests and aggregated
statistics for repeated runs. A base class is also provided to simplify the
implementation of concrete tests.

The module contains the following main components:

* :class:`BinaryTestResult` - Stores the outcome of a single binary test
* :class:`MultiBinaryTestResult` - Aggregates multiple test results and statistics
* :class:`BinaryTest` - Base class for implementing binary tests

Typical usage involves subclassing :class:`BinaryTest` and implementing
:meth:`BinaryTest._single_test` to define the test logic. The :meth:`BinaryTest.test`
method can then execute the test once or multiple times to produce statistics.

Example::

    >>> class AlwaysPassTest(BinaryTest):
    ...     def _single_test(self, model, **params):
    ...         return BinaryTestResult(passed=True, content="ok")
    ...
    >>> test = AlwaysPassTest()
    >>> result = test.test(model="my-llm", n=3, silent=True)
    >>> result.passed_ratio
    1.0

.. note::
   This module expects a non-empty list of tests when computing aggregate
   statistics. Passing an empty list to :class:`MultiBinaryTestResult` will
   raise a ``ZeroDivisionError`` due to division by zero.

"""

from dataclasses import dataclass
from typing import Any, List, Optional, Union

from tqdm import tqdm

from ..model import LLMModel, load_llm_model, LLMModelTyping



@dataclass
class BinaryTestResult:
    """
    Outcome of a single pass/fail test run.

    :param passed: ``True`` if the test passed, ``False`` if it failed.
    :type passed: bool
    :param content: The content or output produced during the test.
    :type content: str

    Example::

        >>> BinaryTestResult(passed=True, content="response text")
        BinaryTestResult(passed=True, content='response text')
    """
    passed: bool
    content: str




@dataclass
class MultiBinaryTestResult:
    """
    Data class representing aggregated results from multiple binary tests.

    All statistics are derived from ``tests`` in :meth:`__post_init__`; any
    values supplied for the statistic fields at construction time are
    overwritten.

    :param tests: List of individual binary test results.
    :type tests: List[BinaryTestResult]
    :param total_count: Total number of tests (automatically calculated).
    :type total_count: int
    :param passed_count: Number of tests that passed (automatically calculated).
    :type passed_count: int
    :param passed_ratio: Ratio of tests that passed (automatically calculated).
    :type passed_ratio: float
    :param failed_count: Number of tests that failed (automatically calculated).
    :type failed_count: int
    :param failed_ratio: Ratio of tests that failed (automatically calculated).
    :type failed_ratio: float

    :raises ZeroDivisionError: If ``tests`` is an empty list.

    Example::

        >>> results = [
        ...     BinaryTestResult(passed=True, content="test1"),
        ...     BinaryTestResult(passed=False, content="test2"),
        ... ]
        >>> multi_result = MultiBinaryTestResult(tests=results)
        >>> multi_result.passed_ratio
        0.5
    """
    tests: List[BinaryTestResult]
    total_count: int = 0
    passed_count: int = 0
    passed_ratio: float = 0
    failed_count: int = 0
    failed_ratio: float = 0

    def __post_init__(self) -> None:
        """
        Compute the aggregate statistics from ``tests``.

        Called automatically by the generated ``__init__``. Sets the total
        count, the passed/failed counts, and their ratios.

        :raises ZeroDivisionError: If ``tests`` is an empty list.
        """
        self.total_count = len(self.tests)
        # A result counts as passed when its ``passed`` attribute is truthy;
        # every other result counts as failed.
        self.passed_count = sum(1 for test in self.tests if test.passed)
        self.failed_count = self.total_count - self.passed_count
        # Deliberately unguarded: an empty ``tests`` list raises
        # ZeroDivisionError here, as documented.
        self.passed_ratio = self.passed_count / self.total_count
        self.failed_ratio = self.failed_count / self.total_count





class BinaryTest:
    """
    Base class for implementing binary tests on language models.

    This class provides a framework for running tests that have a pass/fail
    outcome. Tests can be run once or multiple times to gather statistics.
    Subclasses should implement the :meth:`_single_test` method to define
    the specific test logic.

    :ivar __desc_name__: Optional descriptive name for the test, used in
        progress bars.
    :vartype __desc_name__: Optional[str]

    Example::

        >>> class MyBinaryTest(BinaryTest):
        ...     def _single_test(self, model, **params):
        ...         return BinaryTestResult(passed=True, content="ok")
        ...
        >>> test = MyBinaryTest()
        >>> result = test.test(model="my-llm", n=1, silent=True)
        >>> result.passed
        True
    """
    __desc_name__: Optional[str] = None

    def _single_test(self, model: LLMModel, **params: Any) -> BinaryTestResult:
        """
        Execute a single binary test on the given model.

        Subclasses must override this method to define the specific test
        logic; the base implementation only raises.

        :param model: The language model to test.
        :type model: LLMModel
        :param params: Additional parameters for the test.
        :type params: dict

        :return: The result of the single test.
        :rtype: BinaryTestResult
        :raises NotImplementedError: This method must be implemented by subclasses.
        """
        raise NotImplementedError  # pragma: no cover

    def test(
            self,
            model: LLMModelTyping,
            n: int = 1,
            silent: bool = False,
            **params: Any,
    ) -> Union[BinaryTestResult, MultiBinaryTestResult]:
        """
        Run the binary test one or multiple times on the given model.

        If ``n == 1``, runs a single test and returns a :class:`BinaryTestResult`.
        For any other ``n``, runs the test ``n`` times behind a progress bar
        and returns a :class:`MultiBinaryTestResult` with aggregated statistics.

        :param model: The language model to test. Passed through
            ``load_llm_model`` before use.
        :type model: LLMModelTyping
        :param n: Number of times to run the test, defaults to 1.
        :type n: int
        :param silent: If True, suppresses the progress bar, defaults to False.
        :type silent: bool
        :param params: Additional parameters forwarded to :meth:`_single_test`.
        :type params: dict

        :return: Single test result if ``n == 1``, otherwise aggregated results.
        :rtype: Union[BinaryTestResult, MultiBinaryTestResult]
        :raises ZeroDivisionError: If ``n <= 0``, because no tests are run and
            the aggregate statistics are computed over an empty list.

        Example::

            >>> test = MyBinaryTest()  # Assuming MyBinaryTest is a subclass
            >>> result = test.test(model="my-llm", n=10, silent=True)
            >>> print(f"Pass rate: {result.passed_ratio}")
            Pass rate: 0.8
        """
        model = load_llm_model(model)
        if n == 1:
            return self._single_test(model=model, **params)

        # The same loaded model instance is reused for every repetition.
        progress = tqdm(range(n), disable=silent, desc=self.__desc_name__)
        tests = [self._single_test(model=model, **params) for _ in progress]
        return MultiBinaryTestResult(tests=tests)