# Source code for hbllmutils.utils.truncate

"""
Data structure truncation utilities for logging purposes.

This module provides utilities for truncating and formatting complex nested data 
structures (dictionaries, lists, strings) to make them suitable for logging. It is 
particularly useful for preventing log files from becoming excessively large when 
dealing with verbose outputs from Large Language Models (LLMs) or other systems 
that generate extensive data structures.

The module contains the following main components:

* :func:`truncate_dict` - Recursively truncate nested data structures
* :func:`log_pformat` - Format truncated data for logging output

.. note::
   This module is designed to handle arbitrarily nested data structures and will
   recursively process all levels of nesting while applying truncation rules.

.. warning::
   Very deeply nested structures may still cause performance issues. Consider
   limiting the depth of structures before processing if performance is critical.

Example::

    >>> from hbllmutils.utils.truncate import log_pformat, truncate_dict
    >>> 
    >>> # Example with LLM conversation history
    >>> llm_history = [
    ...     {"role": "system", "content": "You are a helpful assistant"},
    ...     {"role": "user", "content": "Hello" * 1000},
    ...     {"role": "assistant", "content": "Hi there!"}
    ... ]
    >>> print(log_pformat(llm_history, max_string_len=50, width=100))
    [{'content': 'You are a helpful assistant', 'role': 'system'},
     {'content': 'HelloHelloHelloHelloHelloHelloHelloHelloHelloHello...<truncated, total 5000 chars>',
      'role': 'user'},
     {'content': 'Hi there!', 'role': 'assistant'}]
    >>> 
    >>> # Example with large dictionary
    >>> large_dict = {f"key_{i}": f"value_{i}" * 100 for i in range(20)}
    >>> truncated = truncate_dict(large_dict, max_dict_keys=3, max_string_len=30)
    >>> from pprint import pprint
    >>> pprint(truncated, sort_dicts=False)
    {'key_0': 'value_0value_0value_0value_0va...<truncated, total 700 chars>',
     'key_1': 'value_1value_1value_1value_1va...<truncated, total 700 chars>',
     'key_2': 'value_2value_2value_2value_2va...<truncated, total 700 chars>',
     '<truncated>': '17 more keys'}

"""

import shutil
from itertools import islice
from pprint import pformat
from typing import Any, Optional



# [docs]
def truncate_dict(
        obj: Any,
        max_string_len: int = 250,
        max_list_items: int = 4,
        max_dict_keys: int = 5,
        current_depth: int = 0
) -> Any:
    """
    Recursively truncate complex data structures for logging purposes.

    This function traverses nested data structures (dictionaries, lists, tuples,
    strings) and builds a shortened copy according to the specified limits, so
    that verbose payloads do not flood log output. The input object is never
    modified.

    The function applies different truncation strategies based on the data type:

    * **Strings**: cut to ``max_string_len`` characters, followed by
      ``...<truncated, total N chars>``.
    * **Lists/Tuples**: the first ``max_list_items`` elements are kept (each
      truncated recursively) and ``'...<N more items>'`` is appended when
      elements were dropped. The result is always a ``list``, even for a tuple.
    * **Dictionaries**: the first ``max_dict_keys`` entries (in iteration order)
      are kept with recursively truncated values, and a
      ``'<truncated>': 'N more keys'`` entry is added when entries were dropped.
      The result is always a plain ``dict``.
    * **Other types**: returned unchanged (the very same object).

    :param obj: The object to truncate. Can be any type including nested structures
                such as lists of dictionaries, dictionaries of lists, etc.
    :type obj: Any
    :param max_string_len: Maximum number of characters kept from a string.
                          Negative values are treated as 0. Defaults to 250.
    :type max_string_len: int
    :param max_list_items: Maximum number of items kept from a list or tuple.
                          Negative values are treated as 0. Defaults to 4.
    :type max_list_items: int
    :param max_dict_keys: Maximum number of entries kept from a dictionary.
                         Negative values are treated as 0. Defaults to 5.
    :type max_dict_keys: int
    :param current_depth: Retained for backward compatibility only; it has no
                         effect on the result. Defaults to 0.
    :type current_depth: int

    :return: Truncated copy of the input, reduced according to the limits.
    :rtype: Any

    .. note::
       A container that (directly or indirectly) contains itself is replaced, at
       the point where the cycle closes, by the string ``'<recursion on TYPE>'``
       (for example ``'<recursion on list>'``) instead of recursing forever.

    .. warning::
       The result contains marker strings and, for truncated dictionaries, a
       ``'<truncated>'`` key (which overwrites a kept key of the same name). It
       is meant for display only, not for further processing.

    Example::

        >>> # Truncate a long string
        >>> truncate_dict("a" * 300, max_string_len=10)
        'aaaaaaaaaa...<truncated, total 300 chars>'

        >>> # Truncate a list
        >>> truncate_dict([1, 2, 3, 4, 5], max_list_items=3)
        [1, 2, 3, '...<2 more items>']

        >>> # Truncate a nested structure
        >>> data = {
        ...     "messages": [
        ...         {"role": "user", "content": "x" * 500},
        ...         {"role": "assistant", "content": "y" * 500}
        ...     ]
        ... }
        >>> truncate_dict(data, max_string_len=20, max_list_items=1)
        {'messages': [{'role': 'user', 'content': 'xxxxxxxxxxxxxxxxxxxx...<truncated, total 500 chars>'}, '...<1 more items>']}

        >>> # Truncate a large dictionary
        >>> large_dict = {f"key{i}": f"value{i}" for i in range(10)}
        >>> truncate_dict(large_dict, max_dict_keys=3)
        {'key0': 'value0', 'key1': 'value1', 'key2': 'value2', '<truncated>': '7 more keys'}

        >>> # Self-referential structures are cut where the cycle closes
        >>> loop = [1]
        >>> loop.append(loop)
        >>> truncate_dict(loop)
        [1, '<recursion on list>']

    """
    # A negative limit would otherwise act as a negative slice bound: items
    # would be dropped from the *end* and the "N more" count would exceed the
    # real number of hidden elements. Clamp once here so the helper can rely
    # on non-negative limits.
    return _truncate(
        obj,
        max(max_string_len, 0),
        max(max_list_items, 0),
        max(max_dict_keys, 0),
        set(),
    )


def _truncate(
        obj: Any,
        max_string_len: int,
        max_list_items: int,
        max_dict_keys: int,
        active_ids: set
) -> Any:
    """Truncate *obj* with non-negative limits; *active_ids* guards against cycles.

    ``active_ids`` holds the ``id()`` of every container currently being
    processed on the path from the root to *obj*.
    """
    if isinstance(obj, str):
        if len(obj) > max_string_len:
            return obj[:max_string_len] + f"...<truncated, total {len(obj)} chars>"
        return obj

    if not isinstance(obj, (list, tuple, dict)):
        return obj

    obj_id = id(obj)
    if obj_id in active_ids:
        # This container is one of its own ancestors; descending again would
        # never terminate.
        return f"<recursion on {type(obj).__name__}>"

    active_ids.add(obj_id)
    try:
        if isinstance(obj, dict):
            # islice avoids materialising every key just to keep the first few.
            kept = {
                key: _truncate(value, max_string_len, max_list_items,
                               max_dict_keys, active_ids)
                for key, value in islice(obj.items(), max_dict_keys)
            }
            hidden = len(obj) - max_dict_keys
            if hidden > 0:
                kept["<truncated>"] = f"{hidden} more keys"
            return kept

        items = [
            _truncate(item, max_string_len, max_list_items,
                      max_dict_keys, active_ids)
            for item in islice(obj, max_list_items)
        ]
        hidden = len(obj) - max_list_items
        if hidden > 0:
            items.append(f"...<{hidden} more items>")
        return items
    finally:
        # Only containers on the *current* path count as a cycle; an object
        # that is merely referenced twice (siblings) must be rendered twice.
        active_ids.discard(obj_id)




# [docs]
def log_pformat(
        obj: Any,
        max_string_len: int = 250,
        max_list_items: int = 4,
        max_dict_keys: int = 5,
        width: Optional[int] = None,
        **kwargs
) -> str:
    """
    Build a compact, pretty-printed string of ``obj`` for log output.

    The object is first shortened with :func:`truncate_dict` using the given
    limits, and the shortened copy is then rendered with :func:`pprint.pformat`.
    This keeps log lines readable for verbose payloads such as LLM conversation
    histories or API responses.

    :param obj: The object to render. Any Python object is accepted, including
                nested lists and dictionaries.
    :type obj: Any
    :param max_string_len: Forwarded to :func:`truncate_dict`; maximum number of
                          characters kept from each string. Defaults to 250.
    :type max_string_len: int
    :param max_list_items: Forwarded to :func:`truncate_dict`; maximum number of
                          items kept from each list or tuple. Defaults to 4.
    :type max_list_items: int
    :param max_dict_keys: Forwarded to :func:`truncate_dict`; maximum number of
                         keys kept from each dictionary. Defaults to 5.
    :type max_dict_keys: int
    :param width: Target line width in characters. When omitted (``None``) or
                 otherwise falsy (``0``), the column count reported by
                 :func:`shutil.get_terminal_size` is used. Defaults to None.
    :type width: Optional[int]
    :param kwargs: Extra keyword arguments handed unchanged to
                  :func:`pprint.pformat` (for example ``indent``, ``depth``,
                  ``compact`` or ``sort_dicts``).
    :type kwargs: Any

    :return: The pretty-printed text of the truncated object.
    :rtype: str

    .. note::
       With no explicit ``width`` the layout depends on the environment:
       :func:`shutil.get_terminal_size` reports the terminal's columns and uses
       its own fallback (80 columns by default) when no terminal size is
       available, e.g. when output is redirected to a file. Pass ``width``
       for reproducible output.

    .. warning::
       The returned string is for display purposes only. Do not attempt to parse
       or deserialize it back into the original data structure.

    Example::

        >>> from hbllmutils.utils.truncate import log_pformat
        >>>
        >>> # Format LLM conversation history
        >>> llm_history = [
        ...     {"role": "system", "content": "You are a helpful assistant"},
        ...     {"role": "user", "content": "Hello" * 1000},
        ...     {"role": "assistant", "content": "Hi there! How can I help you?"}
        ... ]
        >>> print(log_pformat(llm_history, max_string_len=50, width=100))
        [{'content': 'You are a helpful assistant', 'role': 'system'},
         {'content': 'HelloHelloHelloHelloHelloHelloHelloHelloHelloHello...<truncated, total 5000 chars>',
          'role': 'user'},
         {'content': 'Hi there! How can I help you?', 'role': 'assistant'}]

        >>> # Format API response with custom width
        >>> api_response = {
        ...     "status": "success",
        ...     "data": {
        ...         "items": [{"id": i, "name": f"Item {i}"} for i in range(10)],
        ...         "metadata": {"total": 10, "page": 1}
        ...     }
        ... }
        >>> print(log_pformat(api_response, max_list_items=2, width=60))
        {'data': {'items': [{'id': 0, 'name': 'Item 0'},
                            {'id': 1, 'name': 'Item 1'},
                            '...<8 more items>'],
                  'metadata': {'page': 1, 'total': 10}},
         'status': 'success'}

        >>> # Use with custom pformat options
        >>> nested_data = {"level1": {"level2": {"level3": {"level4": "deep"}}}}
        >>> print(log_pformat(nested_data, depth=2, compact=True))
        {'level1': {'level2': {...}}}

    """
    shortened = truncate_dict(
        obj=obj,
        max_string_len=max_string_len,
        max_list_items=max_list_items,
        max_dict_keys=max_dict_keys,
    )

    # Any falsy width (None or 0) means "use the terminal's column count".
    if not width:
        width = shutil.get_terminal_size().columns

    return pformat(shortened, width=width, **kwargs)