# Source code for hbllmutils.template.matcher

"""
Template-based file matcher utilities for extracting structured metadata from filenames.

This module implements a metaclass-driven pattern matching system that turns
template patterns with typed placeholders into compiled regular expressions.
It provides a convenient way to scan directories, match file names, and
automatically convert captured fields into their declared Python types.

The module contains the following public component:

* :class:`BaseMatcher` - Base class for defining file matchers using
  ``__pattern__`` templates and type annotations.

.. note::
   The metaclass :class:`_MatcherMeta` is an internal implementation detail.
   It is intentionally not part of the public API.

Example::

    >>> class ImageMatcher(BaseMatcher):
    ...     __pattern__ = "image_<id>_<name>.png"
    ...     id: int
    ...     name: str
    >>> matcher = ImageMatcher.match("/path/to/images")
    >>> if matcher:
    ...     print(matcher.id, matcher.name)

"""

import os
import re
from pathlib import Path
from typing import Optional, List, Dict, Any, Union, Iterator, Tuple, Type

from hbutils.model import IComparable
from natsort import natsorted


class _MatcherMeta(type):
    """
    Metaclass for creating pattern-based file matchers.

    This metaclass processes the :attr:`__pattern__` attribute and type
    annotations to generate a regular expression source string and field
    metadata. It converts template patterns like ``"file_<id>_<name>.txt"``
    into regex patterns with one typed capture group per placeholder.

    Every class created through this metaclass receives four attributes:

    * ``__regexp_pattern__`` - regex source string built from the template
    * ``__fields__`` - mapping of field name to its annotated type
    * ``__field_names__`` - field names in the order they occur in the template
    * ``__field_names_set__`` - the same names as a set, for membership tests
    """

    def __new__(cls, *args: Any, **kwargs: Any) -> Type:
        """
        Create a new matcher class with processed pattern and field information.

        :param args: Positional arguments for type creation
        :type args: tuple
        :param kwargs: Keyword arguments for type creation
        :type kwargs: dict
        :return: New matcher class instance with processed metadata
        :rtype: type
        :raises NameError: If placeholders do not match annotated fields
        """
        instance = super().__new__(cls, *args, **kwargs)
        # Tolerate classes that expose no ``__annotations__`` attribute at all.
        annotations = getattr(instance, '__annotations__', None) or {}
        regex_pattern, fields, field_names = cls._cls_init(instance.__pattern__, annotations)
        instance.__regexp_pattern__ = regex_pattern
        instance.__fields__ = fields
        instance.__field_names__ = field_names
        instance.__field_names_set__ = set(field_names)
        return instance

    @classmethod
    def _cls_init(cls, pattern: str, annotations: Dict[str, type]) -> Tuple[str, Dict[str, type], List[str]]:
        """
        Initialize class-level pattern and field information.

        The template is split into alternating literal segments and
        placeholder names. Literal segments are regex-escaped individually and
        each placeholder is replaced by a capture group chosen from its
        annotated type, so literal text in the template (including
        parentheses) can never be mistaken for a capture group.

        Annotations whose names start and end with a double underscore are
        ignored; the remaining names must match the placeholders exactly.

        :param pattern: Template pattern with placeholders like ``"file_<id>_<name>.txt"``
        :type pattern: str
        :param annotations: Type annotations for fields
        :type annotations: Dict[str, type]
        :return: Tuple of ``(regex_pattern, fields_dict, field_names_list)``
        :rtype: Tuple[str, Dict[str, type], List[str]]
        :raises NameError: If placeholders do not match annotated fields

        Example::

            >>> pattern = "image_<id>_<name>.png"
            >>> annotations = {'id': int, 'name': str}
            >>> regex, fields, names = _MatcherMeta._cls_init(pattern, annotations)
            >>> print(names)
            ['id', 'name']
        """
        annotations = {key: value for key, value in annotations.items()
                       if not (key.startswith('__') and key.endswith('__'))}

        # With a single capture group, re.split returns literal text at even
        # indexes and the captured placeholder names at odd indexes.
        pieces = re.split(r'<(\w+)>', pattern)
        placeholders = pieces[1::2]

        if set(annotations.keys()) != set(placeholders):
            if set(annotations.keys()) - set(placeholders):
                raise NameError(f'Field {", ".join(natsorted(set(annotations.keys()) - set(placeholders)))} '
                                f'not included in pattern {pattern!r}.')
            if set(placeholders) - set(annotations.keys()):
                raise NameError(f'Placeholder {", ".join(natsorted(set(placeholders) - set(annotations.keys())))} '
                                f'not included in fields {annotations!r}.')

        fields: Dict[str, type] = {}
        regex_parts: List[str] = []
        for index, piece in enumerate(pieces):
            if index % 2 == 0:
                # Literal text: escape it so it only ever matches itself.
                regex_parts.append(re.escape(piece))
                continue

            field_type = annotations[piece]
            fields[piece] = field_type
            # Pick the capture group from the declared type.
            if field_type == int:
                regex_parts.append(r'(\d+?)')
            elif field_type == float:
                regex_parts.append(r'(\d+\.?\d*?)')
            else:  # str or other types: anything except path separators
                regex_parts.append(r'([^/\\]+?)')

        return ''.join(regex_parts), fields, placeholders


class BaseMatcher(IComparable, metaclass=_MatcherMeta):
    """
    Base class for file pattern matchers.

    Subclasses define a :attr:`__pattern__` template and annotate fields with
    their intended types. Instances represent matched files and provide
    convenient access to field values and file paths.

    :cvar __pattern__: Template pattern with placeholders, such as ``"file_<id>.txt"``
    :vartype __pattern__: str
    :cvar __recursively__: Whether to search directories recursively
    :vartype __recursively__: bool

    Example::

        >>> class LogMatcher(BaseMatcher):
        ...     __pattern__ = "log_<date>_<level>.txt"
        ...     date: str
        ...     level: str
        >>> matcher = LogMatcher.match("/var/logs")
        >>> if matcher:
        ...     print(f"Found log: {matcher.date} - {matcher.level}")
    """
    __pattern__: str = ""
    __recursively__: bool = False

    def __init__(self, full_path: str, **kwargs: Any) -> None:
        """
        Initialize matcher instance with extracted field values.

        Besides the pattern fields, the instance stores ``full_path``,
        ``file_name`` (base name of the path) and ``dir_path`` (its directory).

        :param full_path: Complete path to the matched file
        :type full_path: str
        :param kwargs: Extracted field values from the filename
        :type kwargs: Any
        :raises ValueError: If unknown fields are provided or required fields are missing

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> print(matcher.id, matcher.name)
            1 test
        """
        self.full_path = full_path
        self.file_name = os.path.basename(full_path)
        self.dir_path = os.path.dirname(full_path)

        unknown_fields: Dict[str, Any] = {
            key: value for key, value in kwargs.items()
            if key not in self.__field_names_set__
        }
        if unknown_fields:
            raise ValueError(f'Unknown fields for class {self.__class__.__name__}: {unknown_fields!r}.')

        missing_fields = self.__field_names_set__ - set(kwargs)
        if missing_fields:
            raise ValueError(f'Non-included fields of class {self.__class__.__name__}: '
                             f'{natsorted(missing_fields)!r}.')

        # Set fields extracted from pattern
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def _convert_value(cls, value: str, target_type: type) -> Any:
        """
        Convert string value to target type.

        For ``bool`` the value is considered true when its lower-cased form is
        one of ``'true'``, ``'1'``, ``'yes'`` or ``'on'``.

        :param value: String value to convert
        :type value: str
        :param target_type: Target type for conversion
        :type target_type: type
        :return: Converted value
        :rtype: Any
        :raises TypeError: If target type is not supported
        :raises ValueError: If the value cannot be parsed as ``int`` or ``float``

        Example::

            >>> BaseMatcher._convert_value("123", int)
            123
            >>> BaseMatcher._convert_value("3.14", float)
            3.14
        """
        if target_type == int:
            return int(value)
        elif target_type == float:
            return float(value)
        elif target_type == bool:
            return value.lower() in ('true', '1', 'yes', 'on')
        elif target_type == str:
            return value
        else:
            raise TypeError(f'Unsupported target type - {target_type!r}.')

    @classmethod
    def _extract_fields(cls, matched: 're.Match') -> Optional[Dict[str, Any]]:
        """
        Convert the captured groups of a regex match into typed field values.

        :param matched: Match object produced by the class pattern
        :type matched: re.Match
        :return: Mapping of field name to converted value, or ``None`` when
            any captured value cannot be converted to its declared type
        :rtype: Optional[Dict[str, Any]]
        """
        field_values: Dict[str, Any] = {}
        for field_name, raw_value in zip(cls.__field_names__, matched.groups()):
            try:
                field_values[field_name] = cls._convert_value(raw_value, cls.__fields__[field_name])
            except (ValueError, TypeError):
                # One bad field invalidates the whole file, not just the field.
                return None
        return field_values

    @classmethod
    def _yield_match(cls, directory: Union[str, Path]) -> Iterator['BaseMatcher']:
        """
        Yield all matching file instances in the specified directory.

        Entries are visited in natural sort order. Only regular files whose
        complete file name matches the pattern are considered, and files whose
        captured values cannot be converted to the declared types are skipped.
        Nothing is yielded when the directory does not exist.

        :param directory: Directory to search for matching files
        :type directory: Union[str, Path]
        :return: Iterator of matched file instances
        :rtype: Iterator[BaseMatcher]

        Example::

            >>> for matcher in ImageMatcher._yield_match("/path/to/images"):
            ...     print(matcher.id, matcher.name)
        """
        directory = Path(directory)
        if not directory.exists():
            return

        compiled_pattern = re.compile(cls.__regexp_pattern__)
        recursively = getattr(cls, '__recursively__', False)

        # Build search pattern
        search_pattern = "**/*" if recursively else "*"

        for file_path in natsorted(directory.glob(search_pattern)):
            if not file_path.is_file():
                continue

            # The generated groups are lazy, so the whole name must be
            # consumed; a prefix match would truncate a trailing placeholder
            # and accept names with unrelated trailing text.
            matched = compiled_pattern.fullmatch(file_path.name)
            if not matched:
                continue

            field_values = cls._extract_fields(matched)
            if field_values is None:
                # Type conversion failed, skip this file
                continue

            yield cls(str(file_path), **field_values)

    @classmethod
    def match(cls, directory: Union[str, Path]) -> Optional['BaseMatcher']:
        """
        Match the first file that conforms to the pattern in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: Matched file instance, or ``None`` if not found
        :rtype: Optional[BaseMatcher]

        Example::

            >>> matcher = ImageMatcher.match("/path/to/images")
            >>> if matcher:
            ...     print(f"Found: {matcher.full_path}")
        """
        return next(cls._yield_match(directory), None)

    @classmethod
    def match_all(cls, directory: Union[str, Path]) -> List['BaseMatcher']:
        """
        Match all files that conform to the pattern in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: List of matched file instances
        :rtype: List[BaseMatcher]

        Example::

            >>> matchers = ImageMatcher.match_all("/path/to/images")
            >>> print(f"Found {len(matchers)} images")
        """
        return list(cls._yield_match(directory))

    @classmethod
    def exists(cls, directory: Union[str, Path]) -> bool:
        """
        Check if any file matching the pattern exists in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: ``True`` if a matching file exists, ``False`` otherwise
        :rtype: bool

        Example::

            >>> if ImageMatcher.exists("/path/to/images"):
            ...     print("Images found!")
        """
        return cls.match(directory) is not None

    def __str__(self) -> str:
        """
        Get string representation of the matcher instance.

        Fields are listed in annotation order, followed by ``full_path``.
        Annotated names that are not pattern fields (for example
        ``__pattern__``) are left out.

        :return: String representation showing field values and full path
        :rtype: str
        """
        annotations = getattr(self.__class__, '__annotations__', None) or {}
        field_info: List[str] = [
            f"{field_name}={getattr(self, field_name)!r}"
            for field_name in annotations
            if field_name in self.__field_names_set__ and hasattr(self, field_name)
        ]
        field_info.append(f"full_path={self.full_path!r}")
        field_str = ", ".join(field_info)
        return f"{self.__class__.__name__}({field_str})"

    def __repr__(self) -> str:
        """
        Get representation string of the matcher instance.

        :return: Representation string
        :rtype: str
        """
        return self.__str__()

    def tuple(self) -> Tuple[Any, ...]:
        """
        Get field values as a tuple.

        :return: Tuple of field values in the order they occur in the pattern
        :rtype: tuple

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> matcher.tuple()
            (1, 'test')
        """
        return tuple(getattr(self, name) for name in self.__field_names__)

    def dict(self) -> Dict[str, Any]:
        """
        Get field values as a dictionary.

        :return: Dictionary mapping field names to values
        :rtype: dict

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> matcher.dict()
            {'id': 1, 'name': 'test'}
        """
        return {name: getattr(self, name) for name in self.__field_names__}

    def __hash__(self) -> int:
        """
        Get hash value of the matcher instance.

        :return: Hash value based on field values
        :rtype: int
        """
        return hash(self.tuple())

    def _cmpkey(self) -> Tuple[Any, ...]:
        """
        Get comparison key for ordering instances.

        :return: Tuple of field values used for comparison
        :rtype: tuple
        """
        return self.tuple()