"""
File pattern matching module for extracting structured information from filenames.
This module provides a metaclass-based pattern matching system that allows defining
file matchers using template patterns with typed placeholders. It supports automatic
field extraction, type conversion, and file discovery in directories.
The main components are:
- MatcherMeta: Metaclass that processes pattern templates and generates regex patterns
- BaseMatcher: Base class for creating custom file matchers with pattern matching capabilities
Example::
>>> class ImageMatcher(BaseMatcher):
... __pattern__ = "image_<id>_<name>.png"
... id: int
... name: str
>>> matcher = ImageMatcher.match("/path/to/images")
>>> print(matcher.id, matcher.name)
"""
import os
import re
from pathlib import Path
from typing import Optional, List, Dict, Any, Union, Iterator, Tuple
from hbutils.model import IComparable
from natsort import natsorted
class _MatcherMeta(type):
    """
    Metaclass for creating pattern-based file matchers.

    This metaclass reads the ``__pattern__`` attribute and the type annotations of the
    class being created and derives a regular expression (as a source string, not a
    compiled object) plus field metadata from them. Template patterns such as
    ``"file_<id>_<name>.txt"`` are turned into a regex with one capture group per
    placeholder, where the shape of each group depends on the annotated field type.

    The results are stored on the created class as:

    - ``__regexp_pattern__``: the generated regex source string
    - ``__fields__``: mapping of field name to annotated type
    - ``__field_names__``: field names in the order they appear in the pattern
    - ``__field_names_set__``: the same names as a set
    """

    def __new__(cls, *args, **kwargs):
        """
        Create a new matcher class with processed pattern and field information.

        :param args: Positional arguments for type creation
        :type args: tuple
        :param kwargs: Keyword arguments for type creation
        :type kwargs: dict
        :return: New matcher class with the pattern metadata attached
        :rtype: type
        :raises NameError: If placeholders and annotations do not match (see :meth:`_cls_init`)
        """
        instance = super().__new__(cls, *args, **kwargs)
        # ``__annotations__`` may be missing altogether on a class that declares no
        # annotations (depends on the Python version), so ``or {}`` alone is not
        # enough: getattr needs an explicit default as well.
        annotations = getattr(instance, '__annotations__', None) or {}
        instance.__regexp_pattern__, instance.__fields__, instance.__field_names__ = \
            cls._cls_init(instance.__pattern__, annotations)
        instance.__field_names_set__ = set(instance.__field_names__)
        return instance

    @classmethod
    def _cls_init(cls, pattern: str, annotations: Dict[str, type]) -> Tuple[str, Dict[str, type], List[str]]:
        """
        Initialize class-level pattern and field information.

        Parses the template pattern to extract placeholders, validates them against
        the type annotations, and generates a regex source string with one capture
        group per placeholder. Annotations whose names start and end with a double
        underscore (such as ``__pattern__``) are ignored. All literal text between
        placeholders is escaped, so characters like ``.`` or ``(`` in the template
        only ever match themselves.

        :param pattern: Template pattern with placeholders like "file_<id>_<name>.txt"
        :type pattern: str
        :param annotations: Type annotations for fields
        :type annotations: Dict[str, type]
        :return: Tuple of (regex_pattern, fields_dict, field_names_list)
        :rtype: Tuple[str, Dict[str, type], List[str]]
        :raises NameError: If placeholders don't match annotations or vice versa

        Example::

            >>> pattern = "image_<id>_<name>.png"
            >>> annotations = {'id': int, 'name': str}
            >>> regex, fields, names = _MatcherMeta._cls_init(pattern, annotations)
            >>> print(names)
            ['id', 'name']
        """
        # Dunder annotations configure the matcher itself and are not fields.
        annotations = {key: value for key, value in annotations.items()
                       if not (key.startswith('__') and key.endswith('__'))}

        # Splitting on a pattern with a single capture group yields
        # [literal, name, literal, name, ..., literal]: even indexes are literal
        # text, odd indexes are placeholder names.
        segments = re.split(r'<(\w+)>', pattern)
        placeholders = segments[1::2]

        annotated_names, placeholder_names = set(annotations), set(placeholders)
        fields_not_in_pattern = annotated_names - placeholder_names
        if fields_not_in_pattern:
            raise NameError(f'Field {", ".join(natsorted(fields_not_in_pattern))} '
                            f'not included in pattern {pattern!r}.')
        placeholders_not_in_fields = placeholder_names - annotated_names
        if placeholders_not_in_fields:
            raise NameError(f'Placeholder {", ".join(natsorted(placeholders_not_in_fields))} '
                            f'not included in fields {annotations!r}.')

        fields = {}
        regex_parts = []
        for index, segment in enumerate(segments):
            if index % 2 == 0:
                # Literal text: escape it piecewise so that parentheses written in
                # the template can never be confused with the generated groups.
                regex_parts.append(re.escape(segment))
                continue

            field_type = annotations[segment]
            fields[segment] = field_type
            # Generate the capture group based on the field type
            if field_type == int:
                regex_parts.append(r'(\d+?)')
            elif field_type == float:
                regex_parts.append(r'(\d+\.?\d*?)')
            else:  # str or other types
                regex_parts.append(r'([^/\\]+?)')

        return ''.join(regex_parts), fields, placeholders
[docs]
class BaseMatcher(IComparable, metaclass=_MatcherMeta):
    """
    Base class for file pattern matchers.

    This class provides functionality to match files based on template patterns and
    automatically extract typed fields from filenames. Subclasses should define
    ``__pattern__`` and type-annotated fields.

    Instances hash and compare by their extracted field values only (see
    :meth:`tuple`); the file path does not take part.

    :cvar __pattern__: Template pattern with placeholders (e.g., "file_<id>_<name>.txt")
    :vartype __pattern__: str
    :cvar __recursively__: Whether to search directories recursively
    :vartype __recursively__: bool

    Example::

        >>> class LogMatcher(BaseMatcher):
        ...     __pattern__ = "log_<date>_<level>.txt"
        ...     date: str
        ...     level: str
        >>> matcher = LogMatcher.match("/var/logs")
        >>> if matcher:
        ...     print(f"Found log: {matcher.date} - {matcher.level}")
    """

    __pattern__: str = ""
    __recursively__: bool = False

    def __init__(self, full_path: str, **kwargs):
        """
        Initialize matcher instance with extracted field values.

        Besides the fields, the instance exposes ``full_path``, ``file_name``
        (base name of the path) and ``dir_path`` (directory part of the path).

        :param full_path: Complete path to the matched file
        :type full_path: str
        :param kwargs: Extracted field values from the filename
        :type kwargs: Any
        :raises ValueError: If unknown fields are provided or required fields are missing

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> print(matcher.id, matcher.name)
            1 test
        """
        self.full_path = full_path
        self.file_name = os.path.basename(full_path)
        self.dir_path = os.path.dirname(full_path)

        # Unknown fields are reported before missing ones.
        unknown_fields = {key: value for key, value in kwargs.items()
                          if key not in self.__field_names_set__}
        if unknown_fields:
            raise ValueError(f'Unknown fields for class {self.__class__.__name__}: {unknown_fields!r}.')

        excluded_fields = self.__field_names_set__.difference(kwargs)
        if excluded_fields:
            raise ValueError(f'Non-included fields of class {self.__class__.__name__}: {natsorted(excluded_fields)!r}.')

        # Set fields extracted from pattern
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def _convert_value(cls, value: str, target_type: type) -> Any:
        """
        Convert string value to target type.

        Supported targets are ``int``, ``float``, ``bool`` and ``str``. For ``bool``
        the strings ``true``, ``1``, ``yes`` and ``on`` (case-insensitive) map to
        ``True``; everything else maps to ``False``.

        :param value: String value to convert
        :type value: str
        :param target_type: Target type for conversion
        :type target_type: type
        :return: Converted value
        :rtype: Any
        :raises TypeError: If target type is not supported
        :raises ValueError: If the string cannot be parsed as ``int`` or ``float``

        Example::

            >>> BaseMatcher._convert_value("123", int)
            123
            >>> BaseMatcher._convert_value("3.14", float)
            3.14
        """
        if target_type == int:
            return int(value)
        elif target_type == float:
            return float(value)
        elif target_type == bool:
            return value.lower() in ('true', '1', 'yes', 'on')
        elif target_type == str:
            return value
        else:
            raise TypeError(f'Unsupported target type - {target_type!r}.')

    @classmethod
    def _yield_match(cls, directory: Union[str, Path]) -> Iterator['BaseMatcher']:
        """
        Yield all matching file instances in the specified directory.

        Candidates are visited in natural sort order. The pattern must match the
        whole file name, and a file whose captured values cannot be converted to
        the annotated field types is skipped. Nothing is yielded when ``directory``
        is not an existing directory.

        :param directory: Directory to search for matching files
        :type directory: Union[str, Path]
        :return: Iterator of matched file instances
        :rtype: Iterator[BaseMatcher]

        Example::

            >>> for matcher in ImageMatcher._yield_match("/path/to/images"):
            ...     print(matcher.id, matcher.name)
        """
        directory = Path(directory)
        if not directory.is_dir():
            return

        fields, field_order = cls.__fields__, cls.__field_names__
        compiled_pattern = re.compile(cls.__regexp_pattern__)
        # Build search pattern
        search_pattern = "**/*" if getattr(cls, '__recursively__', False) else "*"

        for file_path in natsorted(directory.glob(search_pattern)):
            if not file_path.is_file():
                continue

            # The generated groups are lazy, so the pattern has to be anchored at
            # both ends: a prefix match would cut a trailing placeholder down to
            # its shortest possible capture and accept arbitrary name suffixes.
            match = compiled_pattern.fullmatch(file_path.name)
            if match is None:
                continue

            # Extract field values; capture groups are numbered in pattern order.
            field_values = {}
            for group_index, field_name in enumerate(field_order, start=1):
                try:
                    field_values[field_name] = cls._convert_value(match.group(group_index), fields[field_name])
                except (ValueError, TypeError):
                    # Type conversion failed: abandon this file as a whole. The
                    # ``break`` bypasses the ``else`` clause below, so no instance
                    # is built from an incomplete set of fields.
                    break
            else:
                yield cls(str(file_path), **field_values)

    @classmethod
    def match(cls, directory: Union[str, Path]) -> Optional['BaseMatcher']:
        """
        Match the first file that conforms to the pattern in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: Matched file instance, or None if not found
        :rtype: Optional[BaseMatcher]

        Example::

            >>> matcher = ImageMatcher.match("/path/to/images")
            >>> if matcher:
            ...     print(f"Found: {matcher.full_path}")
        """
        return next(cls._yield_match(directory), None)

    @classmethod
    def match_all(cls, directory: Union[str, Path]) -> List['BaseMatcher']:
        """
        Match all files that conform to the pattern in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: List of matched file instances
        :rtype: List[BaseMatcher]

        Example::

            >>> matchers = ImageMatcher.match_all("/path/to/images")
            >>> print(f"Found {len(matchers)} images")
        """
        return list(cls._yield_match(directory))

    @classmethod
    def exists(cls, directory: Union[str, Path]) -> bool:
        """
        Check if any file matching the pattern exists in the specified directory.

        :param directory: Directory to search
        :type directory: Union[str, Path]
        :return: True if matching file exists, False otherwise
        :rtype: bool

        Example::

            >>> if ImageMatcher.exists("/path/to/images"):
            ...     print("Images found!")
        """
        return cls.match(directory) is not None

    def __str__(self) -> str:
        """
        Get string representation of the matcher instance.

        Every name annotated on the class that is available as an attribute of the
        instance is listed as ``name=value``, followed by ``full_path``.

        :return: String representation showing field values and full path
        :rtype: str
        """
        # Explicit default: the attribute may be absent on classes without annotations.
        annotations = getattr(self.__class__, '__annotations__', None) or {}
        field_info = [f"{field_name}={getattr(self, field_name)!r}"
                      for field_name in annotations if hasattr(self, field_name)]
        field_info.append(f"full_path={self.full_path!r}")
        field_str = ", ".join(field_info)
        return f"{self.__class__.__name__}({field_str})"

    def __repr__(self) -> str:
        """
        Get representation string of the matcher instance.

        :return: Representation string, identical to :meth:`__str__`
        :rtype: str
        """
        return self.__str__()

    def tuple(self) -> Tuple[Any, ...]:
        """
        Get field values as a tuple.

        :return: Tuple of field values, ordered as the placeholders appear in the pattern
        :rtype: tuple

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> matcher.tuple()
            (1, 'test')
        """
        return tuple(getattr(self, name) for name in self.__field_names__)

    def dict(self) -> Dict[str, Any]:
        """
        Get field values as a dictionary.

        :return: Dictionary mapping field names to values
        :rtype: dict

        Example::

            >>> matcher = ImageMatcher("/path/image_001_test.png", id=1, name="test")
            >>> matcher.dict()
            {'id': 1, 'name': 'test'}
        """
        return {name: getattr(self, name) for name in self.__field_names__}

    def __hash__(self) -> int:
        """
        Get hash value of the matcher instance.

        :return: Hash value based on field values
        :rtype: int
        """
        return hash(self.tuple())

    def _cmpkey(self) -> Tuple[Any, ...]:
        """
        Get comparison key for ordering instances.

        :return: Tuple of field values used for comparison
        :rtype: tuple
        """
        return self.tuple()