Comparators

`stickler.comparators`

Common comparators for key information evaluation.

This package contains comparators that are shared between the traditional and ANLS Star evaluation systems. These comparators implement a unified interface that works with both systems.

`stickler.comparators.BaseComparator`

Bases: ABC

Base class for all comparators.

This class defines the interface that all comparators must implement. Comparators are used to compare two values and return a similarity score between 0.0 and 1.0, where 1.0 means the values are identical.

Source code in stickler/comparators/base.py

class BaseComparator(ABC):
    """Abstract foundation shared by every comparator.

    A comparator scores how alike two values are on a 0.0-1.0 scale
    (1.0 meaning identical). Subclasses supply the scoring logic in
    ``compare``; this base class adds call syntax, a thresholded binary
    decision, and string representations.
    """

    def __init__(self, threshold: float = 0.7):
        """Store the similarity threshold.

        Args:
            threshold: Minimum score (0.0-1.0) that ``binary_compare``
                counts as a match. The value is stored as given.
        """
        self.threshold = threshold

    @abstractmethod
    def compare(self, str1: Any, str2: Any) -> float:
        """Score the similarity of two values.

        Args:
            str1: First value
            str2: Second value

        Returns:
            Similarity score between 0.0 and 1.0
        """

    def __call__(self, str1: Any, str2: Any) -> float:
        """Allow ``comparator(a, b)`` as shorthand for ``comparator.compare(a, b)``.

        Args:
            str1: First value
            str2: Second value

        Returns:
            Similarity score between 0.0 and 1.0
        """
        similarity = self.compare(str1, str2)
        return similarity

    def binary_compare(self, str1: Any, str2: Any) -> Tuple[int, int]:
        """Turn the similarity score into a (tp, fp) decision.

        The score from ``compare`` is checked against ``self.threshold``.
        A score at or above the threshold yields ``(1, 0)`` (true positive);
        anything lower yields ``(0, 1)`` (false positive).

        Args:
            str1: First value
            str2: Second value

        Returns:
            Tuple of (tp, fp); exactly one of the two entries is 1
        """
        is_match = self.compare(str1, str2) >= self.threshold
        return (1, 0) if is_match else (0, 1)

    def __str__(self) -> str:
        """Return the bare class name (used for serialization)."""
        return self.__class__.__name__

    def __repr__(self) -> str:
        """Return the class name together with the configured threshold."""
        cls_name = self.__class__.__name__
        return f"{cls_name}(threshold={self.threshold})"

`__call__(str1, str2)`

Make the comparator callable.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First value	required
`str2`	`Any`	Second value	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0

Source code in stickler/comparators/base.py

def __call__(self, str1: Any, str2: Any) -> float:
    """Make the comparator callable by delegating to ``compare``.

    ``comparator(a, b)`` returns exactly what ``comparator.compare(a, b)``
    returns.

    Args:
        str1: First value
        str2: Second value

    Returns:
        Similarity score between 0.0 and 1.0
    """
    return self.compare(str1, str2)

`__init__(threshold=0.7)`

Initialize the comparator.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Similarity threshold (0.0-1.0)	`0.7`

Source code in stickler/comparators/base.py

def __init__(self, threshold: float = 0.7):
    """Initialize the comparator.

    Args:
        threshold: Similarity threshold (0.0-1.0). Stored as given on
            ``self.threshold``; no range validation is performed here.
    """
    self.threshold = threshold

`__repr__()`

Detailed string representation.

Source code in stickler/comparators/base.py

def __repr__(self) -> str:
    """Detailed string representation: class name plus the configured threshold."""
    return f"{self.__class__.__name__}(threshold={self.threshold})"

`__str__()`

String representation for serialization.

Source code in stickler/comparators/base.py

def __str__(self) -> str:
    """String representation for serialization: the bare class name."""
    return self.__class__.__name__

`binary_compare(str1, str2)`

Compare two values and return a binary result as (tp, fp) tuple.

This method converts the continuous similarity score to a binary decision based on the threshold. If the similarity is greater than or equal to the threshold, it returns (1, 0) indicating true positive. Otherwise, it returns (0, 1) indicating false positive.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First value	required
`str2`	`Any`	Second value	required

Returns:

Type	Description
`Tuple[int, int]`	Tuple of (tp, fp) where tp is 1 if similar, 0 otherwise, and fp is the opposite

Source code in stickler/comparators/base.py

def binary_compare(self, str1: Any, str2: Any) -> Tuple[int, int]:
    """Turn the similarity score into a (tp, fp) decision.

    The score from ``self.compare`` is checked against ``self.threshold``.
    A score at or above the threshold yields ``(1, 0)`` (true positive);
    anything lower yields ``(0, 1)`` (false positive).

    Args:
        str1: First value
        str2: Second value

    Returns:
        Tuple of (tp, fp); exactly one of the two entries is 1
    """
    is_match = self.compare(str1, str2) >= self.threshold
    return (1, 0) if is_match else (0, 1)

`compare(str1, str2)` `abstractmethod`

Compare two values and return a similarity score.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First value	required
`str2`	`Any`	Second value	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0

Source code in stickler/comparators/base.py

@abstractmethod
def compare(self, str1: Any, str2: Any) -> float:
    """Score the similarity of two values.

    This is the abstract hook every concrete comparator overrides; the
    base version carries no implementation.

    Args:
        str1: First value
        str2: Second value

    Returns:
        Similarity score between 0.0 and 1.0
    """

`stickler.comparators.ExactComparator`

Bases: BaseComparator

Comparator that checks for exact string matching.

This comparator removes whitespace and punctuation before comparison. It returns 1.0 for exact matches and 0.0 otherwise.

Example

comparator = ExactComparator()

# Returns 1.0 (exact match after normalization)
comparator.compare("hello, world!", "hello world")

# Returns 0.0 (different strings)
comparator.compare("hello", "goodbye")

Source code in stickler/comparators/exact.py

class ExactComparator(BaseComparator):
    """All-or-nothing comparator for normalized string equality.

    Both values are converted to strings, lowercased unless
    ``case_sensitive`` is set, and stripped of whitespace and punctuation.
    The score is 1.0 when the normalized forms are equal and 0.0 otherwise.

    Example:
        ```python
        comparator = ExactComparator()

        # Returns 1.0 (exact match after normalization)
        comparator.compare("hello, world!", "hello world")

        # Returns 0.0 (different strings)
        comparator.compare("hello", "goodbye")
        ```
    """

    def __init__(self, threshold: float = 1.0, case_sensitive: bool = False):
        """Set up the comparator.

        Args:
            threshold: Similarity threshold (default 1.0)
            case_sensitive: Keep letter case significant when True
                (default False)
        """
        super().__init__(threshold=threshold)
        self.case_sensitive = case_sensitive

    def compare(self, str1: Any, str2: Any) -> float:
        """Score two values by exact match after normalization.

        Args:
            str1: First value
            str2: Second value

        Returns:
            1.0 if the normalized strings are equal (or both inputs are
            None), 0.0 otherwise
        """
        # A missing value only matches another missing value.
        if str1 is None or str2 is None:
            return 1.0 if str1 is str2 else 0.0

        left = str(str1)
        right = str(str2)

        if not self.case_sensitive:
            left = lowercase(left)
            right = lowercase(right)

        cleaned_left = strip_punctuation_space(left)
        cleaned_right = strip_punctuation_space(right)

        if cleaned_left == cleaned_right:
            return 1.0
        return 0.0

`__init__(threshold=1.0, case_sensitive=False)`

Initialize the comparator.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Similarity threshold (default 1.0)	`1.0`
`case_sensitive`	`bool`	Whether comparison is case sensitive (default False)	`False`

Source code in stickler/comparators/exact.py

def __init__(self, threshold: float = 1.0, case_sensitive: bool = False):
    """Initialize the comparator.

    Args:
        threshold: Similarity threshold (default 1.0), passed on to the
            base class initializer
        case_sensitive: Whether comparison is case sensitive (default False).
            Stored on ``self.case_sensitive``.
    """
    super().__init__(threshold=threshold)
    self.case_sensitive = case_sensitive

`compare(str1, str2)`

Compare two values with exact string matching.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First value	required
`str2`	`Any`	Second value	required

Returns:

Type	Description
`float`	1.0 if the strings match exactly after normalization, 0.0 otherwise

Source code in stickler/comparators/exact.py

def compare(self, str1: Any, str2: Any) -> float:
    """Score two values by exact match after normalization.

    Args:
        str1: First value
        str2: Second value

    Returns:
        1.0 if the normalized strings are equal (or both inputs are None),
        0.0 otherwise
    """
    # A missing value only matches another missing value.
    if str1 is None or str2 is None:
        return 1.0 if str1 is str2 else 0.0

    left = str(str1)
    right = str(str2)

    if not self.case_sensitive:
        left = lowercase(left)
        right = lowercase(right)

    cleaned_left = strip_punctuation_space(left)
    cleaned_right = strip_punctuation_space(right)

    if cleaned_left == cleaned_right:
        return 1.0
    return 0.0

`stickler.comparators.LevenshteinComparator`

Bases: BaseComparator

Comparator using Levenshtein distance for string similarity.

This class implements the Levenshtein distance algorithm for measuring the difference between two strings. It calculates a normalized similarity score between 0 and 1.

Source code in stickler/comparators/levenshtein.py

class LevenshteinComparator(BaseComparator):
    """Edit-distance based string similarity.

    The Levenshtein distance between the two inputs is divided by the
    length of the longer one and subtracted from 1, giving a similarity
    score between 0 and 1.
    """

    def __init__(self, normalize: bool = True, threshold: float = 0.7):
        """Set up the comparator.

        Args:
            normalize: When True, inputs are stripped, lowercased and have
                whitespace runs collapsed to single spaces before comparison
            threshold: Similarity threshold (default 0.7)
        """
        super().__init__(threshold=threshold)
        self._normalize = normalize

    @property
    def name(self) -> str:
        """Identifier of this comparator."""
        return "levenshtein"

    @property
    def config(self) -> Optional[Dict[str, Any]]:
        """Configuration parameters of this instance."""
        return {"normalize": self._normalize}

    def compare(self, s1: Any, s2: Any) -> float:
        """
        Score two values by normalized Levenshtein distance.

        Args:
            s1: First string or value
            s2: Second string or value

        Returns:
            Similarity score between 0.0 and 1.0, with 1.0 indicating identical

        Raises:
            TypeError: If either input is a dictionary; dictionaries should be
                      modelled as structured models rather than compared as
                      flat strings.
        """
        if any(isinstance(candidate, dict) for candidate in (s1, s2)):
            raise TypeError(
                "Dictionary objects cannot be compared using LevenshteinComparator. "
                "Use a StructuredModel subclass with properly defined fields instead."
            )

        # None is treated as the empty string; everything else is stringified.
        left = "" if s1 is None else str(s1)
        right = "" if s2 is None else str(s2)

        if self._normalize:
            left, right = (
                " ".join(text.strip().lower().split()) for text in (left, right)
            )

        # Two empty strings are identical; this also guards the division below.
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0

        edits = self._levenshtein_distance(left, right)
        return 1.0 - (float(edits) / float(longest))

    @staticmethod
    def _levenshtein_distance(s1: str, s2: str) -> int:
        """
        Compute the Levenshtein distance with a rolling single-row table.

        Args:
            s1: First string
            s2: Second string

        Returns:
            The Levenshtein distance as an integer
        """
        # Keep the row as short as possible by iterating rows over the longer string.
        shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)

        previous = list(range(len(shorter) + 1))
        for row, long_char in enumerate(longer, start=1):
            current = [row]
            for col, short_char in enumerate(shorter):
                if short_char == long_char:
                    cost = previous[col]
                else:
                    cost = 1 + min(previous[col], previous[col + 1], current[-1])
                current.append(cost)
            previous = current
        return previous[-1]

`config` `property`

Return configuration parameters.

`name` `property`

Return the name of the comparator.

`__init__(normalize=True, threshold=0.7)`

Initialize the comparator.

Parameters:

Name	Type	Description	Default
`normalize`	`bool`	Whether to normalize input strings (strip leading/trailing whitespace, lowercase, and collapse runs of internal whitespace to a single space) before comparison	`True`
`threshold`	`float`	Similarity threshold (default 0.7)	`0.7`

Source code in stickler/comparators/levenshtein.py

def __init__(self, normalize: bool = True, threshold: float = 0.7):
    """Initialize the comparator.

    Args:
        normalize: Whether to normalize input strings before comparison.
            Stored on ``self._normalize``.
        threshold: Similarity threshold (default 0.7), passed on to the
            base class initializer
    """
    super().__init__(threshold=threshold)
    self._normalize = normalize

`compare(s1, s2)`

Compare two strings using Levenshtein distance.

Parameters:

Name	Type	Description	Default
`s1`	`Any`	First string or value	required
`s2`	`Any`	Second string or value	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0, with 1.0 indicating identical

Raises:

Type	Description
`TypeError`	If either input is a dictionary, as dictionaries are not suitable for Levenshtein distance comparison and should be handled through structured models instead.

Source code in stickler/comparators/levenshtein.py

def compare(self, s1: Any, s2: Any) -> float:
    """
    Score two values by normalized Levenshtein distance.

    Args:
        s1: First string or value
        s2: Second string or value

    Returns:
        Similarity score between 0.0 and 1.0, with 1.0 indicating identical

    Raises:
        TypeError: If either input is a dictionary; dictionaries should be
                  modelled as structured models rather than compared as
                  flat strings.
    """
    if any(isinstance(candidate, dict) for candidate in (s1, s2)):
        raise TypeError(
            "Dictionary objects cannot be compared using LevenshteinComparator. "
            "Use a StructuredModel subclass with properly defined fields instead."
        )

    # None is treated as the empty string; everything else is stringified.
    left = "" if s1 is None else str(s1)
    right = "" if s2 is None else str(s2)

    if self._normalize:
        left, right = (
            " ".join(text.strip().lower().split()) for text in (left, right)
        )

    # Two empty strings are identical; this also guards the division below.
    longest = max(len(left), len(right))
    if longest == 0:
        return 1.0

    edits = self._levenshtein_distance(left, right)
    return 1.0 - (float(edits) / float(longest))

`stickler.comparators.NumericComparator`

Bases: BaseComparator

Comparator for numeric values with configurable tolerance.

This comparator extracts and compares numeric values from strings or numbers. It supports relative and absolute tolerance for comparison.

Example

# Default exact matching
exact = NumericComparator()
exact.compare("123", "123.0")  # Returns 1.0
exact.compare("123", "124")    # Returns 0.0

# With tolerance
approx = NumericComparator(relative_tolerance=0.1)  # 10% tolerance
approx.compare("100", "109")   # Returns 1.0 (within 10%)
approx.compare("100", "111")   # Returns 0.0 (beyond 10%)

Source code in stickler/comparators/numeric.py

class NumericComparator(BaseComparator):
    """Comparator for numeric values with configurable tolerance.

    This comparator extracts and compares numeric values from strings or numbers.
    It supports relative and absolute tolerance for comparison. Values are
    compared as ``Decimal``; a pair matches when it is exactly equal, within
    the relative tolerance (measured against the first value), or within the
    absolute tolerance.

    Example:
        ```python
        # Default exact matching
        exact = NumericComparator()
        exact.compare("123", "123.0")  # Returns 1.0
        exact.compare("123", "124")    # Returns 0.0

        # With tolerance
        approx = NumericComparator(relative_tolerance=0.1)  # 10% tolerance
        approx.compare("100", "109")   # Returns 1.0 (within 10%)
        approx.compare("100", "111")   # Returns 0.0 (beyond 10%)
        ```
    """

    def __init__(
        self,
        threshold: float = 1.0,
        relative_tolerance: float = 0.0,
        absolute_tolerance: float = 0.0,
        tolerance: Optional[float] = None,
    ):
        """Initialize the comparator.

        Args:
            threshold: Similarity threshold (default 1.0)
            relative_tolerance: Relative tolerance for comparison (default 0.0)
            absolute_tolerance: Absolute tolerance for comparison (default 0.0)
            tolerance: Alias for absolute_tolerance (for backward compatibility)

        Raises:
            ValueError: If both ``tolerance`` and a non-zero
                ``absolute_tolerance`` are supplied.
        """
        super().__init__(threshold=threshold)
        self.relative_tolerance = relative_tolerance

        # Handle tolerance alias for backward compatibility
        if tolerance is not None:
            if absolute_tolerance != 0.0:
                raise ValueError(
                    "Cannot specify both 'tolerance' and 'absolute_tolerance'. Use 'absolute_tolerance'."
                )
            self.absolute_tolerance = tolerance
        else:
            self.absolute_tolerance = absolute_tolerance

    @property
    def config(self) -> Optional[Dict[str, Any]]:
        """Return configuration parameters for serialization.

        Only non-zero tolerances are included; ``None`` is returned when
        both tolerances are zero.
        """
        config = {}
        if self.relative_tolerance != 0.0:
            config["relative_tolerance"] = self.relative_tolerance
        if self.absolute_tolerance != 0.0:
            config["absolute_tolerance"] = self.absolute_tolerance
        return config or None

    def compare(self, str1: Any, str2: Any) -> float:
        """Compare two values numerically.

        Args:
            str1: First value (the base for the relative tolerance)
            str2: Second value

        Returns:
            1.0 if the numbers match within tolerance (or both inputs are
            None), 0.0 otherwise, including when either value is None or no
            number can be extracted from it
        """
        if str1 is None and str2 is None:
            return 1.0
        if str1 is None or str2 is None:
            return 0.0

        # Extract numeric values
        num1 = self._extract_number(str1)
        num2 = self._extract_number(str2)

        if num1 is None or num2 is None:
            return 0.0

        # Check equality with tolerance
        if self._numbers_equal(num1, num2):
            return 1.0

        return 0.0

    def _extract_number(self, value: Any) -> Union[Decimal, None]:
        """Extract a numeric value from a string or number.

        Args:
            value: Value to extract a number from

        Returns:
            Decimal value or None if no valid number could be extracted
        """
        # A Decimal is already in the target type. Round-tripping it through
        # str() and the character filter below would corrupt scientific
        # notation (e.g. "1E+3" would be reduced to "13").
        if isinstance(value, Decimal):
            return value

        # bool is a subclass of int, but str(True) is "True", which Decimal
        # rejects with InvalidOperation. Let booleans take the string path,
        # where they end up as "no number found".
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return Decimal(str(value))

        text = value if isinstance(value, str) else str(value)
        # Surrounding whitespace must not hide the accounting parentheses.
        text = text.strip()

        # Check for accounting notation: (123) means -123
        is_negative = text.startswith("(") and text.endswith(")")
        if is_negative:
            text = text[1:-1]  # Remove the parentheses

        # Remove common currency symbols and other non-numeric characters
        text = re.sub(r"[^0-9.-]", "", text)

        # Handle empty string
        if not text:
            return None

        # Try to convert to Decimal
        try:
            decimal_value = Decimal(text)
        except InvalidOperation:
            return None

        # Apply negative sign if accounting notation was used
        return -decimal_value if is_negative else decimal_value

    def _numbers_equal(self, num1: Decimal, num2: Decimal) -> bool:
        """Check if two numbers are equal within tolerance.

        Args:
            num1: First number (the base for the relative tolerance)
            num2: Second number

        Returns:
            True if numbers are equal within tolerance, False otherwise
        """
        if num1 == num2:
            return True

        # NaN and infinities cannot be "close" to anything, and ordering
        # comparisons or inf/inf arithmetic on them raise InvalidOperation.
        if not (num1.is_finite() and num2.is_finite()):
            return False

        difference = abs(num1 - num2)

        # Check with relative tolerance
        if self.relative_tolerance > 0:
            if num1 == 0:
                # No base to scale by: compare the other magnitude directly.
                # Fall through on failure so the absolute tolerance still applies.
                if abs(num2) <= self.relative_tolerance:
                    return True
            elif difference / abs(num1) <= self.relative_tolerance:
                return True

        # Check with absolute tolerance
        if self.absolute_tolerance > 0:
            if difference <= self.absolute_tolerance:
                return True

        return False

`config` `property`

Return configuration parameters for serialization.

`__init__(threshold=1.0, relative_tolerance=0.0, absolute_tolerance=0.0, tolerance=None)`

Initialize the comparator.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Similarity threshold (default 1.0)	`1.0`
`relative_tolerance`	`float`	Relative tolerance for comparison (default 0.0)	`0.0`
`absolute_tolerance`	`float`	Absolute tolerance for comparison (default 0.0)	`0.0`
`tolerance`	`Optional[float]`	Alias for absolute_tolerance (for backward compatibility)	`None`

Source code in stickler/comparators/numeric.py

def __init__(
    self,
    threshold: float = 1.0,
    relative_tolerance: float = 0.0,
    absolute_tolerance: float = 0.0,
    tolerance: Optional[float] = None,
):
    """Set up the comparator.

    Args:
        threshold: Similarity threshold (default 1.0)
        relative_tolerance: Relative tolerance for comparison (default 0.0)
        absolute_tolerance: Absolute tolerance for comparison (default 0.0)
        tolerance: Alias for absolute_tolerance (for backward compatibility)

    Raises:
        ValueError: If both ``tolerance`` and a non-zero
            ``absolute_tolerance`` are supplied.
    """
    super().__init__(threshold=threshold)
    self.relative_tolerance = relative_tolerance

    # Without the legacy alias, the explicit absolute tolerance is used as-is.
    if tolerance is None:
        self.absolute_tolerance = absolute_tolerance
        return

    # The alias and the real parameter are mutually exclusive.
    if absolute_tolerance != 0.0:
        raise ValueError(
            "Cannot specify both 'tolerance' and 'absolute_tolerance'. Use 'absolute_tolerance'."
        )
    self.absolute_tolerance = tolerance

`compare(str1, str2)`

Compare two values numerically.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First value	required
`str2`	`Any`	Second value	required

Returns:

Type	Description
`float`	1.0 if the numbers match within tolerance, 0.0 otherwise

Source code in stickler/comparators/numeric.py

def compare(self, str1: Any, str2: Any) -> float:
    """Score two values by numeric equality within tolerance.

    Args:
        str1: First value
        str2: Second value

    Returns:
        1.0 if the numbers match within tolerance (or both inputs are None),
        0.0 otherwise
    """
    # A missing value only matches another missing value.
    if str1 is None or str2 is None:
        return 1.0 if str1 is str2 else 0.0

    first = self._extract_number(str1)
    second = self._extract_number(str2)

    # No usable number on either side means no match.
    if first is None or second is None:
        return 0.0

    return 1.0 if self._numbers_equal(first, second) else 0.0

`stickler.comparators.NumericExactC = NumericComparator` `module-attribute`

`stickler.comparators.DateComparator`

Bases: BaseComparator

Deterministic date comparator with year/range awareness.

See docs/docs/Guides/Comparators/date-comparator.md for the full behavior reference, configuration matrix, and corner cases.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Forwarded to `BaseComparator`.	`1.0`
`tolerance`	`Optional[Union[timedelta, int, float]]`	Optional window for Tier 1 single-vs-single comparisons only (range and partial-year branches ignore it). Accepts a `timedelta` or a numeric value in days. A whole-day tolerance floors both sides to the calendar day (time ignored); a sub-day tolerance (e.g. `1.5` = 36h) compares actual timestamps. Defaults to `None`, which is normalized to `timedelta(0)` (same calendar day).	`None`
`dayfirst`	`Optional[bool]`	How to interpret ambiguous numeric dates like `"01/02/2025"`. `None` (default) tries both interpretations and takes the better-matching score; `True` forces day-first; `False` forces month-first.	`None`
`allow_partial_year`	`bool`	If `True`, year-less ↔ year-bearing pairs with matching month/day score `0.7`. Default `False`.	`False`
`range_mode`	`RangeMode`	How range comparisons are scored. One of `"strict"`, `"reject"`, `"contains"`, `"graded"` (default).	`'graded'`
`precision_mode`	`PrecisionMode`	How month/day resolution mismatches are scored (`"Jan 2024"` vs `"Jan 1, 2024"`). The first argument to `compare()` is treated as ground truth. `"exact"` (default): both sides must share the same resolution; a fabricated or dropped month/day is a miss. `"gt_loose"`: the prediction may be finer than the ground truth (extra precision ignored if consistent at the ground truth's grain) but not coarser. `"overlap"`: symmetric — either side may be coarser, as long as they agree on every field both sides specify.	`'exact'`

Source code in stickler/comparators/date.py

class DateComparator(BaseComparator):
    """Deterministic date comparator with year/range awareness.

    See ``docs/docs/Guides/Comparators/date-comparator.md`` for the full
    behavior reference, configuration matrix, and corner cases.

    Args:
        threshold: Forwarded to :class:`BaseComparator`.
        tolerance: Optional window for Tier 1 single-vs-single
            comparisons only (range and partial-year branches ignore it).
            Accepts a ``timedelta`` or a numeric value in days. A
            whole-day tolerance floors both sides to the calendar day
            (time ignored); a sub-day tolerance (e.g. ``1.5`` = 36h)
            compares actual timestamps. Defaults to ``None``, which is
            normalized to ``timedelta(0)`` (same calendar day).
        dayfirst: How to interpret ambiguous numeric dates like
            ``"01/02/2025"``. ``None`` (default) tries both
            interpretations and takes the better-matching score; ``True``
            forces day-first; ``False`` forces month-first.
        allow_partial_year: If ``True``, year-less ↔ year-bearing pairs
            with matching month/day score ``0.7``. Default ``False``.
        range_mode: How range comparisons are scored. One of
            ``"strict"``, ``"reject"``, ``"contains"``, ``"graded"``
            (default).
        precision_mode: How month/day *resolution* mismatches are scored
            (``"Jan 2024"`` vs ``"Jan 1, 2024"``). The first argument to
            :meth:`compare` is treated as ground truth.

            - ``"exact"`` (default): both sides must share the same
              resolution; a fabricated or dropped month/day is a miss.
            - ``"gt_loose"``: the prediction may be *finer* than the
              ground truth (extra precision ignored if consistent at the
              ground truth's grain) but not coarser.
            - ``"overlap"``: symmetric — either side may be coarser, as
              long as they agree on every field both sides specify.
    """

    def __init__(
        self,
        threshold: float = 1.0,
        tolerance: Optional[Union[timedelta, int, float]] = None,
        dayfirst: Optional[bool] = None,
        allow_partial_year: bool = False,
        range_mode: RangeMode = "graded",
        precision_mode: PrecisionMode = "exact",
    ):
        """Validate the configuration and store it on the instance.

        Raises:
            ImportError: If python-dateutil is not installed.
            ValueError: If ``dayfirst``, ``range_mode``, ``precision_mode``
                or ``tolerance`` is not an accepted value, or if
                ``tolerance`` is negative.
        """
        super().__init__(threshold=threshold)

        if not _DATEUTIL_AVAILABLE:
            raise ImportError(
                "The python-dateutil library is required for DateComparator. "
                "Install it with: pip install python-dateutil"
            )

        if dayfirst not in (None, True, False):
            raise ValueError(
                f"dayfirst must be None, True, or False; got {dayfirst!r}"
            )

        if range_mode not in _VALID_RANGE_MODES:
            raise ValueError(
                f"range_mode must be one of {_VALID_RANGE_MODES}; "
                f"got {range_mode!r}"
            )

        if precision_mode not in _VALID_PRECISION_MODES:
            raise ValueError(
                f"precision_mode must be one of {_VALID_PRECISION_MODES}; "
                f"got {precision_mode!r}"
            )

        # bool is an int subclass, so it has to be turned away before the
        # numeric branch; otherwise True/False would quietly become a
        # 1-day / 0-day window.
        if isinstance(tolerance, bool):
            raise ValueError(
                "tolerance must be a timedelta or a numeric value in days; "
                f"got bool {tolerance!r}"
            )

        # Plain numbers mean "days", which keeps the option expressible in
        # JSON-schema configs where a timedelta literal cannot be written.
        if tolerance is None:
            window = timedelta(0)
        elif isinstance(tolerance, timedelta):
            window = tolerance
        elif isinstance(tolerance, (int, float)):
            window = timedelta(days=tolerance)
        else:
            raise ValueError(
                "tolerance must be a timedelta or a numeric value in days; "
                f"got {type(tolerance).__name__}"
            )

        self.tolerance = window
        if window < timedelta(0):
            raise ValueError("tolerance must be non-negative")

        self.dayfirst = dayfirst
        self.allow_partial_year = allow_partial_year
        self.range_mode = range_mode
        self.precision_mode = precision_mode

    @property
    def config(self) -> Optional[dict]:
        """Round-trippable config for JSON-schema export.

        Only non-default values are emitted, and an all-default instance
        returns ``None`` — matching ``NumericComparator.config`` and
        keeping a redundant ``x-aws-stickler-comparator-config`` block out
        of every exported schema (the exporter keys off truthiness).

        Tolerance is exported as days (an int when the timedelta is a
        whole number of days, otherwise a float) so it can survive a
        JSON round-trip.
        """
        cfg: dict = {}
        if self.dayfirst is not None:
            cfg["dayfirst"] = self.dayfirst
        if self.allow_partial_year:
            cfg["allow_partial_year"] = self.allow_partial_year
        if self.range_mode != "graded":
            cfg["range_mode"] = self.range_mode
        if self.precision_mode != "exact":
            cfg["precision_mode"] = self.precision_mode
        if self.tolerance != timedelta(0):
            seconds = self.tolerance.total_seconds()
            days = seconds / 86400
            cfg["tolerance"] = int(days) if days.is_integer() else days
        return cfg or None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def compare(self, str1: Any, str2: Any) -> float:
        """Score two date values per the tier system documented above."""
        if str1 is None and str2 is None:
            return 1.0
        if str1 is None or str2 is None:
            return 0.0

        # Resolve dayfirst pairwise. ``None`` means "try both
        # interpretations and take the best score" — that way a string
        # whose layout is genuinely ambiguous in isolation can still
        # match if one consistent interpretation lines up.
        #
        # A malformed value must never crash an evaluation run, so any
        # datetime-comparison edge (e.g. mixed tz-awareness the alignment
        # helpers didn't anticipate) degrades to 0.0 like every other
        # parse failure. The range/single paths align timezones inline;
        # this is a backstop, not the primary defense.
        try:
            if self.dayfirst is not None:
                return self._compare_with_dayfirst(str1, str2, self.dayfirst)

            return max(
                self._compare_with_dayfirst(str1, str2, False),
                self._compare_with_dayfirst(str1, str2, True),
            )
        except TypeError:
            return 0.0

    def _compare_with_dayfirst(
        self, str1: Any, str2: Any, dayfirst: bool
    ) -> float:
        """Run the tier dispatch with ``dayfirst`` pinned to one value."""
        a = self._parse(str1, dayfirst=dayfirst)
        b = self._parse(str2, dayfirst=dayfirst)
        if a is None or b is None:
            return 0.0

        a_is_range = isinstance(a, _ParsedRange)
        b_is_range = isinstance(b, _ParsedRange)

        # ``reject`` mode: any range input zeros out the comparison.
        if self.range_mode == "reject" and (a_is_range or b_is_range):
            return 0.0

        # Tier 4b: range vs range
        if a_is_range and b_is_range:
            return self._compare_range_range(a, b)

        # Tier 4: range vs single. ``a`` is always the ground truth (first
        # compare() argument); track whether it's the single side so the
        # directional precision gate (gt_loose) orients correctly.
        if a_is_range or b_is_range:
            single = b if a_is_range else a  # type: ignore[assignment]
            rng = a if a_is_range else b  # type: ignore[assignment]
            return self._compare_range_single(
                rng, single, single_is_gt=not a_is_range
            )

        # Both singles
        return self._compare_singles(a, b)

    # ------------------------------------------------------------------
    # Tier dispatch
    # ------------------------------------------------------------------

    def _compare_range_range(
        self, a: _ParsedRange, b: _ParsedRange
    ) -> float:
        """Tier 4b: range vs range under the configured range_mode."""
        # Year-presence consistency on both endpoints of both sides.
        # If endpoints disagree on year-presence within a side it's
        # malformed; we treat that as a 0.0 rather than try to repair.
        if a.start.has_year != a.end.has_year:
            return 0.0
        if b.start.has_year != b.end.has_year:
            return 0.0

        # Month/day resolution gate, per endpoint (precision_mode).
        if not self._resolution_ok(a.start, b.start):
            return 0.0
        if not self._resolution_ok(a.end, b.end):
            return 0.0

        year_match = a.start.has_year == b.start.has_year
        partial_year_multiplier = self._partial_year_multiplier(year_match)
        if partial_year_multiplier == 0.0:
            return 0.0

        # When year-presence differs (only reachable under
        # allow_partial_year=True), the year-less side's year is a
        # fictional 1900 placeholder, so endpoint equality and overlap are
        # judged on (month, day) only — mirroring the range-vs-single
        # m/d fallback. Otherwise compare full dates.
        if self.range_mode in ("strict", "contains"):
            if year_match:
                endpoints_match = self._dates_equal_day(
                    a.start.dt, b.start.dt
                ) and self._dates_equal_day(a.end.dt, b.end.dt)
            else:
                endpoints_match = self._md_equal(
                    a.start.dt, b.start.dt
                ) and self._md_equal(a.end.dt, b.end.dt)
            if endpoints_match:
                return 1.0 * partial_year_multiplier
            return 0.0

        # graded → Jaccard (reject mode is handled before we get here)
        jaccard = self._jaccard(a, b) if year_match else self._md_jaccard(a, b)
        return jaccard * partial_year_multiplier

    def _compare_range_single(
        self, rng: _ParsedRange, single: _ParsedSingle, single_is_gt: bool
    ) -> float:
        """Tier 4: range-vs-single under the configured range_mode.

        ``single_is_gt`` records whether the single side was the ground
        truth (the first :meth:`compare` argument), so the directional
        precision gate (``gt_loose``) is oriented the same way it is in the
        single-vs-single and range-vs-range paths.
        """
        if self.range_mode == "strict":
            return 0.0

        # Month/day resolution gate (precision_mode), applied per endpoint
        # with ground truth in the correct position — mirroring
        # _compare_range_range. Without this a reduced-precision single
        # (e.g. 'Jan 2024', whose day is fabricated to the 1st) would land
        # inside a day-grain range and score credit even under the default
        # 'exact' mode, the same score-inflating fabrication the gate
        # exists to refuse on the single-vs-single path.
        if single_is_gt:
            resolution_ok = self._resolution_ok(
                single, rng.start
            ) and self._resolution_ok(single, rng.end)
        else:
            resolution_ok = self._resolution_ok(
                rng.start, single
            ) and self._resolution_ok(rng.end, single)
        if not resolution_ok:
            return 0.0

        # Year-presence consistency: the range's endpoints must agree
        # internally, and we compare against the single's claim.
        if rng.start.has_year != rng.end.has_year:
            return 0.0
        year_match = rng.start.has_year == single.has_year
        partial_year_multiplier = self._partial_year_multiplier(year_match)
        if partial_year_multiplier == 0.0:
            return 0.0

        # Containment: when both sides agree on year-presence we compare
        # the full datetimes; when they disagree (only possible under
        # allow_partial_year=True) the year on the year-less side is a
        # fictional 1900 placeholder, so we compare on (month, day) only.
        if year_match:
            # Normalize all three to comparable naive days: endpoints and
            # the single may differ in tz-awareness.
            s = self._normalize_day(single.dt)
            lo = self._normalize_day(rng.start.dt)
            hi = self._normalize_day(rng.end.dt)
            inside = lo <= s <= hi
        else:
            inside = self._md_in_md_range(
                (single.dt.month, single.dt.day),
                (rng.start.dt.month, rng.start.dt.day),
                (rng.end.dt.month, rng.end.dt.day),
            )

        if inside:
            base = (
                1.0 if self.range_mode == "contains" else _RANGE_CONTAINS_GRADED_SCORE
            )
            return base * partial_year_multiplier
        return 0.0

    @staticmethod
    def _md_in_md_range(
        target: Tuple[int, int],
        lo: Tuple[int, int],
        hi: Tuple[int, int],
    ) -> bool:
        """Check (month, day) containment in a (month, day) range.

        When ``lo <= hi`` the span is the ordinary closed interval. When
        ``lo > hi`` the range wraps the year boundary in m/d space (e.g.
        Dec 20 → Jan 5, common for fiscal/holiday ranges in IDP data), so
        membership is the union of the two ends:
        ``target >= lo`` (late-year tail) or ``target <= hi`` (early-year
        head). Endpoints are inclusive either way.
        """
        if lo <= hi:
            return lo <= target <= hi
        return target >= lo or target <= hi

    def _compare_singles(
        self, a: _ParsedSingle, b: _ParsedSingle
    ) -> float:
        """Score two single dates across the year-presence and resolution axes.

        ``a`` is the ground truth (first :meth:`compare` argument). Two
        independent gates run before any value comparison:

        * month/day resolution (``precision_mode``), and
        * year presence (``allow_partial_year``, via
          :meth:`_partial_year_multiplier`).

        If both gates pass, the fields that *both* sides specify must
        agree; year-bearing day-grain pairs additionally honor
        ``tolerance``.
        """
        # Axis 1 — month/day resolution.
        if not self._resolution_ok(a, b):
            return 0.0

        # Axis 2 — year presence (carries the 0.7 partial-year credit).
        year_multiplier = self._partial_year_multiplier(a.has_year == b.has_year)
        if year_multiplier == 0.0:
            return 0.0

        if not self._single_values_agree(a, b):
            return 0.0

        return year_multiplier

    def _resolution_ok(self, a: _ParsedSingle, b: _ParsedSingle) -> bool:
        """Whether a month/day resolution mismatch is permitted.

        ``a`` is ground truth. Equal resolution is always fine; otherwise
        ``precision_mode`` decides:

        * ``"exact"`` — never (resolutions must match);
        * ``"gt_loose"`` — only if the prediction ``b`` is *finer* than
          the ground truth ``a`` (``b`` may add precision, not drop it);
        * ``"overlap"`` — either side may be coarser.
        """
        if a.md_resolution == b.md_resolution:
            return True
        if self.precision_mode == "exact":
            return False
        if self.precision_mode == "overlap":
            return True
        # gt_loose
        return b.md_resolution >= a.md_resolution

    def _single_values_agree(
        self, a: _ParsedSingle, b: _ParsedSingle
    ) -> bool:
        """Whether two singles agree on every field both sides specify.

        Year-bearing day-grain pairs go through the ``tolerance`` window
        (which spans month/year boundaries); every other pairing is exact
        on the fields present at the common (coarser) grain.
        """
        if a.has_year and b.has_year:
            if a.md_resolution == 2 and b.md_resolution == 2:
                # Both full dates: tolerance-aware comparison.
                a_dt, b_dt = self._align_timezones(a.dt, b.dt)
                if not self._has_subday_tolerance():
                    # Whole-day (or zero) tolerance keeps same-calendar-day
                    # semantics: floor to midnight so intra-day times are
                    # ignored and the window counts whole days.
                    a_dt = self._truncate_day(a_dt)
                    b_dt = self._truncate_day(b_dt)
                # A sub-day tolerance (e.g. 1.5 days = 36h) means the caller
                # cares about real elapsed time, so compare actual
                # timestamps without flooring.
                return abs(a_dt - b_dt) <= self.tolerance
            if a.dt.year != b.dt.year:
                return False
        if a.has_month and b.has_month and a.dt.month != b.dt.month:
            return False
        if a.has_day and b.has_day and a.dt.day != b.dt.day:
            return False
        return True

    def _has_subday_tolerance(self) -> bool:
        """Whether ``tolerance`` carries a sub-day (hours/minutes) component."""
        return self.tolerance.total_seconds() % 86400 != 0

    def _partial_year_multiplier(self, year_match: bool) -> float:
        """Multiplier applied to range scores when year-presence (mis)matches.

        Returns 1.0 when both sides agree on year-presence. Returns
        ``_PARTIAL_YEAR_MULTIPLIER`` (0.7) when ``allow_partial_year=True``
        and they disagree. Returns 0.0 otherwise.
        """
        if year_match:
            return 1.0
        return _PARTIAL_YEAR_MULTIPLIER if self.allow_partial_year else 0.0

    def _jaccard(self, a: _ParsedRange, b: _ParsedRange) -> float:
        """Jaccard overlap between two date ranges, day-level."""
        # Normalize to comparable naive days so endpoints from different
        # ranges (possibly mixed tz-awareness) can be min/max'd together.
        a_lo = self._normalize_day(a.start.dt)
        a_hi = self._normalize_day(a.end.dt)
        b_lo = self._normalize_day(b.start.dt)
        b_hi = self._normalize_day(b.end.dt)

        # Inclusive day count.
        intersect_lo = max(a_lo, b_lo)
        intersect_hi = min(a_hi, b_hi)
        if intersect_hi < intersect_lo:
            return 0.0

        intersect_days = (intersect_hi - intersect_lo).days + 1
        union_lo = min(a_lo, b_lo)
        union_hi = max(a_hi, b_hi)
        union_days = (union_hi - union_lo).days + 1
        return intersect_days / union_days

    @staticmethod
    def _md_equal(dt1: datetime, dt2: datetime) -> bool:
        """Whether two datetimes share a (month, day) — ignores year/time."""
        return (dt1.month, dt1.day) == (dt2.month, dt2.day)

    @classmethod
    def _md_jaccard(cls, a: _ParsedRange, b: _ParsedRange) -> float:
        """Jaccard overlap of two ranges in (month, day) space.

        Used when year-presence differs: the year-less side's year is a
        1900 placeholder, so overlap is measured over the set of
        ``(month, day)`` pairs each range spans rather than absolute days.
        """
        a_days = cls._md_set(a)
        b_days = cls._md_set(b)
        union = a_days | b_days
        if not union:
            return 0.0
        return len(a_days & b_days) / len(union)

    @staticmethod
    def _md_set(rng: _ParsedRange) -> set:
        """The set of (month, day) pairs a range covers, inclusive.

        Walks day by day from start to end. Bounded by a one-year cap so
        a malformed multi-year span can't run away.
        """
        start = rng.start.dt.replace(hour=0, minute=0, second=0, microsecond=0)
        end = rng.end.dt.replace(hour=0, minute=0, second=0, microsecond=0)
        days = (end - start).days
        # Year-less ranges can't wrap (parser enforces start <= end and
        # both default to 1900); cap the walk at a full year defensively.
        days = min(days, 366)
        out = set()
        cur = start
        for _ in range(days + 1):
            out.add((cur.month, cur.day))
            cur += timedelta(days=1)
        return out

    # ------------------------------------------------------------------
    # Parsing
    # ------------------------------------------------------------------

    def _parse(self, value: Any, dayfirst: bool) -> _ParseResult:
        """Parse input into a single date or range, or ``None`` on failure.

        Single-day ranges (``X to X``) are normally collapsed to a
        ``_ParsedSingle`` so they compare consistently with the bare
        single-date form. Under ``range_mode="reject"`` we skip the
        collapse so that the original range shape is preserved and the
        comparison surfaces it as a structural mismatch.
        """
        if isinstance(value, datetime):
            return _ParsedSingle(dt=value, has_year=True)
        if isinstance(value, date):
            return _ParsedSingle(
                dt=datetime(value.year, value.month, value.day), has_year=True
            )

        if not isinstance(value, str):
            value = str(value)

        s = value.strip()
        if not s:
            return None

        # Reject pathologically long input before any parsing. A real date
        # string is well under this; a huge value is malformed and would
        # otherwise cost dateutil a full scan (and could split into two
        # huge range halves). Over-length degrades to 0.0 like any other
        # parse failure.
        if len(s) > _MAX_INPUT_LEN:
            return None

        rng = self._try_parse_range(s, dayfirst=dayfirst)
        if rng is not None:
            # Collapse degenerate single-day ranges to singles, EXCEPT
            # under reject mode where we want the range shape preserved.
            if (
                self.range_mode != "reject"
                and self._dates_equal_day(rng.start.dt, rng.end.dt)
                and rng.start.has_year == rng.end.has_year
            ):
                return rng.start
            return rng

        # A string that carries a range-delimiter signal but didn't parse
        # as a valid range is a malformed/truncated range, not a single
        # date. Falling through to a single parse here would let dateutil
        # silently swallow a dangling dash (``'- 10/24/16'``) and score it
        # as a clean date.
        if self._has_range_delim_signal(s):
            return None

        return self._try_parse_single(s, dayfirst=dayfirst)

    @staticmethod
    def _has_range_delim_signal(s: str) -> bool:
        """Report whether ``s`` looks like it was meant to be a range.

        True when any configured range delimiter occurs in ``s``, or when
        ``s`` begins or ends with a bare dash. A legitimate single date
        keeps its dashes between digits (``2025-01-01``, ``10-24-2016``),
        so a dash at the edge points to truncated range input.
        """
        for delim in _RANGE_DELIMS:
            if delim in s:
                return True
        return s[:1] == "-" or s[-1:] == "-"

    def _try_parse_range(
        self, s: str, dayfirst: bool
    ) -> Optional[_ParsedRange]:
        """Try to read ``s`` as ``<date> <delimiter> <date>``.

        Each configured delimiter is tried in order, splitting at its
        first occurrence. The first delimiter whose two halves both parse
        decides the outcome: a range when start <= end, ``None`` when the
        endpoints are inverted. ``None`` is also returned when no
        delimiter produces two parseable halves.
        """
        for delim in _RANGE_DELIMS:
            head, found, tail = s.partition(delim)
            if not found:
                continue

            start = self._try_parse_single(head.strip(), dayfirst=dayfirst)
            end = self._try_parse_single(tail.strip(), dayfirst=dayfirst)
            if start is not None and end is not None:
                # The endpoints may differ in tz-awareness (one ISO with an
                # offset, one naive); align them so the ordering check
                # cannot raise TypeError.
                earlier, later = self._align_timezones(start.dt, end.dt)
                if earlier > later:
                    return None
                return _ParsedRange(start=start, end=end)
        return None

    def _try_parse_single(
        self, s: str, dayfirst: bool
    ) -> Optional[_ParsedSingle]:
        """Parse one side as a single date (or ``None`` on failure).

        Year/month/day presence is detected by parsing twice with default
        dates that differ in *all three* components: any field the parser
        had to borrow from the default reveals itself by disagreeing
        between the two parses. This is what lets reduced-resolution
        inputs (``'Jan 2024'``, ``'2024'``) be told apart from full dates
        rather than silently fabricating the missing fields.
        """
        if not s:
            return None

        # Year-first layouts (ISO and ``YYYY/MM/DD``) fix month-then-day
        # order, so the day-first interpretation would corrupt them. Pin
        # those to month-first regardless of the requested ``dayfirst``.
        if _YEAR_FIRST_RE.match(s):
            dayfirst = False

        try:
            dt_lo = _dateutil_parser.parse(
                s, default=datetime(1900, 1, 1), dayfirst=dayfirst
            )
            # Default differs in year, month, AND day so each component's
            # presence can be probed independently.
            dt_hi = _dateutil_parser.parse(
                s, default=datetime(2099, 6, 15), dayfirst=dayfirst
            )
        except (ValueError, OverflowError, TypeError):
            return None

        has_year = dt_lo.year == dt_hi.year
        has_month = dt_lo.month == dt_hi.month
        has_day = dt_lo.day == dt_hi.day

        # Reject time-only inputs ('12:30 PM', '10/45AM' etc.): no date
        # component at all was specified, so everything came from the
        # default and only the time survives.
        if not (has_year or has_month or has_day):
            return None

        return _ParsedSingle(
            dt=dt_lo, has_year=has_year, has_month=has_month, has_day=has_day
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _align_timezones(
        dt1: datetime, dt2: datetime
    ) -> tuple[datetime, datetime]:
        """Make two datetimes tz-comparable for subtraction."""
        aware1 = dt1.tzinfo is not None
        aware2 = dt2.tzinfo is not None
        if aware1 and aware2:
            return dt1.astimezone(timezone.utc), dt2.astimezone(timezone.utc)
        if aware1 and not aware2:
            return dt1, dt2.replace(tzinfo=dt1.tzinfo)
        if aware2 and not aware1:
            return dt1.replace(tzinfo=dt2.tzinfo), dt2
        return dt1, dt2

    @staticmethod
    def _truncate_day(dt: datetime) -> datetime:
        return dt.replace(hour=0, minute=0, second=0, microsecond=0)

    @staticmethod
    def _normalize_day(dt: datetime) -> datetime:
        """Collapse to a tz-naive midnight so any two dates are comparable.

        Aware datetimes are converted to UTC first; the result is always
        naive, so values that started with differing tz-awareness can be
        ordered against each other without raising ``TypeError``.
        """
        if dt.tzinfo is not None:
            dt = dt.astimezone(timezone.utc).replace(tzinfo=None)
        return dt.replace(hour=0, minute=0, second=0, microsecond=0)

    @classmethod
    def _dates_equal_day(cls, dt1: datetime, dt2: datetime) -> bool:
        a, b = cls._align_timezones(dt1, dt2)
        return cls._truncate_day(a) == cls._truncate_day(b)

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}("
            f"threshold={self.threshold}, "
            f"tolerance={self.tolerance!r}, "
            f"dayfirst={self.dayfirst!r}, "
            f"allow_partial_year={self.allow_partial_year}, "
            f"range_mode={self.range_mode!r}, "
            f"precision_mode={self.precision_mode!r})"
        )

`config` `property`

Round-trippable config for JSON-schema export.

Only non-default values are emitted, and an all-default instance returns None — matching NumericComparator.config and keeping a redundant x-aws-stickler-comparator-config block out of every exported schema (the exporter keys off truthiness).

Tolerance is exported as days (an int when the timedelta is a whole number of days, otherwise a float) so it can survive a JSON round-trip.

`compare(str1, str2)`

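Score two date values using the tiered comparison rules described in docs/docs/Guides/Comparators/date-comparator.md. Two `None` inputs score 1.0; exactly one `None` input scores 0.0.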

Source code in stickler/comparators/date.py

def compare(self, str1: Any, str2: Any) -> float:
    """Score two date values.

    Args:
        str1: Ground-truth value.
        str2: Predicted value.

    Returns:
        ``1.0`` when both inputs are ``None``, ``0.0`` when exactly one
        is ``None``; otherwise the score produced by
        ``_compare_with_dayfirst``. Any ``TypeError`` or
        ``OverflowError`` raised while scoring degrades to ``0.0``.
    """
    if str1 is None and str2 is None:
        return 1.0
    if str1 is None or str2 is None:
        return 0.0

    # Resolve dayfirst pairwise. ``None`` means "try both
    # interpretations and take the best score" — that way a string
    # whose layout is genuinely ambiguous in isolation can still
    # match if one consistent interpretation lines up.
    #
    # A malformed value must never crash an evaluation run, so any
    # datetime edge the helpers didn't anticipate degrades to 0.0
    # like every other parse failure: TypeError covers mixed
    # tz-awareness, OverflowError covers arithmetic or timezone
    # conversion that leaves the representable datetime range (and a
    # value whose ``str()`` overflows). The range/single paths align
    # timezones inline; this is a backstop, not the primary defense.
    try:
        if self.dayfirst is not None:
            return self._compare_with_dayfirst(str1, str2, self.dayfirst)

        return max(
            self._compare_with_dayfirst(str1, str2, False),
            self._compare_with_dayfirst(str1, str2, True),
        )
    except (TypeError, OverflowError):
        return 0.0

`stickler.comparators.FuzzyComparator`

Bases: BaseComparator

Comparator for fuzzy string matching.

This comparator uses the rapidfuzz library to calculate similarity between strings using advanced Levenshtein distance calculations. It provides better fuzzy matching than basic Levenshtein for many use cases.

If rapidfuzz is not available, this will raise an ImportError when instantiated.

Source code in stickler/comparators/fuzzy.py

class FuzzyComparator(BaseComparator):
    """String comparator backed by rapidfuzz similarity ratios.

    Both inputs are converted to strings, optionally stripped and lowercased,
    and then scored by one of the ``fuzz`` ratio functions. The score is
    divided by 100 so the result fits the 0.0-1.0 comparator contract.

    Instantiating this class raises ImportError when rapidfuzz is missing.
    """

    def __init__(
        self, method: str = "ratio", normalize: bool = True, threshold: float = 0.7
    ):
        """Set up the comparator and pick the scoring function.

        Args:
            method: Name of the fuzzy scorer. Recognised names are
                "ratio", "partial_ratio", "token_sort_ratio" and
                "token_set_ratio". Any other name silently falls back to
                ``fuzz.ratio`` (the name is still reported by ``name`` and
                ``config``).
            normalize: When true, inputs are stripped and lowercased before
                scoring.
            threshold: Similarity threshold passed to the base class
                (default 0.7).

        Raises:
            ImportError: If the rapidfuzz library is not available.
        """
        super().__init__(threshold=threshold)

        # Checked before touching ``fuzz`` so a missing dependency surfaces
        # as a helpful ImportError.
        if not RAPIDFUZZ_AVAILABLE:
            raise ImportError(
                "The rapidfuzz library is required for FuzzyComparator. "
                "Install it with: pip install rapidfuzz"
            )

        self._method = method
        self._normalize = normalize

        # Dispatch table from method name to scorer.
        scorers = {
            "ratio": fuzz.ratio,
            "partial_ratio": fuzz.partial_ratio,
            "token_sort_ratio": fuzz.token_sort_ratio,
            "token_set_ratio": fuzz.token_set_ratio,
        }
        self._fuzzy_func = scorers.get(method, fuzz.ratio)

    @property
    def name(self) -> str:
        """Comparator name, formed as ``fuzzy_<method>``."""
        return f"fuzzy_{self._method}"

    @property
    def config(self) -> Optional[Dict[str, Any]]:
        """Configuration dict holding the method name and normalize flag."""
        return dict(method=self._method, normalize=self._normalize)

    def compare(self, value1: Any, value2: Any) -> float:
        """Score the fuzzy similarity of two values.

        Args:
            value1: First value; converted with ``str`` before scoring.
            value2: Second value; converted with ``str`` before scoring.

        Returns:
            1.0 when both values are None or both strings are empty after
            optional normalization, 0.0 when exactly one value is None,
            otherwise the selected scorer's result divided by 100. If the
            scorer raises, the result is 1.0 for equal strings and 0.0
            otherwise.
        """
        if value1 is None or value2 is None:
            # Both missing counts as a match; one missing never matches.
            return 1.0 if value1 is value2 else 0.0

        left, right = str(value1), str(value2)
        if self._normalize:
            left, right = left.strip().lower(), right.strip().lower()

        # Two empty strings are treated as identical without calling the scorer.
        if not left and not right:
            return 1.0

        try:
            return self._fuzzy_func(left, right) / 100.0
        except Exception:
            # Scorer failure: degrade to exact string equality.
            return 1.0 if left == right else 0.0

`config` `property`

Return configuration parameters.

`name` `property`

Return the name of the comparator.

`__init__(method='ratio', normalize=True, threshold=0.7)`

Initialize the fuzzy comparator.

Parameters:

Name	Type	Description	Default
`method`	`str`	The fuzzy matching method to use. Options: - "ratio": Standard Levenshtein distance ratio - "partial_ratio": Partial string matching - "token_sort_ratio": Token-based matching with sorting - "token_set_ratio": Token-based matching with set operations	`'ratio'`
`normalize`	`bool`	Whether to normalize input strings before comparison (strip whitespace, lowercase)	`True`
`threshold`	`float`	Similarity threshold (default 0.7)	`0.7`

Raises:

Type	Description
`ImportError`	If rapidfuzz library is not available

Source code in stickler/comparators/fuzzy.py

def __init__(
    self, method: str = "ratio", normalize: bool = True, threshold: float = 0.7
):
    """Set up the comparator and pick the scoring function.

    Args:
        method: Name of the fuzzy scorer. Recognised names are "ratio",
            "partial_ratio", "token_sort_ratio" and "token_set_ratio". Any
            other name silently falls back to ``fuzz.ratio``.
        normalize: When true, inputs are stripped and lowercased before
            scoring.
        threshold: Similarity threshold passed to the base class
            (default 0.7).

    Raises:
        ImportError: If the rapidfuzz library is not available.
    """
    super().__init__(threshold=threshold)

    # Checked before touching ``fuzz`` so a missing dependency surfaces as a
    # helpful ImportError.
    if not RAPIDFUZZ_AVAILABLE:
        raise ImportError(
            "The rapidfuzz library is required for FuzzyComparator. "
            "Install it with: pip install rapidfuzz"
        )

    self._method = method
    self._normalize = normalize

    # Dispatch table from method name to scorer.
    scorers = {
        "ratio": fuzz.ratio,
        "partial_ratio": fuzz.partial_ratio,
        "token_sort_ratio": fuzz.token_sort_ratio,
        "token_set_ratio": fuzz.token_set_ratio,
    }
    self._fuzzy_func = scorers.get(method, fuzz.ratio)

`compare(value1, value2)`

Compare two strings using fuzzy matching.

Parameters:

Name	Type	Description	Default
`value1`	`Any`	First string or value	required
`value2`	`Any`	Second string or value	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0

Source code in stickler/comparators/fuzzy.py

def compare(self, value1: Any, value2: Any) -> float:
    """Score the fuzzy similarity of two values.

    Args:
        value1: First value; converted with ``str`` before scoring.
        value2: Second value; converted with ``str`` before scoring.

    Returns:
        1.0 when both values are None or both strings are empty after
        optional normalization, 0.0 when exactly one value is None,
        otherwise the selected scorer's result divided by 100. If the scorer
        raises, the result is 1.0 for equal strings and 0.0 otherwise.
    """
    if value1 is None or value2 is None:
        # Both missing counts as a match; one missing never matches.
        return 1.0 if value1 is value2 else 0.0

    left, right = str(value1), str(value2)
    if self._normalize:
        left, right = left.strip().lower(), right.strip().lower()

    # Two empty strings are treated as identical without calling the scorer.
    if not left and not right:
        return 1.0

    try:
        return self._fuzzy_func(left, right) / 100.0
    except Exception:
        # Scorer failure: degrade to exact string equality.
        return 1.0 if left == right else 0.0

`stickler.comparators.BERTComparator`

Bases: BaseComparator

Comparator that uses BERT embeddings for semantic similarity.

This comparator uses the BERTScore metric to calculate semantic similarity between strings, returning the f1 score as the similarity measure.

Example

comparator = BERTComparator(threshold=0.8)

# Returns similarity score based on semantic similarity
score = comparator.compare("The cat sat on the mat", "A feline was sitting on a rug")

Source code in stickler/comparators/bert.py

class BERTComparator(BaseComparator):
    """Semantic comparator built on the BERTScore metric.

    Two strings are cleaned of punctuation and whitespace and scored with the
    module-level BERTScore ``model``; the f1 component is returned as the
    similarity.

    Example:
        ```python
        comparator = BERTComparator(threshold=0.8)

        # Returns similarity score based on semantic similarity
        score = comparator.compare("The cat sat on the mat", "A feline was sitting on a rug")
        ```
    """

    def __init__(self, threshold: float = 0.7):
        """Create the comparator.

        Args:
            threshold: Similarity threshold (0.0-1.0).

        Raises:
            ImportError: If the module-level BERTScore ``model`` is None.
        """
        super().__init__(threshold=threshold)
        if model is not None:
            return
        raise ImportError(
            "BERTScore model could not be loaded. Please install 'evaluate' package."
        )

    def compare(self, str1: Any, str2: Any) -> float:
        """Score the semantic similarity of two values with BERTScore.

        Args:
            str1: First value; converted with ``str`` before cleaning.
            str2: Second value; converted with ``str`` before cleaning.

        Returns:
            0.0 when either value is None (including when both are None).
            When either cleaned string is empty, 1.0 if the cleaned strings
            are equal and 0.0 otherwise. Otherwise the BERTScore f1 value;
            if scoring raises, the error is printed and the result falls
            back to equality of the cleaned strings.
        """
        if str1 is None or str2 is None:
            return 0.0

        text_1, text_2 = str(str1), str(str2)

        # Punctuation and whitespace are removed before scoring.
        cleaned_1 = strip_punctuation_space(text_1)
        cleaned_2 = strip_punctuation_space(text_2)

        # An empty side is not sent to the model; compare the cleaned strings
        # directly instead.
        if not (cleaned_1 and cleaned_2):
            return 1.0 if cleaned_1 == cleaned_2 else 0.0

        try:
            scores = model.compute(
                predictions=[cleaned_1], references=[cleaned_2], lang="en"
            )
            return scores["f1"][0]
        except Exception as e:
            # Scoring failed: report it and degrade to exact equality.
            print(f"BERT comparison error: {str(e)}")
            return 1.0 if cleaned_1 == cleaned_2 else 0.0

`__init__(threshold=0.7)`

Initialize the BERTComparator.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Similarity threshold (0.0-1.0)	`0.7`

Source code in stickler/comparators/bert.py

def __init__(self, threshold: float = 0.7):
    """Create the comparator.

    Args:
        threshold: Similarity threshold (0.0-1.0).

    Raises:
        ImportError: If the module-level BERTScore ``model`` is None.
    """
    super().__init__(threshold=threshold)
    if model is not None:
        return
    raise ImportError(
        "BERTScore model could not be loaded. Please install 'evaluate' package."
    )

`compare(str1, str2)`

Compare two strings using BERT semantic similarity.

Parameters:

Name	Type	Description	Default
`str1`	`Any`	First string	required
`str2`	`Any`	Second string	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0 based on BERTScore f1

Source code in stickler/comparators/bert.py

def compare(self, str1: Any, str2: Any) -> float:
    """Score the semantic similarity of two values with BERTScore.

    Args:
        str1: First value; converted with ``str`` before cleaning.
        str2: Second value; converted with ``str`` before cleaning.

    Returns:
        0.0 when either value is None (including when both are None). When
        either cleaned string is empty, 1.0 if the cleaned strings are equal
        and 0.0 otherwise. Otherwise the BERTScore f1 value; if scoring
        raises, the error is printed and the result falls back to equality
        of the cleaned strings.
    """
    if str1 is None or str2 is None:
        return 0.0

    text_1, text_2 = str(str1), str(str2)

    # Punctuation and whitespace are removed before scoring.
    cleaned_1 = strip_punctuation_space(text_1)
    cleaned_2 = strip_punctuation_space(text_2)

    # An empty side is not sent to the model; compare the cleaned strings
    # directly instead.
    if not (cleaned_1 and cleaned_2):
        return 1.0 if cleaned_1 == cleaned_2 else 0.0

    try:
        scores = model.compute(
            predictions=[cleaned_1], references=[cleaned_2], lang="en"
        )
        return scores["f1"][0]
    except Exception as e:
        # Scoring failed: report it and degrade to exact equality.
        print(f"BERT comparison error: {str(e)}")
        return 1.0 if cleaned_1 == cleaned_2 else 0.0

`stickler.comparators.SemanticComparator`

Bases: BaseComparator

Comparator that uses embeddings for semantic similarity.

This comparator uses embeddings from a model (default: Titan) to calculate semantic similarity between strings.

Attributes:

Name	Type	Description
`SIMILARITY_FUNCTIONS`		Dictionary of similarity functions
`bc`		BedrockClient instance
`model_id`		Model ID to use for embeddings
`embedding_function`		Function to generate embeddings
`sim_function`		Name of the similarity function to use
`similarity_function`		The actual similarity function

Source code in stickler/comparators/semantic.py

class SemanticComparator(BaseComparator):
    """Comparator that scores values by embedding similarity.

    Each value is turned into an embedding (by default through
    ``generate_bedrock_embedding`` with a Titan model ID) and the two
    embeddings are scored with the configured similarity function.

    Attributes:
        SIMILARITY_FUNCTIONS: Mapping from similarity name to callable.
        model_id: Model ID used for the default embedding function.
        embedding_function: Callable that produces an embedding for a value.
        sim_function: Name of the selected similarity function.
        similarity_function: The callable looked up from
            ``SIMILARITY_FUNCTIONS``.
    """

    SIMILARITY_FUNCTIONS = {
        "cosine_similarity": lambda x, y: 1 - spatial.distance.cosine(x, y)
    }

    def __init__(
        self,
        model_id: str = "amazon.titan-embed-text-v2:0",
        sim_function: str = "cosine_similarity",
        embedding_function: Optional[Callable] = None,
        threshold: float = 0.7,
    ):
        """Configure the embedding source and similarity function.

        Args:
            model_id: Model ID bound into the default embedding function.
            sim_function: Key into ``SIMILARITY_FUNCTIONS``.
            embedding_function: Optional callable used instead of the
                default Bedrock embedding function.
            threshold: Similarity threshold (0.0-1.0).

        Raises:
            KeyError: If ``sim_function`` is not in ``SIMILARITY_FUNCTIONS``.
        """
        super().__init__(threshold=threshold)

        self.model_id = model_id
        # A caller-supplied embedder wins; otherwise bind the model ID into
        # the Bedrock embedding helper.
        self.embedding_function = (
            partial(generate_bedrock_embedding, model_id=model_id)
            if embedding_function is None
            else embedding_function
        )

        self.sim_function = sim_function
        self.similarity_function = self.SIMILARITY_FUNCTIONS[sim_function]

    def compare(self, str1: str, str2: str) -> float:
        """Score two values by the similarity of their embeddings.

        When embedding or scoring fails, the failure is logged together with
        the model ID, embedding function name, input lengths, similarity
        function name and exception type, and the result falls back to raw
        equality of the inputs.

        Args:
            str1: First value.
            str2: Second value.

        Returns:
            0.0 when either value is None; otherwise the similarity function
            result, or 1.0/0.0 from the equality fallback.
        """
        if str1 is None or str2 is None:
            return 0.0

        try:
            first_embedding = self.embedding_function(str1)
            second_embedding = self.embedding_function(str2)
            return self.similarity_function(first_embedding, second_embedding)
        except Exception:
            # Structured context attached to the log record for diagnosis.
            failure_context = {
                "embedding_function": _embedding_function_name(
                    self.embedding_function
                ),
                "model_id": getattr(self, "model_id", None),
                "input_1_length": _input_length(str1),
                "input_2_length": _input_length(str2),
                "similarity_function": self.sim_function,
                "exception_type": type(sys.exc_info()[1]).__name__,
            }
            logger.exception(
                "Semantic embedding comparison failed; falling back to string equality",
                extra=failure_context,
            )
            # Degrade to exact equality so the evaluation can continue.
            return 1.0 if str1 == str2 else 0.0

`__init__(model_id='amazon.titan-embed-text-v2:0', sim_function='cosine_similarity', embedding_function=None, threshold=0.7)`

Initialize the SemanticComparator.

Parameters:

Name	Type	Description	Default
`model_id`	`str`	Model ID to use for embeddings	`'amazon.titan-embed-text-v2:0'`
`sim_function`	`str`	Name of the similarity function to use	`'cosine_similarity'`
`embedding_function`	`Optional[Callable]`	Optional custom embedding function	`None`
`threshold`	`float`	Similarity threshold (0.0-1.0)	`0.7`

Raises:

Type	Description
`ImportError`	If BedrockClient is not available and no embedding_function is provided

Source code in stickler/comparators/semantic.py

def __init__(
    self,
    model_id: str = "amazon.titan-embed-text-v2:0",
    sim_function: str = "cosine_similarity",
    embedding_function: Optional[Callable] = None,
    threshold: float = 0.7,
):
    """Configure the embedding source and similarity function.

    Args:
        model_id: Model ID bound into the default embedding function.
        sim_function: Key into ``SIMILARITY_FUNCTIONS``.
        embedding_function: Optional callable used instead of the default
            Bedrock embedding function.
        threshold: Similarity threshold (0.0-1.0).

    Raises:
        KeyError: If ``sim_function`` is not in ``SIMILARITY_FUNCTIONS``.
    """
    super().__init__(threshold=threshold)

    self.model_id = model_id
    # A caller-supplied embedder wins; otherwise bind the model ID into the
    # Bedrock embedding helper.
    self.embedding_function = (
        partial(generate_bedrock_embedding, model_id=model_id)
        if embedding_function is None
        else embedding_function
    )

    self.sim_function = sim_function
    self.similarity_function = self.SIMILARITY_FUNCTIONS[sim_function]

`compare(str1, str2)`

Compare two values using semantic similarity.

If embedding generation fails, this logs the model ID, embedding function, input lengths, similarity function, and exception type before falling back to raw equality.

Parameters:

Name	Type	Description	Default
`str1`	`str`	First value	required
`str2`	`str`	Second value	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0

Source code in stickler/comparators/semantic.py

def compare(self, str1: str, str2: str) -> float:
    """Score two values by the similarity of their embeddings.

    When embedding or scoring fails, the failure is logged together with the
    model ID, embedding function name, input lengths, similarity function
    name and exception type, and the result falls back to raw equality of
    the inputs.

    Args:
        str1: First value.
        str2: Second value.

    Returns:
        0.0 when either value is None; otherwise the similarity function
        result, or 1.0/0.0 from the equality fallback.
    """
    if str1 is None or str2 is None:
        return 0.0

    try:
        first_embedding = self.embedding_function(str1)
        second_embedding = self.embedding_function(str2)
        return self.similarity_function(first_embedding, second_embedding)
    except Exception:
        # Structured context attached to the log record for diagnosis.
        failure_context = {
            "embedding_function": _embedding_function_name(
                self.embedding_function
            ),
            "model_id": getattr(self, "model_id", None),
            "input_1_length": _input_length(str1),
            "input_2_length": _input_length(str2),
            "similarity_function": self.sim_function,
            "exception_type": type(sys.exc_info()[1]).__name__,
        }
        logger.exception(
            "Semantic embedding comparison failed; falling back to string equality",
            extra=failure_context,
        )
        # Degrade to exact equality so the evaluation can continue.
        return 1.0 if str1 == str2 else 0.0

`stickler.comparators.LLMComparator`

Bases: BaseComparator

Large Language Model-based semantic comparator.

This comparator uses LLMs to perform intelligent semantic comparisons that go beyond simple string matching. It can understand context, handle abbreviations, recognize synonyms, and apply domain-specific comparison logic through custom evaluation guidelines.

The comparator returns binary similarity scores (0.0 or 1.0) based on whether the LLM determines the values are semantically equivalent. It handles edge cases like None values and provides detailed comparison information for debugging.

Attributes:

Name	Type	Description
`model`	`Union[Model, str]`	The LLM model identifier or Model instance.
`eval_guidelines`	`str`	Custom guidelines for comparison logic.
`system_prompt`	`str`	The system prompt used to instruct the LLM.
`prompt_template`	`Template`	Jinja2 template for formatting comparison prompts.
`agent`	`Agent`	The strands Agent instance for LLM interactions.
`threshold`	`float`	Inherited from BaseComparator, used for binary decisions.

Note

This comparator requires AWS Bedrock access and proper authentication. API calls incur costs and latency, so consider caching for repeated comparisons.

Source code in stickler/comparators/llm.py

class LLMComparator(BaseComparator):
    """Large Language Model-based semantic comparator.

    This comparator asks an LLM whether two values are equivalent, which lets
    it go beyond string matching (abbreviations, synonyms, domain-specific
    rules supplied through evaluation guidelines).

    The comparator returns binary similarity scores (0.0 or 1.0) based on
    whether the LLM response contains the word "true". None values are
    short-circuited in ``compare`` without calling the LLM.

    Attributes:
        model (Union[Model, str]): The LLM model identifier or Model instance.
        eval_guidelines (str, optional): HTML-escaped custom guidelines for
            the comparison, or None.
        system_prompt (str): The system prompt used to instruct the LLM.
        prompt_template (Template): Jinja2 template for formatting comparison prompts.
        agent (Agent): The strands Agent instance for LLM interactions.
        threshold (float): Inherited from BaseComparator, used for binary decisions.

    Note:
        This comparator requires AWS Bedrock access and proper authentication.
        API calls incur costs and latency, so consider caching for repeated comparisons.
    """

    def __init__(
        self,
        model: Union[Model, str] = None,
        eval_guidelines: str = None,
    ):
        """Initialize the LLM comparator.

        Args:
            model: The LLM model to use for comparisons. Can be a model identifier
                string (e.g., "us.anthropic.claude-3-haiku-20240307-v1:0") or a
                strands Model instance. Required: there is no default model,
                and passing None raises ValueError.
            eval_guidelines: Optional custom guidelines to include in the comparison
                prompt. These guidelines help the LLM understand domain-specific
                comparison rules (e.g., "Consider abbreviations equivalent").
                The text is HTML-escaped before being stored.

        Raises:
            ImportError: If strands-agents is not installed.
            ValueError: If the model parameter is not provided.

        Example:
            >>> # Basic initialization
            >>> comparator = LLMComparator(
            ...     model="us.anthropic.claude-3-haiku-20240307-v1:0"
            ... )

            >>> # With custom model and guidelines
            >>> comparator = LLMComparator(
            ...     model="us.amazon.nova-lite-v1:0",
            ...     eval_guidelines="Consider street abbreviations equivalent"
            ... )
        """
        super().__init__()

        # Check if strands is available
        if not STRANDS_AVAILABLE:
            raise ImportError(
                "LLMComparator requires the 'strands-agents' package. "
                "Install it with: pip install stickler-eval[llm]"
            )

        if model is None:
            raise ValueError("Model must be provided for LLMComparator.")
        self.model = model
        self.system_prompt = self._default_system_prompt()
        self.prompt_template = self._default_prompt_template()
        # Guidelines are escaped once here so they cannot inject markup into
        # the <guidelines> section of the prompt.
        if eval_guidelines is not None:
            self.eval_guidelines = html.escape(eval_guidelines)
        else:
            self.eval_guidelines = eval_guidelines

        # Initialize Agent
        self.agent = Agent(
            model=self.model, system_prompt=self.system_prompt, callback_handler=None
        )

    def _default_system_prompt(self) -> str:
        """Generate the default system prompt for the LLM.

        Returns:
            str: System prompt instructing the LLM to perform binary comparisons.
        """
        return "You are a helpful assistant that compares two values and determines if they are equivalent. Only return one word: 'true' or 'false'."

    def _default_prompt_template(self) -> Template:
        """Generate the default Jinja2 template for comparison prompts.

        Returns:
            Template: Jinja2 template that formats comparison prompts with values
                and optional evaluation guidelines.
        """
        prompt_template = """
            Compare these two values and determine if they are equivalent:

            Value 1: {{ value1 }}
            Value 2: {{ value2 }}

            {% if eval_guidelines is not none %}
            <guidelines>
            Here are some guidelines to follow for the comparison:
            {{ eval_guidelines }}
            </guidelines>
            {% endif %}

            If the values are equivalent, return 'true'. If not, return 'false'. Only return one word: 'true' or 'false'.
            """

        template = Template(prompt_template)
        return template

    def _render_prompt(self, value1: Any, value2: Any) -> str:
        """Render the comparison prompt for two values.

        Both values are stringified and HTML-escaped before being placed in
        the template.
        """
        return self.prompt_template.render(
            value1=html.escape(str(value1)),
            value2=html.escape(str(value2)),
            eval_guidelines=self.eval_guidelines,
        )

    def _parse_response(self, response: str) -> float:
        """Convert a raw LLM response into a binary score.

        Returns 1.0 when the lowercased, stripped response contains "true",
        and 0.0 otherwise. The substring check is deliberately lenient so
        answers such as "True." are accepted.
        """
        return 1.0 if "true" in response.strip().lower() else 0.0

    def _invoke_agent(self, prompt: str) -> str:
        """Invoke the LLM agent with a formatted prompt.

        Args:
            prompt: The formatted prompt string to send to the LLM.

        Returns:
            str: The text response from the LLM.

        Raises:
            Exception: If the agent call fails or response format is unexpected.
        """
        result = self.agent(prompt)
        return result.message["content"][0]["text"]

    def compare(self, value1: Any, value2: Any) -> float:
        """Compare two values using LLM-based semantic analysis.

        This method converts both values to strings and asks the configured
        LLM whether they are equivalent, taking any evaluation guidelines
        into account.

        Args:
            value1: First value to compare. Can be any type that converts to string.
            value2: Second value to compare. Can be any type that converts to string.

        Returns:
            float: Binary similarity score:
                - 1.0 if the LLM response contains "true"
                - 0.0 otherwise

        Raises:
            NoCredentialsError: If AWS credentials are not found. A message
                is printed and the exception is re-raised.
            Exception: Any other failure during the LLM call is printed and
                re-raised; errors are not converted to a 0.0 score.

        Note:
            - None values: Returns 1.0 if both are None, 0.0 if only one is
              None; the LLM is not called in either case
            - Cost consideration: Each call incurs API costs and latency

        Example:
            >>> comparator = LLMComparator(
            ...     model="us.anthropic.claude-3-haiku-20240307-v1:0"
            ... )
            >>> comparator.compare("St. John's Street", "Saint John's St")
            1.0
            >>> comparator.compare("apple", "orange")
            0.0
            >>> comparator.compare(None, None)
            1.0
        """
        # Handle None values
        if value1 is None and value2 is None:
            return 1.0
        elif value1 is None or value2 is None:
            return 0.0

        formatted_prompt = self._render_prompt(value1, value2)

        try:
            response = self._invoke_agent(formatted_prompt)
            return self._parse_response(response)

        except NoCredentialsError:
            print("Error: AWS credentials not found.")
            raise

        except Exception as e:
            print(f"Error during LLM call: {e}")
            raise

    def get_comparison_details(self, value1: Any, value2: Any) -> Dict[str, Any]:
        """Get detailed information about a comparison operation.

        The prompt is rendered and sent to the LLM exactly once; the reported
        ``comparison_result`` is derived from that same response, so it always
        agrees with ``llm_response`` and no second API call is made. When
        either value is None the result follows the same rule as ``compare``
        (1.0 if both are None, otherwise 0.0).

        Args:
            value1: First value to compare. Can be any type that converts to string.
            value2: Second value to compare. Can be any type that converts to string.

        Returns:
            Dict[str, Any]: Dictionary containing comparison details:
                - 'prompt' (str): The formatted prompt sent to the LLM
                - 'llm_response' (str): Raw response from the LLM
                - 'model_id' (Union[Model, str]): The model used (string ID or Model instance)
                - 'comparison_result' (float): Final similarity score (0.0 or 1.0)

                On error:
                - 'error' (str): Error message describing what went wrong
                - 'comparison_result' (bool): False to indicate failure

        Example:
            >>> comparator = LLMComparator(
            ...     model="us.anthropic.claude-3-haiku-20240307-v1:0",
            ...     eval_guidelines="Consider abbreviations",
            ... )
            >>> details = comparator.get_comparison_details("St. John", "Saint John")
            >>> print(details['llm_response'])
            'true'
            >>> print(details['comparison_result'])
            1.0
            >>> print('guidelines' in details['prompt'])
            True
        """
        formatted_prompt = self._render_prompt(value1, value2)

        try:
            response = self._invoke_agent(formatted_prompt)
            if value1 is None or value2 is None:
                # Mirror compare(): None values are decided without
                # consulting the LLM answer.
                result = 1.0 if value1 is value2 else 0.0
            else:
                result = self._parse_response(response)
            return {
                "prompt": formatted_prompt,
                "llm_response": response,
                "model_id": self.model,
                "comparison_result": result,
            }
        except Exception as e:
            return {"error": str(e), "comparison_result": False}

`__init__(model=None, eval_guidelines=None)`

Initialize the LLM comparator.

Parameters:

Name	Type	Description	Default
`model`	`Union[Model, str]`	The LLM model to use for comparisons. Can be a model identifier string (e.g., "us.anthropic.claude-3-haiku-20240307-v1:0") or a strands Model instance. Required: there is no default model, and passing None raises ValueError.	`None`
`eval_guidelines`	`str`	Optional custom guidelines to include in the comparison prompt. These guidelines help the LLM understand domain-specific comparison rules (e.g., "Consider abbreviations equivalent").	`None`

Raises:

Type	Description
`ImportError`	If strands-agents is not installed.
`ValueError`	If the model parameter is not provided.

Example

Basic initialization

comparator = LLMComparator(model="us.anthropic.claude-3-haiku-20240307-v1:0")

With custom model and guidelines

comparator = LLMComparator(model="us.amazon.nova-lite-v1:0", eval_guidelines="Consider street abbreviations equivalent")

Source code in stickler/comparators/llm.py

def __init__(
    self,
    model: Union[Model, str] = None,
    eval_guidelines: str = None,
):
    """Set up the LLM comparator and its agent.

    Args:
        model: Model identifier string (e.g.,
            "us.anthropic.claude-3-haiku-20240307-v1:0") or a strands Model
            instance. Required: passing None raises ValueError.
        eval_guidelines: Optional domain-specific comparison rules (e.g.,
            "Consider abbreviations equivalent"). The text is HTML-escaped
            before being stored.

    Raises:
        ImportError: If strands-agents is not installed.
        ValueError: If the model parameter is not provided.

    Example:
        >>> comparator = LLMComparator(
        ...     model="us.amazon.nova-lite-v1:0",
        ...     eval_guidelines="Consider street abbreviations equivalent"
        ... )
    """
    super().__init__()

    # The optional dependency is verified before anything uses it.
    if not STRANDS_AVAILABLE:
        raise ImportError(
            "LLMComparator requires the 'strands-agents' package. "
            "Install it with: pip install stickler-eval[llm]"
        )

    if model is None:
        raise ValueError("Model must be provided for LLMComparator.")

    self.model = model
    self.system_prompt = self._default_system_prompt()
    self.prompt_template = self._default_prompt_template()
    # Escape guidelines once at construction; None is kept as None so the
    # template can omit the guidelines section.
    self.eval_guidelines = (
        None if eval_guidelines is None else html.escape(eval_guidelines)
    )

    # The agent is created without a callback handler.
    self.agent = Agent(
        model=self.model, system_prompt=self.system_prompt, callback_handler=None
    )

`compare(value1, value2)`

Compare two values using LLM-based semantic analysis.

This method converts both values to strings and uses the configured LLM to determine if they are semantically equivalent. The comparison considers context, abbreviations, synonyms, and any provided evaluation guidelines.

Parameters:

Name	Type	Description	Default
`value1`	`Any`	First value to compare. Can be any type that converts to string.	required
`value2`	`Any`	Second value to compare. Can be any type that converts to string.	required

Returns:

Name	Type	Description
`float`	`float`	Binary similarity score: - 1.0 if the LLM determines the values are equivalent - 0.0 if the LLM determines the values are not equivalent. Exceptions raised during the LLM call are re-raised rather than converted to a score.

Note

None values: Returns 1.0 if both are None, 0.0 if only one is None
Error handling: Exceptions raised during LLM calls are printed and re-raised; no score is returned
Cost consideration: Each call incurs API costs and latency

Example

comparator = LLMComparator(model="us.anthropic.claude-3-haiku-20240307-v1:0")
comparator.compare("St. John's Street", "Saint John's St")  # 1.0
comparator.compare("apple", "orange")  # 0.0
comparator.compare(None, None)  # 1.0

Source code in stickler/comparators/llm.py

def compare(self, value1: Any, value2: Any) -> float:
    """Judge whether two values are semantically equivalent using the LLM.

    Both values are stringified and HTML-escaped, rendered into the prompt
    template together with the configured evaluation guidelines, and passed
    to ``self._invoke_agent``. The reply counts as a match when it contains
    the substring "true" (case-insensitive, surrounding whitespace ignored).

    Args:
        value1: First value to compare; converted with ``str()``.
        value2: Second value to compare; converted with ``str()``.

    Returns:
        float: ``1.0`` when the reply contains "true", otherwise ``0.0``.
        When either value is ``None`` no LLM call is made: the result is
        ``1.0`` if both are ``None`` and ``0.0`` if only one is.

    Raises:
        NoCredentialsError: Re-raised, after printing a message, when AWS
            credentials are missing.
        Exception: Any other failure while invoking the LLM or parsing its
            reply is printed and re-raised; errors are never mapped to 0.0.

    Note:
        Each non-None comparison incurs an LLM call (API cost and latency).

    Example:
        >>> comparator = LLMComparator(model=model)
        >>> comparator.compare("St. John's Street", "Saint John's St")
        1.0
        >>> comparator.compare("apple", "orange")
        0.0
        >>> comparator.compare(None, None)
        1.0
    """
    # Missing values are settled locally, without spending an LLM call.
    if value1 is None or value2 is None:
        return 1.0 if value1 is value2 else 0.0

    # Escape both values before they are interpolated into the prompt.
    prompt = self.prompt_template.render(
        value1=html.escape(str(value1)),
        value2=html.escape(str(value2)),
        eval_guidelines=self.eval_guidelines,
    )

    try:
        reply = self._invoke_agent(prompt)
        matched = "true" in reply.strip().lower()
        return 1.0 if matched else 0.0
    except NoCredentialsError:
        print("Error: AWS credentials not found.")
        raise
    except Exception as e:
        print(f"Error during LLM call: {e}")
        raise

`get_comparison_details(value1, value2)`

Get detailed information about a comparison operation.

This method provides comprehensive details about the comparison process, including the formatted prompt, LLM response, model information, and final comparison result. Useful for debugging, auditing, and understanding how the LLM made its decision.

Parameters:

Name	Type	Description	Default
`value1`	`Any`	First value to compare. Can be any type that converts to string.	required
`value2`	`Any`	Second value to compare. Can be any type that converts to string.	required

Returns:

Type Description

Dict[str, Any]

Dict[str, Any]: Dictionary containing comparison details: - 'prompt' (str): The formatted prompt sent to the LLM - 'llm_response' (str): Raw response from the LLM - 'model_id' (Union[Model, str]): The model used (string ID or Model instance) - 'comparison_result' (float): Final similarity score (0.0 or 1.0)

On error: - 'error' (str): Error message describing what went wrong - 'comparison_result' (bool): False to indicate failure

Example

comparator = LLMComparator(model=model, eval_guidelines="Consider abbreviations") details = comparator.get_comparison_details("St. John", "Saint John") print(details['llm_response']) true print(details['comparison_result']) 1.0 print('guidelines' in details['prompt']) True

Source code in stickler/comparators/llm.py

def get_comparison_details(self, value1: Any, value2: Any) -> Dict[str, Any]:
    """Run one comparison and report the prompt, raw reply, and score.

    The prompt is rendered from the stringified, HTML-escaped values and the
    configured evaluation guidelines, then sent to the LLM exactly once. The
    reported score is derived from that same reply, so ``llm_response`` and
    ``comparison_result`` always describe the same LLM call.

    Args:
        value1: First value to compare; converted with ``str()``.
        value2: Second value to compare; converted with ``str()``.

    Returns:
        Dict[str, Any]: On success:
            - 'prompt' (str): The formatted prompt sent to the LLM
            - 'llm_response' (str): Raw response from the LLM
            - 'model_id': The ``model`` the comparator was configured with
            - 'comparison_result' (float): 1.0 or 0.0, using the same rules
              as ``compare`` (reply contains "true", case-insensitive; if
              either value is None the score is 1.0 when both are None and
              0.0 otherwise, regardless of the reply)

            On error (any exception from the LLM call or reply parsing):
            - 'error' (str): Error message describing what went wrong
            - 'comparison_result' (bool): False to indicate failure

    Example:
        >>> comparator = LLMComparator(model=model, eval_guidelines="Consider abbreviations")
        >>> details = comparator.get_comparison_details("St. John", "Saint John")
        >>> print(details['llm_response'])
        true
        >>> print(details['comparison_result'])
        1.0
    """
    formatted_prompt = self.prompt_template.render(
        value1=html.escape(str(value1)),
        value2=html.escape(str(value2)),
        eval_guidelines=self.eval_guidelines,
    )

    try:
        response = self._invoke_agent(formatted_prompt)

        # Score from the reply we already have instead of calling
        # self.compare(), which would invoke the LLM a second time and could
        # yield a result that disagrees with the reported response.
        if value1 is None or value2 is None:
            # Same None semantics as compare(): decided without the reply.
            comparison_result = 1.0 if value1 is value2 else 0.0
        else:
            is_match = "true" in response.strip().lower()
            comparison_result = 1.0 if is_match else 0.0

        return {
            "prompt": formatted_prompt,
            "llm_response": response,
            "model_id": self.model,
            "comparison_result": comparison_result,
        }
    except Exception as e:
        # Deliberate best-effort: this is a diagnostics helper, so failures
        # are reported in the payload rather than raised.
        return {"error": str(e), "comparison_result": False}

`stickler.comparators.StructuredModelComparator`

Bases: BaseComparator

Comparator for structured model objects.

This comparator is designed to work with StructuredModel instances, leveraging their built-in comparison capabilities.

Source code in stickler/comparators/structured.py

class StructuredModelComparator(BaseComparator):
    """Comparator that delegates to a model's own ``compare`` method.

    Intended for StructuredModel instances: when the first value exposes a
    callable ``compare`` attribute, that method produces the score. Any other
    pair of values is scored by plain equality.
    """

    def __init__(self, threshold: float = 0.7, strict_types: bool = False):
        """Set up the comparator.

        Args:
            threshold: Similarity threshold (0.0-1.0), passed to the base class.
            strict_types: When True, ``compare`` raises TypeError if both
                values are strings.
        """
        super().__init__(threshold)
        self.strict_types = strict_types

    def compare(self, model1: Any, model2: Any) -> float:
        """Score the similarity of two values, preferring ``model1.compare``.

        Args:
            model1: First model (ideally a StructuredModel instance)
            model2: Second model (ideally a StructuredModel instance)

        Returns:
            The value returned by ``model1.compare(model2)`` when both values
            are non-None and ``model1`` has a callable ``compare``; otherwise
            1.0 if ``model1 == model2`` and 0.0 if not.

        Raises:
            TypeError: When strict_types=True and both values are strings.
        """
        # Strict mode only rejects the str/str case; every other combination
        # falls through to the logic below.
        both_strings = isinstance(model1, str) and isinstance(model2, str)
        if self.strict_types and both_strings:
            raise TypeError(
                "StructuredModelComparator can only compare StructuredModel instances"
            )

        if model1 is not None and model2 is not None:
            # Duck typing: only model1 needs to provide compare().
            delegate = getattr(model1, "compare", None)
            if callable(delegate):
                return delegate(model2)

        # None on either side, or no usable compare(): plain equality.
        return 1.0 if model1 == model2 else 0.0

`__init__(threshold=0.7, strict_types=False)`

Initialize the comparator.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	Similarity threshold (0.0-1.0)	`0.7`
`strict_types`	`bool`	If True, `compare` raises TypeError when both values being compared are strings	`False`

Source code in stickler/comparators/structured.py

def __init__(self, threshold: float = 0.7, strict_types: bool = False) -> None:
    """Initialize the comparator.

    Args:
        threshold: Similarity threshold (0.0-1.0); forwarded to the base
            class initializer.
        strict_types: Stored as ``self.strict_types``. When True, ``compare``
            raises TypeError if both values passed to it are strings.
    """
    # The base class owns the threshold; this class only adds strict_types.
    super().__init__(threshold)
    self.strict_types = strict_types

`compare(model1, model2)`

Compare two structured model instances.

This method uses the built-in compare method of StructuredModel objects if available, otherwise falls back to basic equality comparison.

Parameters:

Name	Type	Description	Default
`model1`	`Any`	First model (ideally a StructuredModel instance)	required
`model2`	`Any`	Second model (ideally a StructuredModel instance)	required

Returns:

Type	Description
`float`	Similarity score between 0.0 and 1.0

Raises:

Type	Description
`TypeError`	When strict_types=True and both values being compared are strings

Source code in stickler/comparators/structured.py

def compare(self, model1: Any, model2: Any) -> float:
    """Score the similarity of two values, preferring ``model1.compare``.

    When both values are non-None and ``model1`` exposes a callable
    ``compare`` attribute, the score is whatever ``model1.compare(model2)``
    returns. In every other case the values are compared with ``==``.

    Args:
        model1: First model (ideally a StructuredModel instance)
        model2: Second model (ideally a StructuredModel instance)

    Returns:
        The delegated score, or 1.0 if ``model1 == model2`` and 0.0 if not.

    Raises:
        TypeError: When strict_types=True and both values are strings.
    """
    # Strict mode only rejects the str/str case; every other combination
    # falls through to the logic below.
    both_strings = isinstance(model1, str) and isinstance(model2, str)
    if self.strict_types and both_strings:
        raise TypeError(
            "StructuredModelComparator can only compare StructuredModel instances"
        )

    if model1 is not None and model2 is not None:
        # Duck typing: only model1 needs to provide compare().
        delegate = getattr(model1, "compare", None)
        if callable(delegate):
            return delegate(model2)

    # None on either side, or no usable compare(): plain equality.
    return 1.0 if model1 == model2 else 0.0

`stickler.comparators.BBoxIoUComparator`

Bases: BaseComparator

Comparator for bounding boxes using Intersection over Union.

Compares two bounding boxes and returns their IoU as a similarity score between 0.0 and 1.0.

Bounding box formats accepted

Two-point: [[x1, y1], [x2, y2]]
Flat: [x1, y1, x2, y2]

Coordinates must be finite numbers; non-finite values (NaN, inf) are treated as malformed input and score 0.0. Booleans are accepted as coordinates (bool is a subclass of int: True == 1, False == 0), so guard upstream if that is not intended. Note that a zero-area box (a point, e.g. [[5, 5], [5, 5]]) has no area to intersect, so it scores IoU 0.0 even against an identical point — relevant when annotating point locations rather than regions.

Parameters:

Name	Type	Description	Default
`threshold`	`float`	IoU threshold for binary match classification (default: 0.5).	`0.5`

Example

from stickler.comparators.bbox import BBoxIoUComparator cmp = BBoxIoUComparator(threshold=0.5) cmp.compare([[0, 0], [10, 10]], [[0, 0], [10, 10]]) 1.0 cmp.compare([[0, 0], [5, 5]], [[5, 5], [10, 10]]) 0.0

Source code in stickler/comparators/bbox.py

class BBoxIoUComparator(BaseComparator):
    """Comparator for bounding boxes using Intersection over Union.

    Compares two bounding boxes and returns their IoU as a similarity
    score between 0.0 and 1.0.

    Bounding box formats accepted:
        - Two-point: [[x1, y1], [x2, y2]]
        - Flat: [x1, y1, x2, y2]

    Coordinates must be finite numbers; non-finite values (NaN, inf) are
    treated as malformed input and score 0.0. The same applies to finite
    coordinates so large that the box areas overflow to a non-finite value.
    Booleans are accepted as coordinates (``bool`` is a subclass of ``int``:
    ``True`` == 1, ``False`` == 0), so guard upstream if that is not
    intended. Note that a zero-area box (a point, e.g. ``[[5, 5], [5, 5]]``)
    has no area to intersect, so it scores IoU 0.0 even against an identical
    point — relevant when annotating point locations rather than regions.

    Args:
        threshold: IoU threshold for binary match classification (default: 0.5).

    Example:
        >>> from stickler.comparators.bbox import BBoxIoUComparator
        >>> cmp = BBoxIoUComparator(threshold=0.5)
        >>> cmp.compare([[0, 0], [10, 10]], [[0, 0], [10, 10]])
        1.0
        >>> cmp.compare([[0, 0], [5, 5]], [[5, 5], [10, 10]])
        0.0
    """

    def __init__(
        self,
        threshold: float = 0.5,
    ):
        """Initialize the comparator.

        Args:
            threshold: IoU threshold, forwarded to the base class.
        """
        super().__init__(threshold=threshold)

    def compare(self, bbox1: Any, bbox2: Any) -> float:
        """Compare two bounding boxes and return their IoU.

        Args:
            bbox1: First bounding box (prediction).
            bbox2: Second bounding box (ground truth).

        Returns:
            IoU score between 0.0 and 1.0. Two ``None`` boxes score 1.0; a
            single ``None`` or any malformed box scores 0.0.
        """
        if bbox1 is None and bbox2 is None:
            return 1.0
        if bbox1 is None or bbox2 is None:
            return 0.0

        coords1 = self._normalize_bbox(bbox1)
        coords2 = self._normalize_bbox(bbox2)

        # A box that failed normalization is a miss, not an error.
        if coords1 is None or coords2 is None:
            return 0.0

        return self._compute_iou(coords1, coords2)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_bbox(
        bbox: Any,
    ) -> Optional[Tuple[float, float, float, float]]:
        """Normalize a bounding box to (x1, y1, x2, y2) with x1<=x2, y1<=y2.

        Accepts:
            - [[x1, y1], [x2, y2]]
            - [x1, y1, x2, y2]

        Only ``list`` and ``tuple`` containers are recognized. In the flat
        format every element must be an ``int`` or ``float``; in the
        two-point format each coordinate is converted with ``float()``.

        Returns:
            (x_min, y_min, x_max, y_max) or None if the input is invalid.
        """
        try:
            if not isinstance(bbox, (list, tuple)):
                return None

            if len(bbox) == 2 and all(
                isinstance(p, (list, tuple)) and len(p) == 2 for p in bbox
            ):
                # Two-point format: [[x1, y1], [x2, y2]]
                x1, y1 = float(bbox[0][0]), float(bbox[0][1])
                x2, y2 = float(bbox[1][0]), float(bbox[1][1])
            elif len(bbox) == 4 and all(isinstance(v, (int, float)) for v in bbox):
                # Flat format: [x1, y1, x2, y2]
                x1, y1, x2, y2 = (float(v) for v in bbox)
            else:
                return None

            # Reject non-finite coordinates (NaN, inf) as malformed input so
            # they score as a miss rather than poisoning IoU output.
            if not all(math.isfinite(v) for v in (x1, y1, x2, y2)):
                return None

            # Corners may arrive in any order; sort them per axis.
            return (min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2))
        except (TypeError, ValueError, IndexError):
            # Conversion failures mean the input was not a usable box.
            return None

    @staticmethod
    def _compute_iou(
        box1: Tuple[float, float, float, float],
        box2: Tuple[float, float, float, float],
    ) -> float:
        """Compute IoU between two normalized boxes (x1, y1, x2, y2).

        Args:
            box1: (x_min, y_min, x_max, y_max)
            box2: (x_min, y_min, x_max, y_max)

        Returns:
            IoU value between 0.0 and 1.0. Returns 0.0 when the union area is
            zero or negative, or when it is not finite because the
            coordinates are large enough to overflow the area computation.
        """
        x_left = max(box1[0], box2[0])
        y_top = max(box1[1], box2[1])
        x_right = min(box1[2], box2[2])
        y_bottom = min(box1[3], box2[3])

        # Clamp each overlap extent at zero so disjoint boxes intersect in 0.
        inter_area = max(0.0, x_right - x_left) * max(0.0, y_bottom - y_top)

        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = area1 + area2 - inter_area

        # Finite but huge coordinates can overflow the products above to inf
        # (or NaN via inf - inf / inf * 0). Score those as a miss instead of
        # returning NaN, matching how non-finite coordinates are handled.
        if not math.isfinite(union_area) or union_area <= 0:
            return 0.0

        return inter_area / union_area

`compare(bbox1, bbox2)`

Compare two bounding boxes and return their IoU.

Parameters:

Name	Type	Description	Default
`bbox1`	`Any`	First bounding box (prediction).	required
`bbox2`	`Any`	Second bounding box (ground truth).	required

Returns:

Type	Description
`float`	IoU score between 0.0 and 1.0.

Source code in stickler/comparators/bbox.py

def compare(self, bbox1: Any, bbox2: Any) -> float:
    """Return the IoU of two bounding boxes.

    Args:
        bbox1: First bounding box (prediction).
        bbox2: Second bounding box (ground truth).

    Returns:
        IoU score between 0.0 and 1.0. Two ``None`` boxes score 1.0; a
        single ``None``, or any box that ``_normalize_bbox`` rejects,
        scores 0.0.
    """
    # Missing boxes: equal only when both are missing.
    if bbox1 is None or bbox2 is None:
        return 1.0 if bbox1 is bbox2 else 0.0

    # Normalize both inputs before deciding whether either one is malformed.
    normalized = [self._normalize_bbox(box) for box in (bbox1, bbox2)]
    if any(coords is None for coords in normalized):
        return 0.0

    return self._compute_iou(*normalized)

Comparators

stickler.comparators

stickler.comparators.BaseComparator

__call__(str1, str2)

__init__(threshold=0.7)

__repr__()

__str__()

binary_compare(str1, str2)

compare(str1, str2) abstractmethod

stickler.comparators.ExactComparator

__init__(threshold=1.0, case_sensitive=False)

compare(str1, str2)

stickler.comparators.LevenshteinComparator

config property

name property

__init__(normalize=True, threshold=0.7)

compare(s1, s2)

stickler.comparators.NumericComparator

config property

__init__(threshold=1.0, relative_tolerance=0.0, absolute_tolerance=0.0, tolerance=None)

compare(str1, str2)

stickler.comparators.NumericExactC = NumericComparator module-attribute

stickler.comparators.DateComparator

config property

compare(str1, str2)

stickler.comparators.FuzzyComparator

config property

name property

__init__(method='ratio', normalize=True, threshold=0.7)

compare(value1, value2)

stickler.comparators.BERTComparator

__init__(threshold=0.7)

compare(str1, str2)

stickler.comparators.SemanticComparator

__init__(model_id='amazon.titan-embed-text-v2:0', sim_function='cosine_similarity', embedding_function=None, threshold=0.7)

compare(str1, str2)

stickler.comparators.LLMComparator

__init__(model=None, eval_guidelines=None)

Basic initialization

With custom model and guidelines

compare(value1, value2)

get_comparison_details(value1, value2)

stickler.comparators.StructuredModelComparator

__init__(threshold=0.7, strict_types=False)

compare(model1, model2)

stickler.comparators.BBoxIoUComparator

compare(bbox1, bbox2)

`stickler.comparators`

`stickler.comparators.BaseComparator`

`__call__(str1, str2)`

`__init__(threshold=0.7)`

`__repr__()`

`__str__()`

`binary_compare(str1, str2)`

`compare(str1, str2)` `abstractmethod`

`stickler.comparators.ExactComparator`

`__init__(threshold=1.0, case_sensitive=False)`

`compare(str1, str2)`

`stickler.comparators.LevenshteinComparator`

`config` `property`

`name` `property`

`__init__(normalize=True, threshold=0.7)`

`compare(s1, s2)`

`stickler.comparators.NumericComparator`

`config` `property`

`__init__(threshold=1.0, relative_tolerance=0.0, absolute_tolerance=0.0, tolerance=None)`

`compare(str1, str2)`

`stickler.comparators.NumericExactC = NumericComparator` `module-attribute`

`stickler.comparators.DateComparator`

`config` `property`

`compare(str1, str2)`

`stickler.comparators.FuzzyComparator`

`config` `property`

`name` `property`

`__init__(method='ratio', normalize=True, threshold=0.7)`

`compare(value1, value2)`

`stickler.comparators.BERTComparator`

`__init__(threshold=0.7)`

`compare(str1, str2)`

`stickler.comparators.SemanticComparator`

`__init__(model_id='amazon.titan-embed-text-v2:0', sim_function='cosine_similarity', embedding_function=None, threshold=0.7)`

`compare(str1, str2)`

`stickler.comparators.LLMComparator`

`__init__(model=None, eval_guidelines=None)`

`compare(value1, value2)`

`get_comparison_details(value1, value2)`

`stickler.comparators.StructuredModelComparator`

`__init__(threshold=0.7, strict_types=False)`

`compare(model1, model2)`

`stickler.comparators.BBoxIoUComparator`

`compare(bbox1, bbox2)`