
Evaluator

stickler.structured_object_evaluator.evaluator

Evaluator for StructuredModel objects.

This module provides an evaluator class for computing metrics on StructuredModel objects, leveraging their built-in comparison capabilities to generate comprehensive metrics. It also supports documenting non-matches (false positives, false negatives) for detailed analysis.

stickler.structured_object_evaluator.evaluator.StructuredModelEvaluator

Evaluator for StructuredModel objects.

This evaluator computes comprehensive metrics for StructuredModel objects, leveraging their built-in comparison capabilities. It includes confusion matrix calculations, field-level metrics, non-match documentation, and memory optimization capabilities.
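
A minimal usage sketch (the ground-truth and prediction objects are assumed to be pre-built instances of the same `StructuredModel` subclass; they are placeholders here, not part of this module):

```python
from stickler.structured_object_evaluator.evaluator import StructuredModelEvaluator

# gt_model and pred_model are assumed, pre-built instances of one
# StructuredModel subclass (placeholders for this sketch).
evaluator = StructuredModelEvaluator(threshold=0.5, document_non_matches=True)

results = evaluator.evaluate(gt_model, pred_model)
print(results["overall"]["f1"])          # overall F1 score in [0, 1]
print(results["overall"]["anls_score"])  # overall ANLS similarity score
```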

Source code in stickler/structured_object_evaluator/evaluator.py
class StructuredModelEvaluator:
    """
    Evaluator for StructuredModel objects.

    This evaluator computes comprehensive metrics for StructuredModel objects,
    leveraging their built-in comparison capabilities. It includes confusion matrix
    calculations, field-level metrics, non-match documentation, and memory optimization capabilities.
    """

    def __init__(
        self,
        model_class: Optional[Type[StructuredModel]] = None,
        threshold: float = 0.5,
        verbose: bool = False,
        document_non_matches: bool = True,
        recall_with_fd: bool = False
    ):
        """
        Initialize the evaluator.

        Args:
            model_class: Optional StructuredModel class for type checking
            threshold: Similarity threshold for considering a match
            verbose: Whether to print detailed progress information
            document_non_matches: Whether to document detailed non-match information
            recall_with_fd: Whether to include false discoveries (FD) in the recall denominator when computing recall
        """
        self.model_class = model_class
        self.threshold = threshold
        self.verbose = verbose
        self.peak_memory_usage = 0
        self.recall_with_fd = recall_with_fd
        self.start_memory = get_memory_usage()

        # New attributes for documenting non-matches
        self.document_non_matches = document_non_matches
        self.non_match_documents: List[NonMatchField] = []

        warnings.warn(
            "This module is going to be removed in future versions. Use the StructuredModel.compare_with() method.",
            DeprecationWarning,
            stacklevel=2,
        )



        if self.verbose:
            print(
                f"Initialized StructuredModelEvaluator. Starting memory: {self.start_memory:.2f} MB"
            )

    def _check_memory(self):
        """Check current memory usage and update peak memory."""
        current_memory = get_memory_usage()

        if current_memory > self.peak_memory_usage:
            self.peak_memory_usage = current_memory

        if self.verbose and current_memory > self.start_memory + 100:  # 100MB increase
            print(f"Memory usage increased: {current_memory:.2f} MB")

        return current_memory

    def _calculate_metrics_from_binary(
        self,
        tp: float,
        fp: float,
        fn: float,
        tn: float = 0.0,
        fd: float = 0.0,
        recall_with_fd: bool = False,
    ) -> Dict[str, float]:
        """
        Calculate metrics from binary classification counts.

        Args:
            tp: True positive count
            fp: False positive count
            fn: False negative count
            tn: True negative count (default 0)
            fd: False discovery count (default 0) - used only when recall_with_fd=True
            recall_with_fd: Whether to use alternative recall formula including FD in denominator

        Returns:
            Dictionary with precision, recall, F1, and accuracy
        """
        # Calculate precision
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0

        # Calculate recall based on the selected formula
        if recall_with_fd:
            # Alternative recall: TP / (TP + FN + FD)
            recall = tp / (tp + fn + fd) if (tp + fn + fd) > 0 else 0.0
        else:
            # Traditional recall: TP / (TP + FN)
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

        # Calculate F1 score
        f1 = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )

        # Calculate accuracy
        accuracy = (tp + tn) / (tp + fp + fn + tn) if (tp + fp + fn + tn) > 0 else 0.0

        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "accuracy": accuracy,
        }

    def calculate_derived_confusion_matrix_metrics(
        self, cm_counts: Dict[str, Union[int, float]]
    ) -> Dict[str, float]:
        """
        Calculate derived metrics from confusion matrix counts.

        This method uses MetricsHelper to maintain consistency and avoid code duplication.

        Args:
            cm_counts: Dictionary with confusion matrix counts containing keys:
                      'tp', 'fp', 'tn', 'fn', and optionally 'fd', 'fa'

        Returns:
            Dictionary with derived metrics: cm_precision, cm_recall, cm_f1, cm_accuracy
        """
        # Use MetricsHelper for consistent metric calculation
        from stickler.structured_object_evaluator.models.metrics_helper import (
            MetricsHelper,
        )

        metrics_helper = MetricsHelper()

        # Convert counts to the format expected by MetricsHelper
        metrics_dict = {
            "tp": int(cm_counts.get("tp", 0)),
            "fp": int(cm_counts.get("fp", 0)),
            "tn": int(cm_counts.get("tn", 0)),
            "fn": int(cm_counts.get("fn", 0)),
            "fd": int(cm_counts.get("fd", 0)),
            "fa": int(cm_counts.get("fa", 0)),
        }

        # Use MetricsHelper to calculate derived metrics
        return metrics_helper.calculate_derived_metrics(metrics_dict)

    def _convert_score_to_binary(self, score: float) -> Dict[str, float]:
        """
        Convert an ANLS Star score to binary classification counts.

        Args:
            score: ANLS Star similarity score [0-1]

        Returns:
            Dictionary with TP, FP, FN, TN counts
        """
        # For a single field comparison, there are different approaches
        # to convert a similarity score to binary classification:

        # Approach used here: If score >= threshold, count as TP with
        # proportional value, otherwise count as partial FP and partial FN
        if score >= self.threshold:
            # Handle as true positive with proportional credit
            tp = score  # Proportional TP
            fp = (
                1 - score if score < 1.0 else 0
            )  # Proportional FP for imperfect matches
            fn = 0
            tn = 0
        else:
            # Handle as false classification
            tp = 0
            fp = score  # Give partial credit for similarity even if below threshold
            fn = 1 - score  # More different = higher FN
            tn = 0

        return {"tp": tp, "fp": fp, "fn": fn, "tn": tn}

    def _is_null_value(self, value: Any) -> bool:
        """
        Determine if a value should be considered null or empty.

        Args:
            value: The value to check

        Returns:
            True if the value is null/empty, False otherwise
        """
        if value is None:
            return True
        elif hasattr(value, "__len__") and not isinstance(
            value, (str, bytes, bytearray)
        ):
            # Consider empty lists/collections as null values
            return len(value) == 0
        elif isinstance(value, (str, bytes, bytearray)):
            return len(value.strip()) == 0
        return False

    def combine_cm_dicts(
        self, cm1: Dict[str, int], cm2: Dict[str, int]
    ) -> Dict[str, int]:
        """
        Combine two confusion matrix dictionaries by adding corresponding values.

        Args:
            cm1: First confusion matrix dictionary
            cm2: Second confusion matrix dictionary

        Returns:
            Combined confusion matrix dictionary
        """
        return {
            key: cm1.get(key, 0) + cm2.get(key, 0)
            for key in ["tp", "fa", "fd", "fp", "tn", "fn"]
        }

    def add_non_match(
        self,
        field_path: str,
        non_match_type: NonMatchType,
        gt_value: Any,
        pred_value: Any,
        similarity_score: Optional[float] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        """
        Document a non-match with detailed information.

        Args:
            field_path: Dot-notation path to the field (e.g., 'address.city')
            non_match_type: Type of non-match
            gt_value: Ground truth value
            pred_value: Predicted value
            similarity_score: Optional similarity score if available
            details: Optional additional context or details
        """
        if not self.document_non_matches:
            return

        self.non_match_documents.append(
            NonMatchField(
                field_path=field_path,
                non_match_type=non_match_type,
                ground_truth_value=gt_value,
                prediction_value=pred_value,
                similarity_score=similarity_score,
                details=details or {},
            )
        )

    def clear_non_match_documents(self):
        """Clear the stored non-match documents."""
        self.non_match_documents = []

    def _convert_enhanced_non_match_to_field(
        self, nm_dict: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Convert enhanced non-match format to NonMatchField format.

        Args:
            nm_dict: Enhanced non-match dictionary from StructuredModel

        Returns:
            Dictionary in NonMatchField format
        """
        # Map enhanced format to NonMatchField format
        converted = {
            "field_path": nm_dict.get("field_path", ""),
            "ground_truth_value": nm_dict.get("ground_truth_value"),
            "prediction_value": nm_dict.get("prediction_value"),
            "similarity_score": nm_dict.get("similarity_score"),
            "details": nm_dict.get("details", {}),
        }

        # The non_match_type is already a NonMatchType enum from StructuredModel
        converted["non_match_type"] = nm_dict.get("non_match_type")

        return converted

    def _compare_models(
        self, gt_model: StructuredModel, pred_model: StructuredModel
    ) -> Dict[str, Any]:
        """
        Compare two StructuredModel instances and return metrics.

        Args:
            gt_model: Ground truth model
            pred_model: Predicted model

        Returns:
            Dict with comparison metrics including tp, fp, fn, tn, field_scores, overall_score
        """
        # Check if inputs are valid StructuredModel instances
        if not (
            isinstance(gt_model, StructuredModel)
            and isinstance(pred_model, StructuredModel)
        ):
            raise TypeError("Both models must be StructuredModel instances")

        # If model_class is specified, check type
        if self.model_class and not (
            isinstance(gt_model, self.model_class)
            and isinstance(pred_model, self.model_class)
        ):
            raise TypeError(
                f"Both models must be instances of {self.model_class.__name__}"
            )

        # Use the built-in compare_with method from StructuredModel
        comparison_result = gt_model.compare_with(pred_model)

        # Initialize metrics
        tp = fp = fn = tn = 0

        # Determine match status
        if comparison_result["overall_score"] >= self.threshold:
            # Good enough match
            tp = 1
        else:
            # Not a good enough match
            fp = 1

        # Prepare result
        result = {
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn,
            "field_scores": comparison_result["field_scores"],
            "overall_score": comparison_result["overall_score"],
            # match_status removed - now unnecessary
        }

        return result

    def evaluate(
        self,
        ground_truth: StructuredModel,
        predictions: StructuredModel,
        recall_with_fd: bool = False,
    ) -> Dict[str, Any]:
        """
        Evaluate predictions against ground truth and return comprehensive metrics.

        Args:
            ground_truth: Ground truth data (StructuredModel instance)
            predictions: Predicted data (StructuredModel instance)
            recall_with_fd: If True, include FD in recall denominator (TP/(TP+FN+FD))
                            If False, use traditional recall (TP/(TP+FN))

        Returns:
            Dictionary with the following structure:

            {
                "overall": {
                    "precision": float,     # Overall precision [0-1]
                    "recall": float,        # Overall recall [0-1]
                    "f1": float,           # Overall F1 score [0-1]
                    "accuracy": float,     # Overall accuracy [0-1]
                    "anls_score": float    # Overall ANLS similarity score [0-1]
                },

                "fields": {
                    "<field_name>": {
                        # For primitive fields (str, int, float, bool):
                        "precision": float,
                        "recall": float,
                        "f1": float,
                        "accuracy": float,
                        "anls_score": float
                    },

                    "<list_field_name>": {
                        # For list fields (e.g., products: List[Product]):
                        "overall": {
                            "precision": float,
                            "recall": float,
                            "f1": float,
                            "accuracy": float,
                            "anls_score": float
                        },
                        "items": [
                            # Individual metrics for each matched item pair
                            {
                                "overall": {...},  # Item-level overall metrics
                                "fields": {        # Field metrics within each item
                                    "<nested_field>": {...}
                                }
                            }
                        ]
                    }
                },

                "confusion_matrix": {
                    "fields": {
                        # AGGREGATED metrics for all field types
                        "<field_name>": {
                            "tp": int,          # True positives
                            "fp": int,          # False positives
                            "tn": int,          # True negatives
                            "fn": int,          # False negatives
                            "fd": int,          # False discoveries (non-null but don't match)
                            "fa": int,          # False alarms
                            "derived": {
                                "cm_precision": float,
                                "cm_recall": float,
                                "cm_f1": float,
                                "cm_accuracy": float
                            }
                        },

                        # For list fields with nested objects, aggregated field metrics:
                        "<list_field>.<nested_field>": {
                            # Aggregated counts across ALL instances in the list
                            "tp": int,    # Total true positives for this field across all items
                            "fp": int,    # Total false positives for this field across all items
                            "fn": int,    # Total false negatives for this field across all items
                            "fd": int,    # Total false discoveries for this field across all items
                            "fa": int,    # Total false alarms for this field across all items
                            "derived": {...}
                        }
                    },

                    "overall": {
                        # Overall confusion matrix aggregating all fields
                        "tp": int, "fp": int, "tn": int, "fn": int, "fd": int, "fa": int
                        "derived": {...}
                    }
                }
            }

        Key Usage Patterns:

        1. **Individual Item Metrics** (per-instance analysis):
           ```python
           # Access metrics for each individual item in a list
           for i, item_metrics in enumerate(results['fields']['products']['items']):
               print(f"Product {i}: {item_metrics['overall']['f1']}")
           ```

        2. **Aggregated Field Metrics** (recommended for field performance analysis):
           ```python
           # Access aggregated metrics across all instances of a field type
           cm_fields = results['confusion_matrix']['fields']
           product_id_performance = cm_fields['products.product_id']
           print(f"Product ID field: {product_id_performance['derived']['cm_precision']}")

           # Get all aggregated product field metrics
           product_fields = {k: v for k, v in cm_fields.items()
                           if k.startswith('products.')}
           ```

        3. **Helper Function for Aggregated Metrics**:
           ```python
           def get_aggregated_metrics(results, list_field_name):
               '''Extract aggregated field metrics for a list field.'''
               cm_fields = results['confusion_matrix']['fields']
               prefix = f"{list_field_name}."
               return {k.replace(prefix, ''): v for k, v in cm_fields.items()
                      if k.startswith(prefix)}

           # Usage:
           product_metrics = get_aggregated_metrics(results, 'products')
           print(f"Product name precision: {product_metrics['name']['derived']['cm_precision']}")
           ```

        Note:
            - Use `results['fields'][field]['items']` for per-instance analysis
            - Use `results['confusion_matrix']['fields'][field.subfield]` for aggregated field analysis
            - Aggregated metrics provide rolled-up performance across all instances of a field type
            - Confusion matrix metrics use standard TP/FP/TN/FN/FD classification with derived metrics
        """
        # Clear any existing non-match documents
        self.clear_non_match_documents()

        # Use StructuredModel's enhanced comparison with evaluator format
        # This pushes all the heavy lifting into the StructuredModel as requested
        result = ground_truth.compare_with(
            predictions,
            include_confusion_matrix=True,
            document_non_matches=self.document_non_matches,
            evaluator_format=True,  # This makes StructuredModel return evaluator-compatible format
            recall_with_fd=recall_with_fd,
        )

        # Add non-matches to evaluator's collection if they exist
        if result.get("non_matches"):
            for nm_dict in result["non_matches"]:
                # Convert enhanced non-match format to NonMatchField format
                converted_nm = self._convert_enhanced_non_match_to_field(nm_dict)
                self.non_match_documents.append(NonMatchField(**converted_nm))

        # Process derived metrics explicitly with recall_with_fd parameter
        if "confusion_matrix" in result and "overall" in result["confusion_matrix"]:
            overall_cm = result["confusion_matrix"]["overall"]

            # Update derived metrics directly in the result
            from stickler.structured_object_evaluator.models.metrics_helper import (
                MetricsHelper,
            )

            metrics_helper = MetricsHelper()

            # Apply correct recall_with_fd to overall metrics
            derived_metrics = metrics_helper.calculate_derived_metrics(
                overall_cm, recall_with_fd=recall_with_fd
            )
            result["confusion_matrix"]["overall"]["derived"] = derived_metrics

            # Copy these to the top-level metrics if needed
            if "overall" in result:
                result["overall"]["precision"] = derived_metrics["cm_precision"]
                result["overall"]["recall"] = derived_metrics["cm_recall"]
                result["overall"]["f1"] = derived_metrics["cm_f1"]

        return result

    def _format_evaluation_results(
        self, comparison_result: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format StructuredModel comparison results to match expected evaluator output format.

        Args:
            comparison_result: Result from StructuredModel.compare_with()

        Returns:
            Dictionary in the expected evaluator format
        """
        # Extract components from StructuredModel result
        field_scores = comparison_result["field_scores"]
        overall_score = comparison_result["overall_score"]
        confusion_matrix = comparison_result.get("confusion_matrix", {})
        non_matches = comparison_result.get("non_matches", [])

        # Calculate field metrics using existing logic for backward compatibility
        field_metrics = {}

        for field_name, score in field_scores.items():
            # Convert field score to binary metrics using existing method
            binary = self._convert_score_to_binary(score)
            # For field metrics, fd is often not available directly, so we ignore recall_with_fd
            metrics = self._calculate_metrics_from_binary(
                binary["tp"], binary["fp"], binary["fn"], binary["tn"]
            )
            metrics["anls_score"] = score
            field_metrics[field_name] = metrics

        # Calculate overall metrics
        binary = self._convert_score_to_binary(overall_score)
        # For overall metrics, use confusion_matrix data which should have fd
        overall_fd = confusion_matrix.get("overall", {}).get("fd", 0)
        overall_metrics = self._calculate_metrics_from_binary(
            binary["tp"],
            binary["fp"],
            binary["fn"],
            binary["tn"],
            fd=overall_fd,
            recall_with_fd=self.recall_with_fd,
        )
        overall_metrics["anls_score"] = overall_score

        # Add non-matches to evaluator's collection if they exist
        if non_matches:
            for nm_dict in non_matches:
                self.non_match_documents.append(NonMatchField(**nm_dict))

        # Prepare final result in expected format
        result = {
            "overall": overall_metrics,
            "fields": field_metrics,
            "confusion_matrix": confusion_matrix,
            "non_matches": non_matches,
        }

        return result

    def _compare_model_lists(
        self, gt_models: List[StructuredModel], pred_models: List[StructuredModel]
    ) -> Dict[str, Any]:
        """
        Compare two lists of StructuredModel instances using Hungarian matching.

        Args:
            gt_models: List of ground truth models
            pred_models: List of predicted models

        Returns:
            Dict with comparison metrics including tp, fp, fn, overall_score
        """
        # Handle empty lists
        if not gt_models and not pred_models:
            return {
                "tp": 0,
                "fp": 0,
                "fn": 0,
                "tn": 0,
                "overall_score": 1.0,  # Empty lists are a perfect match
            }

        if not gt_models:
            return {
                "tp": 0,
                "fp": len(pred_models),
                "fn": 0,
                "tn": 0,
                "overall_score": 0.0,  # All predictions are false positives
            }

        if not pred_models:
            return {
                "tp": 0,
                "fp": 0,
                "fn": len(gt_models),
                "tn": 0,
                "overall_score": 0.0,  # All ground truths are false negatives
            }

        # Ensure all items are StructuredModel instances
        if not all(
            isinstance(model, StructuredModel) for model in gt_models + pred_models
        ):
            raise TypeError("All items in both lists must be StructuredModel instances")

        # If model_class is specified, check type for all models
        if self.model_class:
            if not all(
                isinstance(model, self.model_class) for model in gt_models + pred_models
            ):
                raise TypeError(
                    f"All models must be instances of {self.model_class.__name__}"
                )

        # Create a Hungarian matcher with StructuredModelComparator
        hungarian = HungarianMatcher(StructuredModelComparator())

        # Run Hungarian matching
        tp, fp = hungarian(gt_models, pred_models)

        # Calculate false negatives
        fn = len(gt_models) - tp

        # Calculate overall score (proportion of correct matches)
        max_items = max(len(gt_models), len(pred_models))
        overall_score = tp / max_items if max_items > 0 else 1.0

        return {"tp": tp, "fp": fp, "fn": fn, "tn": 0, "overall_score": overall_score}

__init__(model_class=None, threshold=0.5, verbose=False, document_non_matches=True, recall_with_fd=False)

Initialize the evaluator.

Parameters:

Name Type Description Default
model_class Optional[Type[StructuredModel]]

Optional StructuredModel class for type checking

None
threshold float

Similarity threshold for considering a match

0.5
verbose bool

Whether to print detailed progress information

False
document_non_matches bool

Whether to document detailed non-match information

True
recall_with_fd bool

Whether to include false discoveries (FD) in the recall denominator when computing recall

False
Source code in stickler/structured_object_evaluator/evaluator.py
def __init__(
    self,
    model_class: Optional[Type[StructuredModel]] = None,
    threshold: float = 0.5,
    verbose: bool = False,
    document_non_matches: bool = True,
    recall_with_fd: bool = False
):
    """
    Initialize the evaluator.

    Args:
        model_class: Optional StructuredModel class for type checking
        threshold: Similarity threshold for considering a match
        verbose: Whether to print detailed progress information
        document_non_matches: Whether to document detailed non-match information
        recall_with_fd: Whether to include false discoveries (FD) in the recall denominator when computing recall
    """
    self.model_class = model_class
    self.threshold = threshold
    self.verbose = verbose
    self.peak_memory_usage = 0
    self.recall_with_fd = recall_with_fd
    self.start_memory = get_memory_usage()

    # New attributes for documenting non-matches
    self.document_non_matches = document_non_matches
    self.non_match_documents: List[NonMatchField] = []

    warnings.warn(
        "This module is going to be removed in future versions. Use the StructuredModel.compare_with() method.",
        DeprecationWarning,
        stacklevel=2,
    )



    if self.verbose:
        print(
            f"Initialized StructuredModelEvaluator. Starting memory: {self.start_memory:.2f} MB"
        )
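
Because the constructor emits a DeprecationWarning, code that still relies on this evaluator may want to silence that warning explicitly while migrating; a small sketch (parameter values are illustrative only):

```python
import warnings

from stickler.structured_object_evaluator.evaluator import StructuredModelEvaluator

with warnings.catch_warnings():
    # The module warns that it will be removed in favour of
    # StructuredModel.compare_with(); suppress that warning deliberately here.
    warnings.simplefilter("ignore", DeprecationWarning)
    evaluator = StructuredModelEvaluator(
        threshold=0.7,        # stricter than the 0.5 default
        verbose=True,         # print memory and progress information
        recall_with_fd=True,  # include false discoveries in recall denominators
    )
```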

add_non_match(field_path, non_match_type, gt_value, pred_value, similarity_score=None, details=None)

Document a non-match with detailed information.

Parameters:

Name Type Description Default
field_path str

Dot-notation path to the field (e.g., 'address.city')

required
non_match_type NonMatchType

Type of non-match

required
gt_value Any

Ground truth value

required
pred_value Any

Predicted value

required
similarity_score Optional[float]

Optional similarity score if available

None
details Optional[Dict[str, Any]]

Optional additional context or details

None
Source code in stickler/structured_object_evaluator/evaluator.py
def add_non_match(
    self,
    field_path: str,
    non_match_type: NonMatchType,
    gt_value: Any,
    pred_value: Any,
    similarity_score: Optional[float] = None,
    details: Optional[Dict[str, Any]] = None,
):
    """
    Document a non-match with detailed information.

    Args:
        field_path: Dot-notation path to the field (e.g., 'address.city')
        non_match_type: Type of non-match
        gt_value: Ground truth value
        pred_value: Predicted value
        similarity_score: Optional similarity score if available
        details: Optional additional context or details
    """
    if not self.document_non_matches:
        return

    self.non_match_documents.append(
        NonMatchField(
            field_path=field_path,
            non_match_type=non_match_type,
            ground_truth_value=gt_value,
            prediction_value=pred_value,
            similarity_score=similarity_score,
            details=details or {},
        )
    )
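
A sketch of recording a non-match by hand; the `NonMatchType.FALSE_NEGATIVE` member used below is an assumption, so check the actual enum for its available values:

```python
# Assumes `evaluator` was created with document_non_matches=True and that
# NonMatchType exposes a FALSE_NEGATIVE member (assumption for this sketch).
evaluator.add_non_match(
    field_path="address.city",
    non_match_type=NonMatchType.FALSE_NEGATIVE,
    gt_value="Berlin",
    pred_value=None,
    similarity_score=0.0,
    details={"reason": "field missing from prediction"},
)

print(len(evaluator.non_match_documents))  # stored NonMatchField entries
```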

calculate_derived_confusion_matrix_metrics(cm_counts)

Calculate derived metrics from confusion matrix counts.

This method uses MetricsHelper to maintain consistency and avoid code duplication.

Parameters:

Name Type Description Default
cm_counts Dict[str, Union[int, float]]

Dictionary with confusion matrix counts containing keys: 'tp', 'fp', 'tn', 'fn', and optionally 'fd', 'fa'

required

Returns:

Type Description
Dict[str, float]

Dictionary with derived metrics: cm_precision, cm_recall, cm_f1, cm_accuracy

Source code in stickler/structured_object_evaluator/evaluator.py
def calculate_derived_confusion_matrix_metrics(
    self, cm_counts: Dict[str, Union[int, float]]
) -> Dict[str, float]:
    """
    Calculate derived metrics from confusion matrix counts.

    This method uses MetricsHelper to maintain consistency and avoid code duplication.

    Args:
        cm_counts: Dictionary with confusion matrix counts containing keys:
                  'tp', 'fp', 'tn', 'fn', and optionally 'fd', 'fa'

    Returns:
        Dictionary with derived metrics: cm_precision, cm_recall, cm_f1, cm_accuracy
    """
    # Use MetricsHelper for consistent metric calculation
    from stickler.structured_object_evaluator.models.metrics_helper import (
        MetricsHelper,
    )

    metrics_helper = MetricsHelper()

    # Convert counts to the format expected by MetricsHelper
    metrics_dict = {
        "tp": int(cm_counts.get("tp", 0)),
        "fp": int(cm_counts.get("fp", 0)),
        "tn": int(cm_counts.get("tn", 0)),
        "fn": int(cm_counts.get("fn", 0)),
        "fd": int(cm_counts.get("fd", 0)),
        "fa": int(cm_counts.get("fa", 0)),
    }

    # Use MetricsHelper to calculate derived metrics
    return metrics_helper.calculate_derived_metrics(metrics_dict)
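
For example, raw counts can be turned into derived metrics directly (the counts below are illustrative, and `evaluator` is an existing StructuredModelEvaluator instance):

```python
cm_counts = {"tp": 8, "fp": 2, "tn": 0, "fn": 1, "fd": 1, "fa": 0}

derived = evaluator.calculate_derived_confusion_matrix_metrics(cm_counts)
# `derived` contains cm_precision, cm_recall, cm_f1 and cm_accuracy.
print(derived["cm_precision"], derived["cm_recall"])
```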

clear_non_match_documents()

Clear the stored non-match documents.

Source code in stickler/structured_object_evaluator/evaluator.py
def clear_non_match_documents(self):
    """Clear the stored non-match documents."""
    self.non_match_documents = []

combine_cm_dicts(cm1, cm2)

Combine two confusion matrix dictionaries by adding corresponding values.

Parameters:

Name Type Description Default
cm1 Dict[str, int]

First confusion matrix dictionary

required
cm2 Dict[str, int]

Second confusion matrix dictionary

required

Returns:

Type Description
Dict[str, int]

Combined confusion matrix dictionary

Source code in stickler/structured_object_evaluator/evaluator.py
def combine_cm_dicts(
    self, cm1: Dict[str, int], cm2: Dict[str, int]
) -> Dict[str, int]:
    """
    Combine two confusion matrix dictionaries by adding corresponding values.

    Args:
        cm1: First confusion matrix dictionary
        cm2: Second confusion matrix dictionary

    Returns:
        Combined confusion matrix dictionary
    """
    return {
        key: cm1.get(key, 0) + cm2.get(key, 0)
        for key in ["tp", "fa", "fd", "fp", "tn", "fn"]
    }
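
A quick illustration of summing two per-document confusion matrices (counts are made up; `evaluator` is an existing instance):

```python
cm_doc1 = {"tp": 3, "fp": 1, "fn": 0, "tn": 0, "fd": 1, "fa": 0}
cm_doc2 = {"tp": 2, "fp": 0, "fn": 2, "tn": 1, "fd": 0, "fa": 1}

combined = evaluator.combine_cm_dicts(cm_doc1, cm_doc2)
print(combined)  # {'tp': 5, 'fa': 1, 'fd': 1, 'fp': 1, 'tn': 1, 'fn': 2}
```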

evaluate(ground_truth, predictions, recall_with_fd=False)

Evaluate predictions against ground truth and return comprehensive metrics.

Parameters:

Name Type Description Default
ground_truth StructuredModel

Ground truth data (StructuredModel instance)

required
predictions StructuredModel

Predicted data (StructuredModel instance)

required
recall_with_fd bool

If True, include FD in recall denominator (TP/(TP+FN+FD)) If False, use traditional recall (TP/(TP+FN))

False
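
To make the difference concrete, with illustrative counts the two recall variants diverge as follows:

```python
tp, fn, fd = 8, 1, 1                       # illustrative counts only
recall_traditional = tp / (tp + fn)        # 8 / 9  ≈ 0.889
recall_including_fd = tp / (tp + fn + fd)  # 8 / 10 = 0.800
```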

Returns:

Type Description
Dict[str, Any]

Dictionary with the following structure:

    {
        "overall": {
            "precision": float,     # Overall precision [0-1]
            "recall": float,        # Overall recall [0-1]
            "f1": float,            # Overall F1 score [0-1]
            "accuracy": float,      # Overall accuracy [0-1]
            "anls_score": float     # Overall ANLS similarity score [0-1]
        },

        "fields": {
            "<field_name>": {
                # For primitive fields (str, int, float, bool):
                "precision": float,
                "recall": float,
                "f1": float,
                "accuracy": float,
                "anls_score": float
            },

            "<list_field_name>": {
                # For list fields (e.g., products: List[Product]):
                "overall": {
                    "precision": float,
                    "recall": float,
                    "f1": float,
                    "accuracy": float,
                    "anls_score": float
                },
                "items": [
                    # Individual metrics for each matched item pair
                    {
                        "overall": {...},  # Item-level overall metrics
                        "fields": {        # Field metrics within each item
                            "<nested_field>": {...}
                        }
                    }
                ]
            }
        },

        "confusion_matrix": {
            "fields": {
                # AGGREGATED metrics for all field types
                "<field_name>": {
                    "tp": int,          # True positives
                    "fp": int,          # False positives
                    "tn": int,          # True negatives
                    "fn": int,          # False negatives
                    "fd": int,          # False discoveries (non-null but don't match)
                    "fa": int,          # False alarms
                    "derived": {
                        "cm_precision": float,
                        "cm_recall": float,
                        "cm_f1": float,
                        "cm_accuracy": float
                    }
                },

                # For list fields with nested objects, aggregated field metrics:
                "<list_field>.<nested_field>": {
                    # Aggregated counts across ALL instances in the list
                    "tp": int,    # Total true positives for this field across all items
                    "fp": int,    # Total false positives for this field across all items
                    "fn": int,    # Total false negatives for this field across all items
                    "fd": int,    # Total false discoveries for this field across all items
                    "fa": int,    # Total false alarms for this field across all items
                    "derived": {...}
                }
            },

            "overall": {
                # Overall confusion matrix aggregating all fields
                "tp": int, "fp": int, "tn": int, "fn": int, "fd": int, "fa": int,
                "derived": {...}
            }
        }
    }

Key Usage Patterns:

  1. Individual Item Metrics (per-instance analysis):

    # Access metrics for each individual item in a list
    for i, item_metrics in enumerate(results['fields']['products']['items']):
        print(f"Product {i}: {item_metrics['overall']['f1']}")
    

  2. Aggregated Field Metrics (recommended for field performance analysis):

    # Access aggregated metrics across all instances of a field type
    cm_fields = results['confusion_matrix']['fields']
    product_id_performance = cm_fields['products.product_id']
    print(f"Product ID field: {product_id_performance['derived']['cm_precision']}")
    
    # Get all aggregated product field metrics
    product_fields = {k: v for k, v in cm_fields.items()
                    if k.startswith('products.')}
    

  3. Helper Function for Aggregated Metrics:

    def get_aggregated_metrics(results, list_field_name):
        '''Extract aggregated field metrics for a list field.'''
        cm_fields = results['confusion_matrix']['fields']
        prefix = f"{list_field_name}."
        return {k.replace(prefix, ''): v for k, v in cm_fields.items()
               if k.startswith(prefix)}
    
    # Usage:
    product_metrics = get_aggregated_metrics(results, 'products')
    print(f"Product name precision: {product_metrics['name']['derived']['cm_precision']}")
    

Note
  • Use results['fields'][field]['items'] for per-instance analysis
  • Use results['confusion_matrix']['fields'][field.subfield] for aggregated field analysis
  • Aggregated metrics provide rolled-up performance across all instances of a field type
  • Confusion matrix metrics use standard TP/FP/TN/FN/FD classification with derived metrics
Source code in stickler/structured_object_evaluator/evaluator.py
def evaluate(
    self,
    ground_truth: StructuredModel,
    predictions: StructuredModel,
    recall_with_fd: bool = False,
) -> Dict[str, Any]:
    """
    Evaluate predictions against ground truth and return comprehensive metrics.

    Args:
        ground_truth: Ground truth data (StructuredModel instance)
        predictions: Predicted data (StructuredModel instance)
        recall_with_fd: If True, include FD in recall denominator (TP/(TP+FN+FD))
                        If False, use traditional recall (TP/(TP+FN))

    Returns:
        Dictionary with the following structure:

        {
            "overall": {
                "precision": float,     # Overall precision [0-1]
                "recall": float,        # Overall recall [0-1]
                "f1": float,           # Overall F1 score [0-1]
                "accuracy": float,     # Overall accuracy [0-1]
                "anls_score": float    # Overall ANLS similarity score [0-1]
            },

            "fields": {
                "<field_name>": {
                    # For primitive fields (str, int, float, bool):
                    "precision": float,
                    "recall": float,
                    "f1": float,
                    "accuracy": float,
                    "anls_score": float
                },

                "<list_field_name>": {
                    # For list fields (e.g., products: List[Product]):
                    "overall": {
                        "precision": float,
                        "recall": float,
                        "f1": float,
                        "accuracy": float,
                        "anls_score": float
                    },
                    "items": [
                        # Individual metrics for each matched item pair
                        {
                            "overall": {...},  # Item-level overall metrics
                            "fields": {        # Field metrics within each item
                                "<nested_field>": {...}
                            }
                        }
                    ]
                }
            },

            "confusion_matrix": {
                "fields": {
                    # AGGREGATED metrics for all field types
                    "<field_name>": {
                        "tp": int,          # True positives
                        "fp": int,          # False positives
                        "tn": int,          # True negatives
                        "fn": int,          # False negatives
                        "fd": int,          # False discoveries (non-null but don't match)
                        "fa": int,          # False alarms
                        "derived": {
                            "cm_precision": float,
                            "cm_recall": float,
                            "cm_f1": float,
                            "cm_accuracy": float
                        }
                    },

                    # For list fields with nested objects, aggregated field metrics:
                    "<list_field>.<nested_field>": {
                        # Aggregated counts across ALL instances in the list
                        "tp": int,    # Total true positives for this field across all items
                        "fp": int,    # Total false positives for this field across all items
                        "fn": int,    # Total false negatives for this field across all items
                        "fd": int,    # Total false discoveries for this field across all items
                        "fa": int,    # Total false alarms for this field across all items
                        "derived": {...}
                    }
                },

                "overall": {
                    # Overall confusion matrix aggregating all fields
                    "tp": int, "fp": int, "tn": int, "fn": int, "fd": int, "fa": int
                    "derived": {...}
                }
            }
        }

    Key Usage Patterns:

    1. **Individual Item Metrics** (per-instance analysis):
       ```python
       # Access metrics for each individual item in a list
       for i, item_metrics in enumerate(results['fields']['products']['items']):
           print(f"Product {i}: {item_metrics['overall']['f1']}")
       ```

    2. **Aggregated Field Metrics** (recommended for field performance analysis):
       ```python
       # Access aggregated metrics across all instances of a field type
       cm_fields = results['confusion_matrix']['fields']
       product_id_performance = cm_fields['products.product_id']
       print(f"Product ID field: {product_id_performance['derived']['cm_precision']}")

       # Get all aggregated product field metrics
       product_fields = {k: v for k, v in cm_fields.items()
                       if k.startswith('products.')}
       ```

    3. **Helper Function for Aggregated Metrics**:
       ```python
       def get_aggregated_metrics(results, list_field_name):
           '''Extract aggregated field metrics for a list field.'''
           cm_fields = results['confusion_matrix']['fields']
           prefix = f"{list_field_name}."
           return {k.replace(prefix, ''): v for k, v in cm_fields.items()
                  if k.startswith(prefix)}

       # Usage:
       product_metrics = get_aggregated_metrics(results, 'products')
       print(f"Product name precision: {product_metrics['name']['derived']['cm_precision']}")
       ```

    Note:
        - Use `results['fields'][field]['items']` for per-instance analysis
        - Use `results['confusion_matrix']['fields'][field.subfield]` for aggregated field analysis
        - Aggregated metrics provide rolled-up performance across all instances of a field type
        - Confusion matrix metrics use standard TP/FP/TN/FN/FD classification with derived metrics
    """
    # Clear any existing non-match documents
    self.clear_non_match_documents()

    # Use StructuredModel's enhanced comparison with evaluator format
    # This pushes all the heavy lifting into the StructuredModel as requested
    result = ground_truth.compare_with(
        predictions,
        include_confusion_matrix=True,
        document_non_matches=self.document_non_matches,
        evaluator_format=True,  # This makes StructuredModel return evaluator-compatible format
        recall_with_fd=recall_with_fd,
    )

    # Add non-matches to evaluator's collection if they exist
    if result.get("non_matches"):
        for nm_dict in result["non_matches"]:
            # Convert enhanced non-match format to NonMatchField format
            converted_nm = self._convert_enhanced_non_match_to_field(nm_dict)
            self.non_match_documents.append(NonMatchField(**converted_nm))

    # Process derived metrics explicitly with recall_with_fd parameter
    if "confusion_matrix" in result and "overall" in result["confusion_matrix"]:
        overall_cm = result["confusion_matrix"]["overall"]

        # Update derived metrics directly in the result
        from stickler.structured_object_evaluator.models.metrics_helper import (
            MetricsHelper,
        )

        metrics_helper = MetricsHelper()

        # Apply correct recall_with_fd to overall metrics
        derived_metrics = metrics_helper.calculate_derived_metrics(
            overall_cm, recall_with_fd=recall_with_fd
        )
        result["confusion_matrix"]["overall"]["derived"] = derived_metrics

        # Copy these to the top-level metrics if needed
        if "overall" in result:
            result["overall"]["precision"] = derived_metrics["cm_precision"]
            result["overall"]["recall"] = derived_metrics["cm_recall"]
            result["overall"]["f1"] = derived_metrics["cm_f1"]

    return result
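
After evaluate() returns, the documented non-matches are also kept on the evaluator itself when document_non_matches=True; a short sketch, assuming gt_model and pred_model are instances of the same StructuredModel subclass (attribute names follow the NonMatchField constructor arguments):

```python
results = evaluator.evaluate(gt_model, pred_model, recall_with_fd=False)

# Confusion-matrix-derived overall metrics
print(results["confusion_matrix"]["overall"]["derived"]["cm_f1"])

# Non-matches collected during this evaluation
for nm in evaluator.non_match_documents:
    print(nm.field_path, nm.non_match_type, nm.similarity_score)
```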

stickler.structured_object_evaluator.bulk_structured_model_evaluator

Stateful Bulk Evaluator for StructuredModel objects.

This module provides a modern stateful bulk evaluator inspired by PyTorch Lightning's stateful metrics and scikit-learn's incremental learning patterns. It supports memory-efficient processing of large datasets through accumulation-based evaluation.

stickler.structured_object_evaluator.bulk_structured_model_evaluator.BulkStructuredModelEvaluator

Stateful bulk evaluator for StructuredModel objects.

Inspired by PyTorch Lightning's stateful metrics and scikit-learn's incremental learning patterns. This evaluator accumulates evaluation state across multiple document processing calls, enabling memory-efficient evaluation of arbitrarily large datasets without loading everything into memory at once.

Key Features:

  • Stateful accumulation (like PyTorch Lightning metrics)
  • Memory-efficient streaming processing (like scikit-learn partial_fit)
  • External control over data flow and error handling
  • Checkpointing and recovery capabilities
  • Distributed processing support via state merging
  • Uses StructuredModel.compare_with() method directly
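
The intended streaming pattern is sketched below; `Invoice` stands in for any StructuredModel subclass and `document_stream` for any iterable of (doc_id, ground truth, prediction) triples, both placeholders:

```python
from stickler.structured_object_evaluator.bulk_structured_model_evaluator import (
    BulkStructuredModelEvaluator,
)

bulk = BulkStructuredModelEvaluator(target_schema=Invoice, verbose=True)

bulk.reset()  # start a fresh accumulation (also done by __init__)
for doc_id, gt_model, pred_model in document_stream:
    bulk.update(gt_model, pred_model, doc_id=doc_id)  # accumulate one pair

progress = bulk.get_current_metrics()  # snapshot without clearing state
final = bulk.compute()                 # final ProcessEvaluation
```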

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
class BulkStructuredModelEvaluator:
    """
    Stateful bulk evaluator for StructuredModel objects.

    Inspired by PyTorch Lightning's stateful metrics and scikit-learn's incremental
    learning patterns. This evaluator accumulates evaluation state across multiple
    document processing calls, enabling memory-efficient evaluation of arbitrarily
    large datasets without loading everything into memory at once.

    Key Features:
    - Stateful accumulation (like PyTorch Lightning metrics)
    - Memory-efficient streaming processing (like scikit-learn partial_fit)
    - External control over data flow and error handling
    - Checkpointing and recovery capabilities
    - Distributed processing support via state merging
    - Uses StructuredModel.compare_with() method directly
    """

    def __init__(
        self,
        target_schema: Type[StructuredModel],
        verbose: bool = False,
        document_non_matches: bool = True,
        elide_errors: bool = False,
        individual_results_jsonl: Optional[str] = None,
    ):
        """
        Initialize the stateful bulk evaluator.

        Args:
            target_schema: StructuredModel class for validation and processing
            verbose: Whether to print detailed progress information
            document_non_matches: Whether to document detailed non-match information
            elide_errors: If True, skip documents with errors; if False, accumulate error metrics
            individual_results_jsonl: Optional path to JSONL file for appending individual comparison results
        """
        self.target_schema = target_schema
        self.verbose = verbose
        self.document_non_matches = document_non_matches
        self.elide_errors = elide_errors
        self.individual_results_jsonl = individual_results_jsonl

        # Initialize state
        self.reset()

        if self.verbose:
            print(
                f"Initialized BulkStructuredModelEvaluator for {target_schema.__name__}"
            )
            if self.individual_results_jsonl:
                print(
                    f"Individual results will be appended to: {self.individual_results_jsonl}"
                )

    def reset(self) -> None:
        """
        Clear all accumulated state and start fresh evaluation.

        This method resets all internal counters, metrics, and error tracking
        to initial state, enabling reuse of the same evaluator instance for
        multiple evaluation runs.
        """
        # Accumulated confusion matrix state using nested defaultdicts
        self._confusion_matrix = {
            "overall": defaultdict(int),
            "fields": defaultdict(lambda: defaultdict(int)),
        }

        # Non-match tracking (when document_non_matches=True)
        self._non_matches = []

        # Error tracking
        self._errors = []

        # Processing statistics
        self._processed_count = 0
        self._start_time = time.time()

        if self.verbose:
            print("Reset evaluator state")

    def update(
        self,
        gt_model: StructuredModel,
        pred_model: StructuredModel,
        doc_id: Optional[str] = None,
    ) -> None:
        """
        Process a single document pair and accumulate the results in internal state.

        This is the core method for stateful evaluation, inspired by PyTorch Lightning's
        training_step pattern. Each call processes one document pair and updates
        the internal confusion matrix counters.

        Args:
            gt_model: Ground truth StructuredModel instance
            pred_model: Predicted StructuredModel instance
            doc_id: Optional document identifier for error tracking
        """
        if doc_id is None:
            doc_id = f"doc_{self._processed_count}"

        try:
            # Use compare_with method directly on the StructuredModel
            # Pass document_non_matches to achieve parity with compare_with method
            comparison_result = gt_model.compare_with(
                pred_model,
                include_confusion_matrix=True,
                document_non_matches=self.document_non_matches,
            )

            # Collect non-matches if enabled
            if self.document_non_matches and "non_matches" in comparison_result:
                # Add doc_id to each non-match for bulk tracking
                for non_match in comparison_result["non_matches"]:
                    non_match_with_doc = non_match.copy()
                    non_match_with_doc["doc_id"] = doc_id
                    self._non_matches.append(non_match_with_doc)

            # Simple JSONL append of raw comparison result (before any processing)
            if self.individual_results_jsonl:
                record = {"doc_id": doc_id, "comparison_result": comparison_result}
                with open(self.individual_results_jsonl, "a", encoding="utf-8") as f:
                    f.write(json.dumps(record) + "\n")

            # Accumulate the results into our state (this flattens for aggregation)
            self._accumulate_confusion_matrix(comparison_result["confusion_matrix"])

            self._processed_count += 1

            if self.verbose and self._processed_count % 1000 == 0:
                elapsed = time.time() - self._start_time
                print(f"Processed {self._processed_count} documents ({elapsed:.2f}s)")

        except Exception as e:
            error_record = {
                "doc_id": doc_id,
                "error": str(e),
                "error_type": type(e).__name__,
            }

            if not self.elide_errors:
                self._errors.append(error_record)

                # For errors, add a "failed" classification to overall metrics
                # This represents complete failure to process the document
                self._confusion_matrix["overall"]["fn"] += 1

            if self.verbose:
                print(f"Error processing document {doc_id}: {str(e)}")

    def update_batch(
        self, batch_data: List[Tuple[StructuredModel, StructuredModel, Optional[str]]]
    ) -> None:
        """
        Process multiple document pairs efficiently in a batch.

        This method provides efficient batch processing by calling update()
        multiple times with optional garbage collection for memory management.

        Args:
            batch_data: List of tuples containing (gt_model, pred_model, doc_id)
        """
        batch_start = self._processed_count

        for gt_model, pred_model, doc_id in batch_data:
            self.update(gt_model, pred_model, doc_id)

        # Garbage collection for large batches
        if len(batch_data) >= 1000:
            gc.collect()

        if self.verbose:
            batch_size = self._processed_count - batch_start
            print(f"Processed batch of {batch_size} documents")

    def get_current_metrics(self) -> ProcessEvaluation:
        """
        Get current accumulated metrics without clearing state.

        This method allows monitoring evaluation progress by returning current
        metrics computed from accumulated state. Unlike compute(), this does
        not clear the internal state.

        Returns:
            ProcessEvaluation with current accumulated metrics
        """
        return self._build_process_evaluation()

    def compute(self) -> ProcessEvaluation:
        """
        Calculate final aggregated metrics from accumulated state.

        This method performs the final computation of all derived metrics from
        the accumulated confusion matrix state, similar to PyTorch Lightning's
        training_epoch_end pattern.

        Returns:
            ProcessEvaluation with final aggregated metrics
        """
        result = self._build_process_evaluation()

        if self.verbose:
            total_time = time.time() - self._start_time
            print(
                f"Final computation completed: {self._processed_count} documents in {total_time:.2f}s"
            )
            print(f"Overall accuracy: {result.metrics.get('cm_accuracy', 0.0):.3f}")

        return result

    def _accumulate_confusion_matrix(self, cm_result: Dict[str, Any]) -> None:
        """
        Accumulate confusion matrix results from a single document evaluation.

        This method handles the core accumulation logic, properly aggregating
        both overall metrics and field-level metrics while maintaining correct
        nested field paths.

        Args:
            cm_result: Confusion matrix result from compare_with method
        """
        # Accumulate overall metrics
        if "overall" in cm_result:
            for metric_name, value in cm_result["overall"].items():
                if isinstance(value, (int, float)) and metric_name in [
                    "tp",
                    "fp",
                    "tn",
                    "fn",
                    "fd",
                    "fa",
                ]:
                    self._confusion_matrix["overall"][metric_name] += value

        # Accumulate field-level metrics with proper path handling
        if "fields" in cm_result:
            self._accumulate_field_metrics(cm_result["fields"], "")

    def _accumulate_field_metrics(
        self, fields_dict: Dict[str, Any], path_prefix: str
    ) -> None:
        """
        Recursively accumulate field-level metrics with proper nested path construction.

        This method fixes the nested field aggregation bugs from the original implementation
        by properly handling different field structure formats and maintaining correct
        dotted notation paths for nested fields.

        Args:
            fields_dict: Dictionary containing field metrics to accumulate
            path_prefix: Current path prefix for building nested field paths
        """
        for field_name, field_data in fields_dict.items():
            current_path = f"{path_prefix}.{field_name}" if path_prefix else field_name

            if not isinstance(field_data, dict):
                continue

            # Handle field with direct confusion matrix metrics (simple leaf field)
            direct_metrics = {
                k: v
                for k, v in field_data.items()
                if k in ["tp", "fp", "tn", "fn", "fd", "fa"]
                and isinstance(v, (int, float))
            }
            if direct_metrics:
                self._accumulate_single_field_metrics(current_path, direct_metrics)

            # Handle hierarchical field structure (object fields with overall + fields)
            if "overall" in field_data:
                # Accumulate the overall metrics for this field
                self._accumulate_single_field_metrics(
                    current_path, field_data["overall"]
                )

            # Handle nested fields - check if there's a "fields" structure
            if "fields" in field_data and isinstance(field_data["fields"], dict):
                # For each nested field, create the proper dotted path
                for nested_field_name, nested_field_data in field_data[
                    "fields"
                ].items():
                    nested_path = f"{current_path}.{nested_field_name}"

                    if isinstance(nested_field_data, dict):
                        # If nested field has "overall", use those metrics
                        if "overall" in nested_field_data:
                            self._accumulate_single_field_metrics(
                                nested_path, nested_field_data["overall"]
                            )
                        else:
                            # Otherwise, look for direct metrics
                            nested_metrics = {
                                k: v
                                for k, v in nested_field_data.items()
                                if k in ["tp", "fp", "tn", "fn", "fd", "fa"]
                                and isinstance(v, (int, float))
                            }
                            if nested_metrics:
                                self._accumulate_single_field_metrics(
                                    nested_path, nested_metrics
                                )

                        # Continue recursion if there are more nested fields
                        if "fields" in nested_field_data:
                            self._accumulate_field_metrics(
                                nested_field_data["fields"], nested_path
                            )

            # Handle list field structure with nested_fields
            elif "nested_fields" in field_data:
                # Accumulate list-level metrics
                list_metrics = {
                    k: v
                    for k, v in field_data.items()
                    if k in ["tp", "fp", "tn", "fn", "fd", "fa"]
                    and isinstance(v, (int, float))
                }
                if list_metrics:
                    self._accumulate_single_field_metrics(current_path, list_metrics)

                # Accumulate nested field metrics from the list items
                for nested_field_name, nested_metrics in field_data[
                    "nested_fields"
                ].items():
                    nested_path = f"{current_path}.{nested_field_name}"
                    self._accumulate_single_field_metrics(nested_path, nested_metrics)

    def _accumulate_single_field_metrics(
        self, field_path: str, metrics: Dict[str, Union[int, float]]
    ) -> None:
        """
        Accumulate metrics for a single field path.

        Args:
            field_path: Dotted path to the field (e.g., 'transactions.date')
            metrics: Dictionary of confusion matrix metrics to accumulate
        """
        for metric_name, value in metrics.items():
            if metric_name in ["tp", "fp", "tn", "fn", "fd", "fa"] and isinstance(
                value, (int, float)
            ):
                self._confusion_matrix["fields"][field_path][metric_name] += value

    def _calculate_derived_metrics(
        self, cm_dict: Dict[str, Union[int, float]]
    ) -> Dict[str, float]:
        """
        Calculate derived confusion matrix metrics (precision, recall, f1, accuracy).

        This method replicates the derivation logic that was previously handled
        by StructuredModelEvaluator.

        Args:
            cm_dict: Dictionary with basic confusion matrix counts

        Returns:
            Dictionary with derived metrics
        """
        tp = cm_dict.get("tp", 0)
        fp = cm_dict.get("fp", 0)
        tn = cm_dict.get("tn", 0)
        fn = cm_dict.get("fn", 0)

        # Calculate derived metrics with safe division
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0.0

        return {
            "cm_precision": precision,
            "cm_recall": recall,
            "cm_f1": f1,
            "cm_accuracy": accuracy,
        }

    def _build_process_evaluation(self) -> ProcessEvaluation:
        """
        Build ProcessEvaluation from current accumulated state.

        Returns:
            ProcessEvaluation with computed metrics from accumulated state
        """
        # Calculate derived metrics for overall results
        overall_cm = dict(self._confusion_matrix["overall"])
        overall_derived = self._calculate_derived_metrics(overall_cm)
        overall_metrics = {**overall_cm, **overall_derived}

        # Calculate derived metrics for each field
        field_metrics = {}
        for field_path, field_cm in self._confusion_matrix["fields"].items():
            field_cm_dict = dict(field_cm)
            field_derived = self._calculate_derived_metrics(field_cm_dict)
            field_metrics[field_path] = {**field_cm_dict, **field_derived}

        total_time = time.time() - self._start_time

        return ProcessEvaluation(
            document_count=self._processed_count,
            metrics=overall_metrics,
            field_metrics=field_metrics,
            errors=list(self._errors),  # Copy to avoid external modification
            total_time=total_time,
            non_matches=list(self._non_matches) if self.document_non_matches else None,
        )

    def save_metrics(self, filepath: str) -> None:
        """
        Save current accumulated metrics to a JSON file.

        Args:
            filepath: Path where metrics will be saved as JSON
        """
        process_eval = self._build_process_evaluation()

        # Build comprehensive metrics dictionary
        metrics_data = {
            "overall_metrics": process_eval.metrics,
            "field_metrics": process_eval.field_metrics,
            "evaluation_summary": {
                "total_documents_processed": self._processed_count,
                "total_evaluation_time": process_eval.total_time,
                "documents_per_second": self._processed_count / process_eval.total_time
                if process_eval.total_time > 0
                else 0,
                "error_count": len(process_eval.errors),
                "error_rate": len(process_eval.errors) / self._processed_count
                if self._processed_count > 0
                else 0,
                "target_schema": self.target_schema.__name__,
            },
            "errors": process_eval.errors,
            "metadata": {
                "saved_at": time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime()),
                "evaluator_config": {
                    "verbose": self.verbose,
                    "document_non_matches": self.document_non_matches,
                    "elide_errors": self.elide_errors,
                    "individual_results_jsonl": self.individual_results_jsonl,
                },
            },
        }

        # Ensure directory exists
        import os

        os.makedirs(os.path.dirname(os.path.abspath(filepath)), exist_ok=True)

        # Write to file
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(metrics_data, f, indent=2, default=str)

        if self.verbose:
            print(f"Metrics saved to: {filepath}")

    def pretty_print_metrics(self) -> None:
        """
        Pretty print current accumulated metrics in a format similar to StructuredModel.

        Displays overall metrics, field-level metrics, and evaluation summary
        in a human-readable format.
        """
        process_eval = self._build_process_evaluation()

        # Header
        print("\n" + "=" * 80)
        print(f"BULK EVALUATION RESULTS - {self.target_schema.__name__}")
        print("=" * 80)

        # Overall metrics
        overall_metrics = process_eval.metrics
        print("\nOVERALL METRICS:")
        print("-" * 40)
        print(f"Documents Processed: {self._processed_count:,}")
        print(f"Evaluation Time: {process_eval.total_time:.2f}s")
        print(
            f"Processing Rate: {self._processed_count / process_eval.total_time:.1f} docs/sec"
            if process_eval.total_time > 0
            else "Processing Rate: N/A"
        )

        # Confusion matrix
        print("\nCONFUSION MATRIX:")
        print(f"  True Positives (TP):    {overall_metrics.get('tp', 0):,}")
        print(f"  False Positives (FP):   {overall_metrics.get('fp', 0):,}")
        print(f"  True Negatives (TN):    {overall_metrics.get('tn', 0):,}")
        print(f"  False Negatives (FN):   {overall_metrics.get('fn', 0):,}")
        print(f"  False Discovery (FD):   {overall_metrics.get('fd', 0):,}")
        print(f"  False Alarm (FA):   {overall_metrics.get('fa', 0):,}")

        # Derived metrics
        print("\nDERIVED METRICS:")
        print(f"  Precision:     {overall_metrics.get('cm_precision', 0.0):.4f}")
        print(f"  Recall:        {overall_metrics.get('cm_recall', 0.0):.4f}")
        print(f"  F1 Score:      {overall_metrics.get('cm_f1', 0.0):.4f}")
        print(f"  Accuracy:      {overall_metrics.get('cm_accuracy', 0.0):.4f}")

        # Field-level metrics
        if process_eval.field_metrics:
            print("\nFIELD-LEVEL METRICS:")
            print("-" * 40)

            # Sort fields by F1 score descending for better readability
            sorted_fields = sorted(
                process_eval.field_metrics.items(),
                key=lambda x: x[1].get("cm_f1", 0.0),
                reverse=True,
            )

            for field_path, field_metrics in sorted_fields:
                tp = field_metrics.get("tp", 0)
                fp = field_metrics.get("fp", 0)
                fn = field_metrics.get("fn", 0)
                precision = field_metrics.get("cm_precision", 0.0)
                recall = field_metrics.get("cm_recall", 0.0)
                f1 = field_metrics.get("cm_f1", 0.0)

                # Only show fields with some activity
                if tp + fp + fn > 0:
                    print(
                        f"  {field_path:30} P: {precision:.3f} | R: {recall:.3f} | F1: {f1:.3f} | TP: {tp:,} | FP: {fp:,} | FN: {fn:,}"
                    )

        # Error summary
        if process_eval.errors:
            print("\nERROR SUMMARY:")
            print("-" * 40)
            print(f"Total Errors: {len(process_eval.errors):,}")
            print(
                f"Error Rate: {len(process_eval.errors) / self._processed_count * 100:.2f}%"
                if self._processed_count > 0
                else "Error Rate: N/A"
            )

            # Group errors by type
            error_types = {}
            for error in process_eval.errors:
                error_type = error.get("error_type", "Unknown")
                error_types[error_type] = error_types.get(error_type, 0) + 1

            if error_types:
                print("Error Types:")
                for error_type, count in sorted(
                    error_types.items(), key=lambda x: x[1], reverse=True
                ):
                    print(f"  {error_type}: {count:,}")

        # Configuration info
        print("\nCONFIGURATION:")
        print("-" * 40)
        print(f"Target Schema: {self.target_schema.__name__}")
        print(f"Document Non-matches: {'Yes' if self.document_non_matches else 'No'}")
        print(f"Elide Errors: {'Yes' if self.elide_errors else 'No'}")
        if self.individual_results_jsonl:
            print(f"Individual Results JSONL: {self.individual_results_jsonl}")

        print("=" * 80)

    def get_state(self) -> Dict[str, Any]:
        """
        Get serializable state for checkpointing and recovery.

        Returns a dictionary containing all internal state that can be serialized
        and later restored using load_state(). This enables checkpointing for
        long-running evaluation jobs.

        Returns:
            Dictionary containing serializable evaluator state
        """
        return {
            "confusion_matrix": {
                "overall": dict(self._confusion_matrix["overall"]),
                "fields": {
                    path: dict(metrics)
                    for path, metrics in self._confusion_matrix["fields"].items()
                },
            },
            "errors": list(self._errors),
            "processed_count": self._processed_count,
            "start_time": self._start_time,
            # Configuration
            "target_schema": self.target_schema.__name__,
            "elide_errors": self.elide_errors,
        }

    def load_state(self, state: Dict[str, Any]) -> None:
        """
        Restore evaluator state from serialized data.

        This method restores the internal state from data previously saved
        with get_state(), enabling recovery from checkpoints.

        Args:
            state: State dictionary from get_state()
        """
        # Validate state compatibility
        if state.get("target_schema") != self.target_schema.__name__:
            raise ValueError(
                f"State schema {state.get('target_schema')} doesn't match evaluator schema {self.target_schema.__name__}"
            )

        # Restore confusion matrix state
        cm_state = state["confusion_matrix"]
        self._confusion_matrix = {
            "overall": defaultdict(int, cm_state["overall"]),
            "fields": defaultdict(lambda: defaultdict(int)),
        }

        for field_path, field_metrics in cm_state["fields"].items():
            self._confusion_matrix["fields"][field_path] = defaultdict(
                int, field_metrics
            )

        # Restore other state
        self._errors = list(state["errors"])
        self._processed_count = state["processed_count"]
        self._start_time = state["start_time"]

        if self.verbose:
            print(f"Loaded state: {self._processed_count} documents processed")

    def merge_state(self, other_state: Dict[str, Any]) -> None:
        """
        Merge results from another evaluator instance.

        This method enables distributed processing by merging confusion matrix
        counts from multiple evaluator instances that processed different
        portions of a dataset.

        Args:
            other_state: State dictionary from another evaluator instance
        """
        # Validate compatibility
        if other_state.get("target_schema") != self.target_schema.__name__:
            raise ValueError(
                f"Cannot merge incompatible schemas: {other_state.get('target_schema')} vs {self.target_schema.__name__}"
            )

        # Merge overall metrics
        other_cm = other_state["confusion_matrix"]
        for metric, value in other_cm["overall"].items():
            self._confusion_matrix["overall"][metric] += value

        # Merge field-level metrics
        for field_path, field_metrics in other_cm["fields"].items():
            for metric, value in field_metrics.items():
                self._confusion_matrix["fields"][field_path][metric] += value

        # Merge errors and counts
        self._errors.extend(other_state["errors"])
        self._processed_count += other_state["processed_count"]

        if self.verbose:
            print(
                f"Merged state: now {self._processed_count} total documents processed"
            )

    # Legacy compatibility methods

    def evaluate_dataframe(self, df) -> ProcessEvaluation:
        """
        Legacy compatibility method for DataFrame-based evaluation.

        This method provides backward compatibility with the original DataFrame-based
        API while leveraging the new stateful processing internally.

        Args:
            df: DataFrame with columns for ground truth and predictions

        Returns:
            ProcessEvaluation with aggregated results
        """
        # Reset state for clean evaluation
        self.reset()

        # Process each row
        for idx, row in df.iterrows():
            doc_id = row.get("doc_id", f"row_{idx}")

            try:
                # Parse JSON data
                gt_data = json.loads(row["expected"])
                pred_data = json.loads(row["predicted"])

                # Create StructuredModel instances
                gt_model = self.target_schema(**gt_data)
                pred_model = self.target_schema(**pred_data)

                # Process using stateful update
                self.update(gt_model, pred_model, doc_id)

            except Exception as e:
                if self.verbose:
                    print(f"Error processing row {idx}: {e}")
                continue

        return self.compute()

__init__(target_schema, verbose=False, document_non_matches=True, elide_errors=False, individual_results_jsonl=None)

Initialize the stateful bulk evaluator.

Parameters:

    target_schema (Type[StructuredModel], required):
        StructuredModel class for validation and processing
    verbose (bool, default False):
        Whether to print detailed progress information
    document_non_matches (bool, default True):
        Whether to document detailed non-match information
    elide_errors (bool, default False):
        If True, skip documents with errors; if False, accumulate error metrics
    individual_results_jsonl (Optional[str], default None):
        Optional path to JSONL file for appending individual comparison results

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def __init__(
    self,
    target_schema: Type[StructuredModel],
    verbose: bool = False,
    document_non_matches: bool = True,
    elide_errors: bool = False,
    individual_results_jsonl: Optional[str] = None,
):
    """
    Initialize the stateful bulk evaluator.

    Args:
        target_schema: StructuredModel class for validation and processing
        verbose: Whether to print detailed progress information
        document_non_matches: Whether to document detailed non-match information
        elide_errors: If True, skip documents with errors; if False, accumulate error metrics
        individual_results_jsonl: Optional path to JSONL file for appending individual comparison results
    """
    self.target_schema = target_schema
    self.verbose = verbose
    self.document_non_matches = document_non_matches
    self.elide_errors = elide_errors
    self.individual_results_jsonl = individual_results_jsonl

    # Initialize state
    self.reset()

    if self.verbose:
        print(
            f"Initialized BulkStructuredModelEvaluator for {target_schema.__name__}"
        )
        if self.individual_results_jsonl:
            print(
                f"Individual results will be appended to: {self.individual_results_jsonl}"
            )
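
As a quick orientation, here is a minimal construction sketch. The import path, the Invoice schema, and the output filename are assumptions made for illustration and are not part of this API reference.

# Construction sketch -- import path, Invoice schema, and filename are assumed.
from stickler.structured_object_evaluator.bulk_structured_model_evaluator import (
    BulkStructuredModelEvaluator,
)

from my_project.schemas import Invoice  # hypothetical StructuredModel subclass

evaluator = BulkStructuredModelEvaluator(
    target_schema=Invoice,
    verbose=True,                              # progress message every 1000 documents
    document_non_matches=True,                 # collect detailed non-match records
    elide_errors=False,                        # failed documents are recorded and counted as FN
    individual_results_jsonl="results.jsonl",  # append one raw comparison result per document
)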

compute()

Calculate final aggregated metrics from accumulated state.

This method performs the final computation of all derived metrics from the accumulated confusion matrix state, similar to PyTorch Lightning's training_epoch_end pattern.

Returns:

    ProcessEvaluation:
        ProcessEvaluation with final aggregated metrics

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def compute(self) -> ProcessEvaluation:
    """
    Calculate final aggregated metrics from accumulated state.

    This method performs the final computation of all derived metrics from
    the accumulated confusion matrix state, similar to PyTorch Lightning's
    training_epoch_end pattern.

    Returns:
        ProcessEvaluation with final aggregated metrics
    """
    result = self._build_process_evaluation()

    if self.verbose:
        total_time = time.time() - self._start_time
        print(
            f"Final computation completed: {self._processed_count} documents in {total_time:.2f}s"
        )
        print(f"Overall accuracy: {result.metrics.get('cm_accuracy', 0.0):.3f}")

    return result
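
A usage sketch for compute(), continuing the evaluator from the construction example above; pairs is a hypothetical iterable of (ground truth, prediction, id) tuples, and the field path "total" is illustrative.

# Finalize after all update() calls; `pairs` is a hypothetical iterable.
for gt_model, pred_model, doc_id in pairs:
    evaluator.update(gt_model, pred_model, doc_id)

result = evaluator.compute()                   # ProcessEvaluation
print(result.document_count)
print(result.metrics["cm_f1"])                 # derived overall F1
print(result.field_metrics.get("total", {}))   # counts + derived metrics for one field path (name assumed)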

evaluate_dataframe(df)

Legacy compatibility method for DataFrame-based evaluation.

This method provides backward compatibility with the original DataFrame-based API while leveraging the new stateful processing internally.

Parameters:

    df (required):
        DataFrame with columns for ground truth and predictions

Returns:

    ProcessEvaluation:
        ProcessEvaluation with aggregated results

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def evaluate_dataframe(self, df) -> ProcessEvaluation:
    """
    Legacy compatibility method for DataFrame-based evaluation.

    This method provides backward compatibility with the original DataFrame-based
    API while leveraging the new stateful processing internally.

    Args:
        df: DataFrame with columns for ground truth and predictions

    Returns:
        ProcessEvaluation with aggregated results
    """
    # Reset state for clean evaluation
    self.reset()

    # Process each row
    for idx, row in df.iterrows():
        doc_id = row.get("doc_id", f"row_{idx}")

        try:
            # Parse JSON data
            gt_data = json.loads(row["expected"])
            pred_data = json.loads(row["predicted"])

            # Create StructuredModel instances
            gt_model = self.target_schema(**gt_data)
            pred_model = self.target_schema(**pred_data)

            # Process using stateful update
            self.update(gt_model, pred_model, doc_id)

        except Exception as e:
            if self.verbose:
                print(f"Error processing row {idx}: {e}")
            continue

    return self.compute()
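
A sketch of the DataFrame contract, assuming pandas and the hypothetical Invoice schema from earlier: each row carries JSON strings in the "expected" and "predicted" columns plus an optional "doc_id".

import json

import pandas as pd  # evaluate_dataframe iterates rows with DataFrame.iterrows()

# Field names inside the JSON are illustrative and must match the target schema.
df = pd.DataFrame(
    {
        "doc_id": ["inv-001", "inv-002"],
        "expected": [
            json.dumps({"invoice_number": "1", "total": 10.0}),
            json.dumps({"invoice_number": "2", "total": 20.0}),
        ],
        "predicted": [
            json.dumps({"invoice_number": "1", "total": 10.0}),
            json.dumps({"invoice_number": "2", "total": 25.0}),
        ],
    }
)

result = evaluator.evaluate_dataframe(df)  # resets state, then processes each row
print(result.metrics.get("cm_accuracy", 0.0))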

get_current_metrics()

Get current accumulated metrics without clearing state.

This method allows monitoring evaluation progress by returning current metrics computed from accumulated state. Unlike compute(), this does not clear the internal state.

Returns:

    ProcessEvaluation:
        ProcessEvaluation with current accumulated metrics

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def get_current_metrics(self) -> ProcessEvaluation:
    """
    Get current accumulated metrics without clearing state.

    This method allows monitoring evaluation progress by returning current
    metrics computed from accumulated state. Unlike compute(), this does
    not clear the internal state.

    Returns:
        ProcessEvaluation with current accumulated metrics
    """
    return self._build_process_evaluation()
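
A monitoring sketch: peek at the accumulated metrics mid-run without resetting anything (pairs is again a hypothetical iterable of document tuples).

for i, (gt_model, pred_model, doc_id) in enumerate(pairs):
    evaluator.update(gt_model, pred_model, doc_id)
    if (i + 1) % 5000 == 0:
        snapshot = evaluator.get_current_metrics()   # reads state, does not clear it
        print(f"{snapshot.document_count} docs so far, "
              f"F1 {snapshot.metrics.get('cm_f1', 0.0):.3f}")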

get_state()

Get serializable state for checkpointing and recovery.

Returns a dictionary containing all internal state that can be serialized and later restored using load_state(). This enables checkpointing for long-running evaluation jobs.

Returns:

    Dict[str, Any]:
        Dictionary containing serializable evaluator state

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def get_state(self) -> Dict[str, Any]:
    """
    Get serializable state for checkpointing and recovery.

    Returns a dictionary containing all internal state that can be serialized
    and later restored using load_state(). This enables checkpointing for
    long-running evaluation jobs.

    Returns:
        Dictionary containing serializable evaluator state
    """
    return {
        "confusion_matrix": {
            "overall": dict(self._confusion_matrix["overall"]),
            "fields": {
                path: dict(metrics)
                for path, metrics in self._confusion_matrix["fields"].items()
            },
        },
        "errors": list(self._errors),
        "processed_count": self._processed_count,
        "start_time": self._start_time,
        # Configuration
        "target_schema": self.target_schema.__name__,
        "elide_errors": self.elide_errors,
    }
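
A checkpointing sketch: the returned state contains only plain dicts, lists, numbers, and strings, so it can be written straight to JSON (the checkpoint path below is arbitrary).

import json

state = evaluator.get_state()
with open("evaluator_checkpoint.json", "w", encoding="utf-8") as f:
    json.dump(state, f)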

load_state(state)

Restore evaluator state from serialized data.

This method restores the internal state from data previously saved with get_state(), enabling recovery from checkpoints.

Parameters:

    state (Dict[str, Any], required):
        State dictionary from get_state()

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def load_state(self, state: Dict[str, Any]) -> None:
    """
    Restore evaluator state from serialized data.

    This method restores the internal state from data previously saved
    with get_state(), enabling recovery from checkpoints.

    Args:
        state: State dictionary from get_state()
    """
    # Validate state compatibility
    if state.get("target_schema") != self.target_schema.__name__:
        raise ValueError(
            f"State schema {state.get('target_schema')} doesn't match evaluator schema {self.target_schema.__name__}"
        )

    # Restore confusion matrix state
    cm_state = state["confusion_matrix"]
    self._confusion_matrix = {
        "overall": defaultdict(int, cm_state["overall"]),
        "fields": defaultdict(lambda: defaultdict(int)),
    }

    for field_path, field_metrics in cm_state["fields"].items():
        self._confusion_matrix["fields"][field_path] = defaultdict(
            int, field_metrics
        )

    # Restore other state
    self._errors = list(state["errors"])
    self._processed_count = state["processed_count"]
    self._start_time = state["start_time"]

    if self.verbose:
        print(f"Loaded state: {self._processed_count} documents processed")

merge_state(other_state)

Merge results from another evaluator instance.

This method enables distributed processing by merging confusion matrix counts from multiple evaluator instances that processed different portions of a dataset.

Parameters:

    other_state (Dict[str, Any], required):
        State dictionary from another evaluator instance

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def merge_state(self, other_state: Dict[str, Any]) -> None:
    """
    Merge results from another evaluator instance.

    This method enables distributed processing by merging confusion matrix
    counts from multiple evaluator instances that processed different
    portions of a dataset.

    Args:
        other_state: State dictionary from another evaluator instance
    """
    # Validate compatibility
    if other_state.get("target_schema") != self.target_schema.__name__:
        raise ValueError(
            f"Cannot merge incompatible schemas: {other_state.get('target_schema')} vs {self.target_schema.__name__}"
        )

    # Merge overall metrics
    other_cm = other_state["confusion_matrix"]
    for metric, value in other_cm["overall"].items():
        self._confusion_matrix["overall"][metric] += value

    # Merge field-level metrics
    for field_path, field_metrics in other_cm["fields"].items():
        for metric, value in field_metrics.items():
            self._confusion_matrix["fields"][field_path][metric] += value

    # Merge errors and counts
    self._errors.extend(other_state["errors"])
    self._processed_count += other_state["processed_count"]

    if self.verbose:
        print(
            f"Merged state: now {self._processed_count} total documents processed"
        )
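
A distributed-processing sketch, assuming a hypothetical shards list that partitions the dataset: each worker evaluates its own shard, then one instance absorbs the others' states. Note that merge_state() combines confusion-matrix counts, errors, and the processed-document count; non-match records collected by the other instances are not transferred.

workers = [BulkStructuredModelEvaluator(target_schema=Invoice) for _ in range(4)]

for worker, shard in zip(workers, shards):   # `shards`: hypothetical list of (gt, pred, id) lists
    for gt_model, pred_model, doc_id in shard:
        worker.update(gt_model, pred_model, doc_id)

main = workers[0]
for other in workers[1:]:
    main.merge_state(other.get_state())

print(main.compute().metrics.get("cm_f1", 0.0))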

pretty_print_metrics()

Pretty print current accumulated metrics in a format similar to StructuredModel.

Displays overall metrics, field-level metrics, and evaluation summary in a human-readable format.

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def pretty_print_metrics(self) -> None:
    """
    Pretty print current accumulated metrics in a format similar to StructuredModel.

    Displays overall metrics, field-level metrics, and evaluation summary
    in a human-readable format.
    """
    process_eval = self._build_process_evaluation()

    # Header
    print("\n" + "=" * 80)
    print(f"BULK EVALUATION RESULTS - {self.target_schema.__name__}")
    print("=" * 80)

    # Overall metrics
    overall_metrics = process_eval.metrics
    print("\nOVERALL METRICS:")
    print("-" * 40)
    print(f"Documents Processed: {self._processed_count:,}")
    print(f"Evaluation Time: {process_eval.total_time:.2f}s")
    print(
        f"Processing Rate: {self._processed_count / process_eval.total_time:.1f} docs/sec"
        if process_eval.total_time > 0
        else "Processing Rate: N/A"
    )

    # Confusion matrix
    print("\nCONFUSION MATRIX:")
    print(f"  True Positives (TP):    {overall_metrics.get('tp', 0):,}")
    print(f"  False Positives (FP):   {overall_metrics.get('fp', 0):,}")
    print(f"  True Negatives (TN):    {overall_metrics.get('tn', 0):,}")
    print(f"  False Negatives (FN):   {overall_metrics.get('fn', 0):,}")
    print(f"  False Discovery (FD):   {overall_metrics.get('fd', 0):,}")
    print(f"  False Alarm (FA):   {overall_metrics.get('fa', 0):,}")

    # Derived metrics
    print("\nDERIVED METRICS:")
    print(f"  Precision:     {overall_metrics.get('cm_precision', 0.0):.4f}")
    print(f"  Recall:        {overall_metrics.get('cm_recall', 0.0):.4f}")
    print(f"  F1 Score:      {overall_metrics.get('cm_f1', 0.0):.4f}")
    print(f"  Accuracy:      {overall_metrics.get('cm_accuracy', 0.0):.4f}")

    # Field-level metrics
    if process_eval.field_metrics:
        print("\nFIELD-LEVEL METRICS:")
        print("-" * 40)

        # Sort fields by F1 score descending for better readability
        sorted_fields = sorted(
            process_eval.field_metrics.items(),
            key=lambda x: x[1].get("cm_f1", 0.0),
            reverse=True,
        )

        for field_path, field_metrics in sorted_fields:
            tp = field_metrics.get("tp", 0)
            fp = field_metrics.get("fp", 0)
            fn = field_metrics.get("fn", 0)
            precision = field_metrics.get("cm_precision", 0.0)
            recall = field_metrics.get("cm_recall", 0.0)
            f1 = field_metrics.get("cm_f1", 0.0)

            # Only show fields with some activity
            if tp + fp + fn > 0:
                print(
                    f"  {field_path:30} P: {precision:.3f} | R: {recall:.3f} | F1: {f1:.3f} | TP: {tp:,} | FP: {fp:,} | FN: {fn:,}"
                )

    # Error summary
    if process_eval.errors:
        print("\nERROR SUMMARY:")
        print("-" * 40)
        print(f"Total Errors: {len(process_eval.errors):,}")
        print(
            f"Error Rate: {len(process_eval.errors) / self._processed_count * 100:.2f}%"
            if self._processed_count > 0
            else "Error Rate: N/A"
        )

        # Group errors by type
        error_types = {}
        for error in process_eval.errors:
            error_type = error.get("error_type", "Unknown")
            error_types[error_type] = error_types.get(error_type, 0) + 1

        if error_types:
            print("Error Types:")
            for error_type, count in sorted(
                error_types.items(), key=lambda x: x[1], reverse=True
            ):
                print(f"  {error_type}: {count:,}")

    # Configuration info
    print("\nCONFIGURATION:")
    print("-" * 40)
    print(f"Target Schema: {self.target_schema.__name__}")
    print(f"Document Non-matches: {'Yes' if self.document_non_matches else 'No'}")
    print(f"Elide Errors: {'Yes' if self.elide_errors else 'No'}")
    if self.individual_results_jsonl:
        print(f"Individual Results JSONL: {self.individual_results_jsonl}")

    print("=" * 80)

reset()

Clear all accumulated state and start fresh evaluation.

This method resets all internal counters, metrics, and error tracking to initial state, enabling reuse of the same evaluator instance for multiple evaluation runs.

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def reset(self) -> None:
    """
    Clear all accumulated state and start fresh evaluation.

    This method resets all internal counters, metrics, and error tracking
    to initial state, enabling reuse of the same evaluator instance for
    multiple evaluation runs.
    """
    # Accumulated confusion matrix state using nested defaultdicts
    self._confusion_matrix = {
        "overall": defaultdict(int),
        "fields": defaultdict(lambda: defaultdict(int)),
    }

    # Non-match tracking (when document_non_matches=True)
    self._non_matches = []

    # Error tracking
    self._errors = []

    # Processing statistics
    self._processed_count = 0
    self._start_time = time.time()

    if self.verbose:
        print("Reset evaluator state")

save_metrics(filepath)

Save current accumulated metrics to a JSON file.

Parameters:

    filepath (str, required):
        Path where metrics will be saved as JSON

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def save_metrics(self, filepath: str) -> None:
    """
    Save current accumulated metrics to a JSON file.

    Args:
        filepath: Path where metrics will be saved as JSON
    """
    process_eval = self._build_process_evaluation()

    # Build comprehensive metrics dictionary
    metrics_data = {
        "overall_metrics": process_eval.metrics,
        "field_metrics": process_eval.field_metrics,
        "evaluation_summary": {
            "total_documents_processed": self._processed_count,
            "total_evaluation_time": process_eval.total_time,
            "documents_per_second": self._processed_count / process_eval.total_time
            if process_eval.total_time > 0
            else 0,
            "error_count": len(process_eval.errors),
            "error_rate": len(process_eval.errors) / self._processed_count
            if self._processed_count > 0
            else 0,
            "target_schema": self.target_schema.__name__,
        },
        "errors": process_eval.errors,
        "metadata": {
            "saved_at": time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime()),
            "evaluator_config": {
                "verbose": self.verbose,
                "document_non_matches": self.document_non_matches,
                "elide_errors": self.elide_errors,
                "individual_results_jsonl": self.individual_results_jsonl,
            },
        },
    }

    # Ensure directory exists
    import os

    os.makedirs(os.path.dirname(os.path.abspath(filepath)), exist_ok=True)

    # Write to file
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(metrics_data, f, indent=2, default=str)

    if self.verbose:
        print(f"Metrics saved to: {filepath}")

update(gt_model, pred_model, doc_id=None)

Process a single document pair and accumulate the results in internal state.

This is the core method for stateful evaluation, inspired by PyTorch Lightning's training_step pattern. Each call processes one document pair and updates the internal confusion matrix counters.

Parameters:

    gt_model (StructuredModel, required):
        Ground truth StructuredModel instance
    pred_model (StructuredModel, required):
        Predicted StructuredModel instance
    doc_id (Optional[str], default None):
        Optional document identifier for error tracking

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def update(
    self,
    gt_model: StructuredModel,
    pred_model: StructuredModel,
    doc_id: Optional[str] = None,
) -> None:
    """
    Process a single document pair and accumulate the results in internal state.

    This is the core method for stateful evaluation, inspired by PyTorch Lightning's
    training_step pattern. Each call processes one document pair and updates
    the internal confusion matrix counters.

    Args:
        gt_model: Ground truth StructuredModel instance
        pred_model: Predicted StructuredModel instance
        doc_id: Optional document identifier for error tracking
    """
    if doc_id is None:
        doc_id = f"doc_{self._processed_count}"

    try:
        # Use compare_with method directly on the StructuredModel
        # Pass document_non_matches to achieve parity with compare_with method
        comparison_result = gt_model.compare_with(
            pred_model,
            include_confusion_matrix=True,
            document_non_matches=self.document_non_matches,
        )

        # Collect non-matches if enabled
        if self.document_non_matches and "non_matches" in comparison_result:
            # Add doc_id to each non-match for bulk tracking
            for non_match in comparison_result["non_matches"]:
                non_match_with_doc = non_match.copy()
                non_match_with_doc["doc_id"] = doc_id
                self._non_matches.append(non_match_with_doc)

        # Simple JSONL append of raw comparison result (before any processing)
        if self.individual_results_jsonl:
            record = {"doc_id": doc_id, "comparison_result": comparison_result}
            with open(self.individual_results_jsonl, "a", encoding="utf-8") as f:
                f.write(json.dumps(record) + "\n")

        # Accumulate the results into our state (this flattens for aggregation)
        self._accumulate_confusion_matrix(comparison_result["confusion_matrix"])

        self._processed_count += 1

        if self.verbose and self._processed_count % 1000 == 0:
            elapsed = time.time() - self._start_time
            print(f"Processed {self._processed_count} documents ({elapsed:.2f}s)")

    except Exception as e:
        error_record = {
            "doc_id": doc_id,
            "error": str(e),
            "error_type": type(e).__name__,
        }

        if not self.elide_errors:
            self._errors.append(error_record)

            # For errors, add a "failed" classification to overall metrics
            # This represents complete failure to process the document
            self._confusion_matrix["overall"]["fn"] += 1

        if self.verbose:
            print(f"Error processing document {doc_id}: {str(e)}")

update_batch(batch_data)

Process multiple document pairs efficiently in a batch.

This method provides efficient batch processing by calling update() multiple times with optional garbage collection for memory management.

Parameters:

    batch_data (List[Tuple[StructuredModel, StructuredModel, Optional[str]]], required):
        List of tuples containing (gt_model, pred_model, doc_id)

Source code in stickler/structured_object_evaluator/bulk_structured_model_evaluator.py
def update_batch(
    self, batch_data: List[Tuple[StructuredModel, StructuredModel, Optional[str]]]
) -> None:
    """
    Process multiple document pairs efficiently in a batch.

    This method provides efficient batch processing by calling update()
    multiple times with optional garbage collection for memory management.

    Args:
        batch_data: List of tuples containing (gt_model, pred_model, doc_id)
    """
    batch_start = self._processed_count

    for gt_model, pred_model, doc_id in batch_data:
        self.update(gt_model, pred_model, doc_id)

    # Garbage collection for large batches
    if len(batch_data) >= 1000:
        gc.collect()

    if self.verbose:
        batch_size = self._processed_count - batch_start
        print(f"Processed batch of {batch_size} documents")