Utils

stickler.structured_object_evaluator.utils

Utility functions for structured object evaluation.

stickler.structured_object_evaluator.utils.anls_score

ANLS score calculation for structured objects.

stickler.structured_object_evaluator.utils.anls_score.compare_structured_models(gt, pred)

Compare a ground truth model with a prediction.

This function wraps the compare_with method of StructuredModel for a more explicit API.

Parameters:

Name Type Description Default
gt StructuredModel

Ground truth model

required
pred StructuredModel

Prediction model

required

Returns:

Type Description
Dict[str, Any]

Comparison result dictionary

Source code in stickler/structured_object_evaluator/utils/anls_score.py
def compare_structured_models(
    gt: StructuredModel, pred: StructuredModel
) -> Dict[str, Any]:
    """Compare a ground truth model with a prediction.

    This function wraps the compare_with method of StructuredModel for
    a more explicit API.

    Args:
        gt: Ground truth model
        pred: Prediction model

    Returns:
        Comparison result dictionary
    """
    return gt.compare_with(pred)
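
A minimal usage sketch. The Invoice class, its fields, and the StructuredModel import path below are assumptions for illustration only; adjust them to your own models and installation.

from stickler.structured_object_evaluator import StructuredModel  # assumed import path
from stickler.structured_object_evaluator.utils.anls_score import compare_structured_models


class Invoice(StructuredModel):  # hypothetical model for illustration
    invoice_number: str
    total: float


gt = Invoice(invoice_number="INV-001", total=99.5)
pred = Invoice(invoice_number="INV-001", total=95.0)

# Equivalent to gt.compare_with(pred)
result = compare_structured_models(gt, pred)
print(result)  # comparison result dictionary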

stickler.structured_object_evaluator.utils.anls_score.anls_score(gt, pred, return_gt=False, return_key_scores=False)

Calculate ANLS* score between two objects.

This function provides a simple API for getting an ANLS* score between any two objects, similar to the original anls_score function.

Parameters:

Name Type Description Default
gt Any

Ground truth object

required
pred Any

Prediction object

required
return_gt bool

Whether to return the closest ground truth

False
return_key_scores bool

Whether to return detailed key scores

False

Returns:

Type Description
Union[float, Tuple[float, Any], Tuple[float, Any, Dict[str, Any]]]

Either just the overall score (float), or a tuple with the score and
closest ground truth, or a tuple with the score, closest ground truth,
and key scores.

Source code in stickler/structured_object_evaluator/utils/anls_score.py
def anls_score(
    gt: Any, pred: Any, return_gt: bool = False, return_key_scores: bool = False
) -> Union[float, Tuple[float, Any], Tuple[float, Any, Dict[str, Any]]]:
    """Calculate ANLS* score between two objects.

    This function provides a simple API for getting an ANLS* score
    between any two objects, similar to the original anls_score function.

    Args:
        gt: Ground truth object
        pred: Prediction object
        return_gt: Whether to return the closest ground truth
        return_key_scores: Whether to return detailed key scores

    Returns:
        Either just the overall score (float), or a tuple with the score and
        closest ground truth, or a tuple with the score, closest ground truth,
        and key scores.
    """
    import warnings
    from ..trees.base import ANLSTree

    # Store original gt object for possible return
    original_gt = gt

    # Handle classical QA dataset compatibility
    gt_is_list_str = isinstance(gt, list) and all(isinstance(x, str) for x in gt)
    pred_is_str = isinstance(pred, str)
    if gt_is_list_str and pred_is_str:
        warnings.warn(
            "Treating ground truth as a list of options. This is a compatibility mode for ST-VQA-like datasets."
        )
        gt = tuple(gt)

    # Create trees from the objects
    gt_tree = ANLSTree.make_tree(gt, is_gt=True)
    pred_tree = ANLSTree.make_tree(pred, is_gt=False)

    # Calculate ANLS score
    score, closest_gt, key_scores = gt_tree.anls(pred_tree)

    # Determine what to return for gt (smart detection)
    gt_to_return = original_gt if hasattr(original_gt, "model_dump") else closest_gt

    # Return the requested information
    if return_gt and return_key_scores:
        from .key_scores import construct_nested_dict

        key_scores_dict = construct_nested_dict(key_scores)
        return score, gt_to_return, key_scores_dict
    elif return_gt:
        return score, gt_to_return
    elif return_key_scores:
        from .key_scores import construct_nested_dict

        key_scores_dict = construct_nested_dict(key_scores)
        return score, key_scores_dict
    else:
        return score
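
A usage sketch against plain Python objects. The field names are illustrative and the exact scores depend on the underlying ANLSTree implementation.

from stickler.structured_object_evaluator.utils.anls_score import anls_score

gt = {"name": "Alice", "items": ["pen", "notebook"]}
pred = {"name": "Alicia", "items": ["pen", "notebok"]}

# Overall score only
score = anls_score(gt, pred)

# Score plus a nested dict of ScoreNode objects (assumes "name" appears as a key)
score, key_scores = anls_score(gt, pred, return_key_scores=True)
print(score, key_scores["name"].score)

# ST-VQA-style compatibility mode: ground truth given as a list of string options
score = anls_score(["answer a", "answer b"], "answer a")  # emits a warning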

stickler.structured_object_evaluator.utils.compare_json

stickler.structured_object_evaluator.utils.compare_json.compare_json(gt_json, pred_json, model_cls)

Compare JSON objects using a StructuredModel.

This function is a utility for comparing raw JSON objects using a StructuredModel class. It handles missing fields and extra fields gracefully.

Parameters:

Name Type Description Default
gt_json Dict[str, Any]

Ground truth JSON

required
pred_json Dict[str, Any]

Prediction JSON

required
model_cls Type[StructuredModel]

StructuredModel class to use for comparison

required

Returns:

Type Description
Dict[str, Any]

Dictionary with comparison results

Source code in stickler/structured_object_evaluator/utils/compare_json.py
def compare_json(
    gt_json: Dict[str, Any], pred_json: Dict[str, Any], model_cls: Type[StructuredModel]
) -> Dict[str, Any]:
    """Compare JSON objects using a StructuredModel.

    This function is a utility for comparing raw JSON objects using a
    StructuredModel class. It handles missing fields and extra fields gracefully.

    Args:
        gt_json: Ground truth JSON
        pred_json: Prediction JSON
        model_cls: StructuredModel class to use for comparison

    Returns:
        Dictionary with comparison results
    """
    try:
        # Try to convert both JSONs to structured models
        gt_model = model_cls.from_json(gt_json)
        pred_model = model_cls.from_json(pred_json)

        # Compare the models
        return gt_model.compare_with(pred_model)
    except Exception as e:
        # Return error details if conversion fails
        return {
            "error": str(e),
            "overall_score": 0.0,
            "field_scores": {},
            "all_fields_matched": False,
        }
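
A usage sketch, reusing the hypothetical Invoice model from the compare_structured_models example above; the shape of a successful result follows compare_with and is not spelled out here.

from stickler.structured_object_evaluator.utils.compare_json import compare_json

gt_json = {"invoice_number": "INV-001", "total": 99.5}
pred_json = {"invoice_number": "INV-001"}  # missing field is handled gracefully

result = compare_json(gt_json, pred_json, Invoice)  # Invoice: hypothetical StructuredModel subclass
if "error" in result:
    print("Model construction failed:", result["error"])
else:
    print(result)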

stickler.structured_object_evaluator.utils.key_scores

Utility functions for handling key scores in structured object evaluation.

stickler.structured_object_evaluator.utils.key_scores.ScoreNode dataclass

Node in a score tree representing scores for hierarchical structures.

Attributes:

Name Type Description
name str

The name of this node (key in the hierarchy).

score Optional[float]

The score for this node, or None if this is an intermediate node.

children Dict[str, Any]

A dictionary mapping child keys to their ScoreNode objects.

Source code in stickler/structured_object_evaluator/utils/key_scores.py
@dataclass
class ScoreNode:
    """Node in a score tree representing scores for hierarchical structures.

    Attributes:
        name: The name of this node (key in the hierarchy).
        score: The score for this node, or None if this is an intermediate node.
        children: A dictionary mapping child keys to their ScoreNode objects.
    """

    name: str = ""
    score: Optional[float] = None
    children: Dict[str, Any] = field(default_factory=dict)

    # For backward compatibility
    @property
    def anls_score(self):
        """Alias for score to maintain backward compatibility."""
        return self.score

    @anls_score.setter
    def anls_score(self, value):
        """Setter for anls_score that updates the score attribute."""
        self.score = value

anls_score property writable

Alias for score to maintain backward compatibility.
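
A small sketch of building and reading a ScoreNode tree by hand (normally these come from construct_nested_dict); the field names are illustrative.

from stickler.structured_object_evaluator.utils.key_scores import ScoreNode

node = ScoreNode(
    name="address",
    children={
        "city": ScoreNode(name="city", score=1.0),
        "zip": ScoreNode(name="zip", score=0.5),
    },
)

print(node.children["city"].score)      # 1.0
print(node.children["zip"].anls_score)  # 0.5 (backward-compatible alias for score)
node.anls_score = 0.75                  # setter writes through to .score
print(node.score)                       # 0.75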

stickler.structured_object_evaluator.utils.key_scores.construct_nested_dict(list_of_dicts)

Construct a nested dictionary from a list of dictionaries with nested keys.

This function transforms a flat list of dictionaries with tuple keys into a hierarchical structure of ScoreNode objects. This is useful for representing and analyzing scores for nested data structures like dictionaries and lists.

Note: If the same key appears in more than one dictionary, the last value is used.

Parameters:

Name Type Description Default
list_of_dicts List[Dict[Tuple[str, ...], float]]

A list of dictionaries with nested keys.

required

Returns:

Type Description
Dict[str, ScoreNode]

A nested dictionary of ScoreNode objects.

Example

>>> list_of_dicts = [
        {("a",): 3},
        {("a", "b", "c"): 1},
        {("a", "b", "d"): 2},
        {("a", "c", "e"): 3},
    ]
>>> construct_nested_dict(list_of_dicts)
    {
        "a": ScoreNode(
            anls_score=3,
            children={
                "b": ScoreNode(
                    children={
                        "c": ScoreNode(anls_score=1),
                        "d": ScoreNode(anls_score=2),
                    }
                ),
                "c": ScoreNode(children={"e": ScoreNode(anls_score=3)}),
            },
        )
    }

Source code in stickler/structured_object_evaluator/utils/key_scores.py
def construct_nested_dict(
    list_of_dicts: List[Dict[Tuple[str, ...], float]],
) -> Dict[str, ScoreNode]:
    """Construct a nested dictionary from a list of dictionaries with nested keys.

    This function transforms a flat list of dictionaries with tuple keys into a
    hierarchical structure of ScoreNode objects. This is useful for representing
    and analyzing scores for nested data structures like dictionaries and lists.

    Note: If there are duplicates of keys in the list of dictionaries, the last value will be used.

    Args:
        list_of_dicts: A list of dictionaries with nested keys.

    Returns:
        A nested dictionary of ScoreNode objects.

    Example:
        >>> list_of_dicts = [
                {("a",): 3},
                {("a", "b", "c"): 1},
                {("a", "b", "d"): 2},
                {("a", "c", "e"): 3},
            ],
        >>> construct_nested_dict(list_of_dicts)
            {
                "a": ScoreNode(
                    anls_score=3,
                    children={
                        "b": ScoreNode(
                            children={
                                "c": ScoreNode(anls_score=1),
                                "d": ScoreNode(anls_score=2),
                            }
                        ),
                        "c": ScoreNode(children={"e": ScoreNode(anls_score=3)}),
                    },
                )
            },
    """
    nested_dict: Dict[str, ScoreNode] = {}

    if len(list_of_dicts) == 0:
        return nested_dict

    for entry in list_of_dicts:
        for key_tuple, value in entry.items():
            current_dict: Dict[str, ScoreNode] = nested_dict
            # Traverse and build nested dict, except for last entry
            for key in key_tuple[:-1]:
                if key not in current_dict:
                    current_dict[key] = ScoreNode(name=key)
                current_dict = current_dict[key].children

            # Set the value for the final key
            final_key = key_tuple[-1]
            if final_key not in current_dict:
                current_dict[final_key] = ScoreNode(name=final_key)
            current_dict[final_key].score = value

    return nested_dict
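
A sketch of flattening the resulting tree back into dotted key paths; walk() is a hypothetical helper written for this example, not part of the library.

from typing import Dict, Iterator, Tuple

from stickler.structured_object_evaluator.utils.key_scores import (
    ScoreNode,
    construct_nested_dict,
)


def walk(tree: Dict[str, ScoreNode], prefix: str = "") -> Iterator[Tuple[str, float]]:
    """Yield (dotted_path, score) pairs for every node that carries a score."""
    for key, node in tree.items():
        path = f"{prefix}.{key}" if prefix else key
        if node.score is not None:
            yield path, node.score
        yield from walk(node.children, path)


tree = construct_nested_dict([{("a",): 3.0}, {("a", "b", "c"): 1.0}])
print(dict(walk(tree)))  # {'a': 3.0, 'a.b.c': 1.0}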

stickler.structured_object_evaluator.utils.key_scores.merge_and_calculate_mean(list_of_dicts)

Merge a list of dictionaries and calculate the mean value for each key.

This function takes a list of dictionaries where keys are tuples of strings and values are floats. It combines the dictionaries and calculates the mean value for each unique key across all the dictionaries.

Parameters:

Name Type Description Default
list_of_dicts List[Dict[Tuple[str, ...], float]]

A list of dictionaries with tuple keys and float values.

required

Returns:

Type Description
List[Dict[Tuple[str, ...], float]]

A list of dictionaries, each containing a single key-value pair where
values are the mean of the original values for the corresponding key.

Example

>>> list_of_dicts = [
        {('a', 'b'): 10.0, ('c', 'd'): 20.0},
        {('a', 'b'): 30.0, ('e', 'f'): 40.0}
    ]
>>> merge_and_calculate_mean(list_of_dicts)
    [{('a', 'b'): 20.0}, {('c', 'd'): 20.0}, {('e', 'f'): 40.0}]

Source code in stickler/structured_object_evaluator/utils/key_scores.py
def merge_and_calculate_mean(
    list_of_dicts: List[Dict[Tuple[str, ...], float]],
) -> List[Dict[Tuple[str, ...], float]]:
    """
    Merge a list of dictionaries and calculate the mean value for each key.

    This function takes a list of dictionaries where keys are tuples of strings and
    values are floats. It combines the dictionaries and calculates the mean value
    for each unique key across all the dictionaries.

    Args:
        list_of_dicts: A list of dictionaries with tuple keys and float values.

    Returns:
        A list of dictionaries, each containing a single key-value pair where
        values are the mean of the original values for the corresponding key.

    Example:
        >>> list_of_dicts = [
                {('a', 'b'): 10.0, ('c', 'd'): 20.0},
                {('a', 'b'): 30.0, ('e', 'f'): 40.0}
            ]
        >>> merge_and_calculate_mean(list_of_dicts)
            [{('a', 'b'): 20.0}, {('c', 'd'): 20.0}, {('e', 'f'): 40.0}]
    """
    combined_scores: Dict[Tuple[str, ...], float] = {}
    count_dict: Dict[Tuple[str, ...], int] = {}

    # Combine scores for the same keys
    for d in list_of_dicts:
        for k, v in d.items():
            if k not in combined_scores:
                combined_scores[k] = 0
                count_dict[k] = 0
            combined_scores[k] += v
            count_dict[k] += 1

    # Calculate the mean for each key
    for k in combined_scores.keys():
        combined_scores[k] /= count_dict[k]

    # Convert back to a list of dictionaries
    list_combined_scores = [{k: v} for k, v in combined_scores.items()]

    return list_combined_scores
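
A sketch of one plausible workflow that combines the two helpers: average per-key scores across several documents, then view the averages as a tree. The workflow itself is an assumption, not a documented pipeline.

from stickler.structured_object_evaluator.utils.key_scores import (
    construct_nested_dict,
    merge_and_calculate_mean,
)

per_document_scores = [
    {("invoice", "total"): 1.0, ("invoice", "date"): 0.5},
    {("invoice", "total"): 0.0, ("invoice", "date"): 1.0},
]

averaged = merge_and_calculate_mean(per_document_scores)
# [{('invoice', 'total'): 0.5}, {('invoice', 'date'): 0.75}]

tree = construct_nested_dict(averaged)
print(tree["invoice"].children["total"].score)  # 0.5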

stickler.structured_object_evaluator.utils.pretty_print

Pretty print utilities for StructuredModelEvaluator results.

This module provides functions for displaying confusion matrix metrics in a more readable and visually appealing format.

stickler.structured_object_evaluator.utils.pretty_print.print_confusion_matrix(results, field_filter=None, sort_by='name', show_details=True, use_color=True, output_file=None, nested_detail='standard')

Pretty print confusion matrix metrics in a readable, visually appealing format.

Parameters:

Name Type Description Default
results Union[Dict[str, Any], Any]

Results from StructuredModelEvaluator.evaluate() or ProcessEvaluation from bulk evaluator

required
field_filter Optional[str]

Optional regex to filter fields to display

None
sort_by str

How to sort fields ('name', 'precision', 'recall', 'f1', etc.)

'name'
show_details bool

Whether to show detailed metrics for each field

True
use_color bool

Whether to use color in the output

True
output_file Optional[str]

Optional file path to write the output to

None
nested_detail str

Level of detail for nested objects:
'minimal' - Show only top-level fields
'standard' - Show nested fields with basic metrics (default)
'detailed' - Show comprehensive metrics for nested fields and their items

'standard'
Source code in stickler/structured_object_evaluator/utils/pretty_print.py
def print_confusion_matrix(
    results: Union[Dict[str, Any], Any],
    field_filter: Optional[str] = None,
    sort_by: str = "name",
    show_details: bool = True,
    use_color: bool = True,
    output_file: Optional[str] = None,
    nested_detail: str = "standard",
) -> None:
    """
    Pretty print confusion matrix metrics in a readable, visually appealing format.

    Args:
        results: Results from StructuredModelEvaluator.evaluate() or ProcessEvaluation from bulk evaluator
        field_filter: Optional regex to filter fields to display
        sort_by: How to sort fields ('name', 'precision', 'recall', 'f1', etc.)
        show_details: Whether to show detailed metrics for each field
        use_color: Whether to use color in the output
        output_file: Optional file path to write the output to
        nested_detail: Level of detail for nested objects:
                       'minimal' - Show only top-level fields
                       'standard' - Show nested fields with basic metrics (default)
                       'detailed' - Show comprehensive metrics for nested fields and their items
    """
    # Normalize results format
    normalized_results = _normalize_results_format(results)
    if normalized_results is None:
        print("Error: Results do not contain recognizable confusion matrix metrics")
        return

    # Use normalized results for processing
    results = normalized_results

    # Direct output to file if specified
    if output_file:
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                original_stdout = sys.stdout
                sys.stdout = f
                use_color = False  # Disable color for file output

                # Print overall summary
                _print_overall_summary(results, use_color)

                # Print field-level metrics if requested
                if show_details:
                    _print_field_details(
                        results, field_filter, sort_by, use_color, nested_detail
                    )

                    # Print matrix visualization
                    _print_matrix_visualization(results, use_color)

                # Restore stdout before context manager closes
                sys.stdout = original_stdout
        except Exception as e:
            print(f"Error opening output file: {e}")
            return
    else:
        # Print to stdout
        # Print overall summary
        _print_overall_summary(results, use_color)

        # Print bulk evaluation info if available
        if "bulk_info" in results:
            _print_bulk_info_summary(results, use_color)

        # Print field-level metrics if requested
        if show_details:
            _print_field_details(
                results, field_filter, sort_by, use_color, nested_detail
            )

            # Print matrix visualization
            _print_matrix_visualization(results, use_color)
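
A call sketch (not runnable on its own): results is assumed to come from StructuredModelEvaluator.evaluate() or the bulk evaluator, and the regex and file name are illustrative.

from stickler.structured_object_evaluator.utils.pretty_print import print_confusion_matrix

# results = evaluator.evaluate(...)  # assumed StructuredModelEvaluator output
print_confusion_matrix(
    results,
    field_filter=r"^invoice\.",          # only fields under 'invoice'
    sort_by="f1",
    nested_detail="detailed",
    output_file="confusion_report.txt",  # color is disabled for file output
)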

stickler.structured_object_evaluator.utils.pretty_print.print_confusion_matrix_html(results, field_filter=None, nested_detail='standard')

Generate HTML representation of confusion matrix metrics for Jupyter notebooks.

Parameters:

Name Type Description Default
results Dict[str, Any]

Results dictionary from StructuredModelEvaluator.evaluate()

required
field_filter Optional[str]

Optional regex to filter fields to display

None
nested_detail str

Level of detail for nested objects ('minimal', 'standard', or 'detailed')

'standard'

Returns:

Name Type Description
str str

HTML string representation of the confusion matrix

Source code in stickler/structured_object_evaluator/utils/pretty_print.py
def print_confusion_matrix_html(
    results: Dict[str, Any],
    field_filter: Optional[str] = None,
    nested_detail: str = "standard",
) -> str:
    """
    Generate HTML representation of confusion matrix metrics for Jupyter notebooks.

    Args:
        results: Results dictionary from StructuredModelEvaluator.evaluate()
        field_filter: Optional regex to filter fields to display
        nested_detail: Level of detail for nested objects ('minimal', 'standard', or 'detailed')

    Returns:
        str: HTML string representation of the confusion matrix
    """
    # This is a placeholder for HTML output formatting
    # Implement this if you need to display results in Jupyter notebooks
    # with richer HTML formatting, tables, and visualizations

    # For now, use the same output as the terminal version
    from io import StringIO
    import sys

    # Capture output in a string
    old_stdout = sys.stdout
    mystdout = StringIO()
    sys.stdout = mystdout

    print_confusion_matrix(
        results, field_filter, use_color=False, nested_detail=nested_detail
    )

    sys.stdout = old_stdout

    # Return the captured output
    return f"<pre>{mystdout.getvalue()}</pre>"
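
In a Jupyter notebook the returned string can be rendered directly; IPython is assumed to be available and results comes from an evaluator run as above.

from IPython.display import HTML

from stickler.structured_object_evaluator.utils.pretty_print import print_confusion_matrix_html

html = print_confusion_matrix_html(results, nested_detail="minimal")
HTML(html)  # displays the <pre>-wrapped report in the notebook output cell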