Utils

stickler.structured_object_evaluator.utils

Utility functions for structured object evaluation.

stickler.structured_object_evaluator.utils.anls_score

ANLS score calculation for structured objects.

stickler.structured_object_evaluator.utils.anls_score.compare_structured_models(gt, pred)

Compare a ground truth model with a prediction.

This function wraps the compare_with method of StructuredModel for a more explicit API.

Parameters:

Name Type Description Default
gt StructuredModel

Ground truth model

required
pred StructuredModel

Prediction model

required

Returns:

Type Description
Dict[str, Any]

Comparison result dictionary

Source code in stickler/structured_object_evaluator/utils/anls_score.py
def compare_structured_models(
    gt: StructuredModel, pred: StructuredModel
) -> Dict[str, Any]:
    """Compare a ground truth model with a prediction.

    This function wraps the compare_with method of StructuredModel for
    a more explicit API.

    Args:
        gt: Ground truth model
        pred: Prediction model

    Returns:
        Comparison result dictionary
    """
    return gt.compare_with(pred)
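
A minimal usage sketch. The Invoice class, its fields, and the StructuredModel import path below are assumptions for illustration only; adjust them to your own models and installation.

from stickler.structured_object_evaluator import StructuredModel  # assumed import path
from stickler.structured_object_evaluator.utils.anls_score import compare_structured_models


class Invoice(StructuredModel):  # hypothetical model for illustration
    invoice_number: str
    total: float


gt = Invoice(invoice_number="INV-001", total=99.5)
pred = Invoice(invoice_number="INV-001", total=95.0)

# Equivalent to gt.compare_with(pred)
result = compare_structured_models(gt, pred)
print(result)  # comparison result dictionary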

stickler.structured_object_evaluator.utils.anls_score.anls_score(gt, pred, return_gt=False, return_key_scores=False)

Calculate ANLS* score between two objects.

This function provides a simple API for getting an ANLS* score between any two objects, similar to the original anls_score function.

Parameters:

Name Type Description Default
gt Any

Ground truth object

required
pred Any

Prediction object

required
return_gt bool

Whether to return the closest ground truth

False
return_key_scores bool

Whether to return detailed key scores

False

Returns:

Type Description
Union[float, Tuple[float, Any], Tuple[float, Any, Dict[str, Any]]]

Either just the overall score (float), or a tuple with the score and
closest ground truth, or a tuple with the score, closest ground truth,
and key scores.

Source code in stickler/structured_object_evaluator/utils/anls_score.py
def anls_score(
    gt: Any, pred: Any, return_gt: bool = False, return_key_scores: bool = False
) -> Union[float, Tuple[float, Any], Tuple[float, Any, Dict[str, Any]]]:
    """Calculate ANLS* score between two objects.

    This function provides a simple API for getting an ANLS* score
    between any two objects, similar to the original anls_score function.

    Args:
        gt: Ground truth object
        pred: Prediction object
        return_gt: Whether to return the closest ground truth
        return_key_scores: Whether to return detailed key scores

    Returns:
        Either just the overall score (float), or a tuple with the score and
        closest ground truth, or a tuple with the score, closest ground truth,
        and key scores.
    """
    import warnings
    from ..trees.base import ANLSTree

    # Store original gt object for possible return
    original_gt = gt

    # Handle classical QA dataset compatibility
    gt_is_list_str = isinstance(gt, list) and all(isinstance(x, str) for x in gt)
    pred_is_str = isinstance(pred, str)
    if gt_is_list_str and pred_is_str:
        warnings.warn(
            "Treating ground truth as a list of options. This is a compatibility mode for ST-VQA-like datasets."
        )
        gt = tuple(gt)

    # Create trees from the objects
    gt_tree = ANLSTree.make_tree(gt, is_gt=True)
    pred_tree = ANLSTree.make_tree(pred, is_gt=False)

    # Calculate ANLS score
    score, closest_gt, key_scores = gt_tree.anls(pred_tree)

    # Determine what to return for gt (smart detection)
    gt_to_return = original_gt if hasattr(original_gt, "model_dump") else closest_gt

    # Return the requested information
    if return_gt and return_key_scores:
        from .key_scores import construct_nested_dict

        key_scores_dict = construct_nested_dict(key_scores)
        return score, gt_to_return, key_scores_dict
    elif return_gt:
        return score, gt_to_return
    elif return_key_scores:
        from .key_scores import construct_nested_dict

        key_scores_dict = construct_nested_dict(key_scores)
        return score, key_scores_dict
    else:
        return score
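
A usage sketch against plain Python objects. The field names are illustrative and the exact scores depend on the underlying ANLSTree implementation.

from stickler.structured_object_evaluator.utils.anls_score import anls_score

gt = {"name": "Alice", "items": ["pen", "notebook"]}
pred = {"name": "Alicia", "items": ["pen", "notebok"]}

# Overall score only
score = anls_score(gt, pred)

# Score plus a nested dict of ScoreNode objects (assumes "name" appears as a key)
score, key_scores = anls_score(gt, pred, return_key_scores=True)
print(score, key_scores["name"].score)

# ST-VQA-style compatibility mode: ground truth given as a list of string options
score = anls_score(["answer a", "answer b"], "answer a")  # emits a warning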

stickler.structured_object_evaluator.utils.compare_json

stickler.structured_object_evaluator.utils.compare_json.compare_json(gt_json, pred_json, model_cls)

Compare JSON objects using a StructuredModel.

This function is a utility for comparing raw JSON objects using a StructuredModel class. It handles missing fields and extra fields gracefully.

Parameters:

Name Type Description Default
gt_json Dict[str, Any]

Ground truth JSON

required
pred_json Dict[str, Any]

Prediction JSON

required
model_cls Type[StructuredModel]

StructuredModel class to use for comparison

required

Returns:

Type Description
Dict[str, Any]

Dictionary with comparison results

Source code in stickler/structured_object_evaluator/utils/compare_json.py
def compare_json(
    gt_json: Dict[str, Any], pred_json: Dict[str, Any], model_cls: Type[StructuredModel]
) -> Dict[str, Any]:
    """Compare JSON objects using a StructuredModel.

    This function is a utility for comparing raw JSON objects using a
    StructuredModel class. It handles missing fields and extra fields gracefully.

    Args:
        gt_json: Ground truth JSON
        pred_json: Prediction JSON
        model_cls: StructuredModel class to use for comparison

    Returns:
        Dictionary with comparison results
    """
    try:
        # Try to convert both JSONs to structured models
        gt_model = model_cls.from_json(gt_json)
        pred_model = model_cls.from_json(pred_json)

        # Compare the models
        return gt_model.compare_with(pred_model)
    except Exception as e:
        # Return error details if conversion fails
        return {
            "error": str(e),
            "overall_score": 0.0,
            "field_scores": {},
            "all_fields_matched": False,
        }
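
A usage sketch, reusing the hypothetical Invoice model from the compare_structured_models example above; the shape of a successful result follows compare_with and is not spelled out here.

from stickler.structured_object_evaluator.utils.compare_json import compare_json

gt_json = {"invoice_number": "INV-001", "total": 99.5}
pred_json = {"invoice_number": "INV-001"}  # missing field is handled gracefully

result = compare_json(gt_json, pred_json, Invoice)  # Invoice: hypothetical StructuredModel subclass
if "error" in result:
    print("Model construction failed:", result["error"])
else:
    print(result)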

stickler.structured_object_evaluator.utils.key_scores

Utility functions for handling key scores in structured object evaluation.

stickler.structured_object_evaluator.utils.key_scores.ScoreNode dataclass

Node in a score tree representing scores for hierarchical structures.

Attributes:

Name Type Description
name str

The name of this node (key in the hierarchy).

score Optional[float]

The score for this node, or None if this is an intermediate node.

children Dict[str, Any]

A dictionary mapping child keys to their ScoreNode objects.

Source code in stickler/structured_object_evaluator/utils/key_scores.py
@dataclass
class ScoreNode:
    """Node in a score tree representing scores for hierarchical structures.

    Attributes:
        name: The name of this node (key in the hierarchy).
        score: The score for this node, or None if this is an intermediate node.
        children: A dictionary mapping child keys to their ScoreNode objects.
    """

    name: str = ""
    score: Optional[float] = None
    children: Dict[str, Any] = field(default_factory=dict)

    # For backward compatibility
    @property
    def anls_score(self):
        """Alias for score to maintain backward compatibility."""
        return self.score

    @anls_score.setter
    def anls_score(self, value):
        """Setter for anls_score that updates the score attribute."""
        self.score = value

anls_score property writable

Alias for score to maintain backward compatibility.
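
A small sketch of building and reading a ScoreNode tree by hand (normally these come from construct_nested_dict); the field names are illustrative.

from stickler.structured_object_evaluator.utils.key_scores import ScoreNode

node = ScoreNode(
    name="address",
    children={
        "city": ScoreNode(name="city", score=1.0),
        "zip": ScoreNode(name="zip", score=0.5),
    },
)

print(node.children["city"].score)      # 1.0
print(node.children["zip"].anls_score)  # 0.5 (backward-compatible alias for score)
node.anls_score = 0.75                  # setter writes through to .score
print(node.score)                       # 0.75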

stickler.structured_object_evaluator.utils.key_scores.construct_nested_dict(list_of_dicts)

Construct a nested dictionary from a list of dictionaries with nested keys.

This function transforms a flat list of dictionaries with tuple keys into a hierarchical structure of ScoreNode objects. This is useful for representing and analyzing scores for nested data structures like dictionaries and lists.

Note: If the same key appears in more than one dictionary, the last value is used.

Parameters:

Name Type Description Default
list_of_dicts List[Dict[Tuple[str, ...], float]]

A list of dictionaries with nested keys.

required

Returns:

Type Description
Dict[str, ScoreNode]

A nested dictionary of ScoreNode objects.

Example

>>> list_of_dicts = [
        {("a",): 3},
        {("a", "b", "c"): 1},
        {("a", "b", "d"): 2},
        {("a", "c", "e"): 3},
    ]
>>> construct_nested_dict(list_of_dicts)
    {
        "a": ScoreNode(
            anls_score=3,
            children={
                "b": ScoreNode(
                    children={
                        "c": ScoreNode(anls_score=1),
                        "d": ScoreNode(anls_score=2),
                    }
                ),
                "c": ScoreNode(children={"e": ScoreNode(anls_score=3)}),
            },
        )
    }

Source code in stickler/structured_object_evaluator/utils/key_scores.py
def construct_nested_dict(
    list_of_dicts: List[Dict[Tuple[str, ...], float]],
) -> Dict[str, ScoreNode]:
    """Construct a nested dictionary from a list of dictionaries with nested keys.

    This function transforms a flat list of dictionaries with tuple keys into a
    hierarchical structure of ScoreNode objects. This is useful for representing
    and analyzing scores for nested data structures like dictionaries and lists.

    Note: If there are duplicates of keys in the list of dictionaries, the last value will be used.

    Args:
        list_of_dicts: A list of dictionaries with nested keys.

    Returns:
        A nested dictionary of ScoreNode objects.

    Example:
        >>> list_of_dicts = [
                {("a",): 3},
                {("a", "b", "c"): 1},
                {("a", "b", "d"): 2},
                {("a", "c", "e"): 3},
            ],
        >>> construct_nested_dict(list_of_dicts)
            {
                "a": ScoreNode(
                    anls_score=3,
                    children={
                        "b": ScoreNode(
                            children={
                                "c": ScoreNode(anls_score=1),
                                "d": ScoreNode(anls_score=2),
                            }
                        ),
                        "c": ScoreNode(children={"e": ScoreNode(anls_score=3)}),
                    },
                )
            },
    """
    nested_dict: Dict[str, ScoreNode] = {}

    if len(list_of_dicts) == 0:
        return nested_dict

    for entry in list_of_dicts:
        for key_tuple, value in entry.items():
            current_dict: Dict[str, ScoreNode] = nested_dict
            # Traverse and build nested dict, except for last entry
            for key in key_tuple[:-1]:
                if key not in current_dict:
                    current_dict[key] = ScoreNode(name=key)
                current_dict = current_dict[key].children

            # Set the value for the final key
            final_key = key_tuple[-1]
            if final_key not in current_dict:
                current_dict[final_key] = ScoreNode(name=final_key)
            current_dict[final_key].score = value

    return nested_dict
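
A sketch of flattening the resulting tree back into dotted key paths; walk() is a hypothetical helper written for this example, not part of the library.

from typing import Dict, Iterator, Tuple

from stickler.structured_object_evaluator.utils.key_scores import (
    ScoreNode,
    construct_nested_dict,
)


def walk(tree: Dict[str, ScoreNode], prefix: str = "") -> Iterator[Tuple[str, float]]:
    """Yield (dotted_path, score) pairs for every node that carries a score."""
    for key, node in tree.items():
        path = f"{prefix}.{key}" if prefix else key
        if node.score is not None:
            yield path, node.score
        yield from walk(node.children, path)


tree = construct_nested_dict([{("a",): 3.0}, {("a", "b", "c"): 1.0}])
print(dict(walk(tree)))  # {'a': 3.0, 'a.b.c': 1.0}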

stickler.structured_object_evaluator.utils.key_scores.merge_and_calculate_mean(list_of_dicts)

Merge a list of dictionaries and calculate the mean value for each key.

This function takes a list of dictionaries where keys are tuples of strings and values are floats. It combines the dictionaries and calculates the mean value for each unique key across all the dictionaries.

Parameters:

Name Type Description Default
list_of_dicts List[Dict[Tuple[str, ...], float]]

A list of dictionaries with tuple keys and float values.

required

Returns:

Type Description
List[Dict[Tuple[str, ...], float]]

A list of dictionaries, each containing a single key-value pair where
values are the mean of the original values for the corresponding key.

Example

>>> list_of_dicts = [
        {('a', 'b'): 10.0, ('c', 'd'): 20.0},
        {('a', 'b'): 30.0, ('e', 'f'): 40.0}
    ]
>>> merge_and_calculate_mean(list_of_dicts)
    [{('a', 'b'): 20.0}, {('c', 'd'): 20.0}, {('e', 'f'): 40.0}]

Source code in stickler/structured_object_evaluator/utils/key_scores.py
def merge_and_calculate_mean(
    list_of_dicts: List[Dict[Tuple[str, ...], float]],
) -> List[Dict[Tuple[str, ...], float]]:
    """
    Merge a list of dictionaries and calculate the mean value for each key.

    This function takes a list of dictionaries where keys are tuples of strings and
    values are floats. It combines the dictionaries and calculates the mean value
    for each unique key across all the dictionaries.

    Args:
        list_of_dicts: A list of dictionaries with tuple keys and float values.

    Returns:
        A list of dictionaries, each containing a single key-value pair where
        values are the mean of the original values for the corresponding key.

    Example:
        >>> list_of_dicts = [
                {('a', 'b'): 10.0, ('c', 'd'): 20.0},
                {('a', 'b'): 30.0, ('e', 'f'): 40.0}
            ]
        >>> merge_and_calculate_mean(list_of_dicts)
            [{('a', 'b'): 20.0}, {('c', 'd'): 20.0}, {('e', 'f'): 40.0}]
    """
    combined_scores: Dict[Tuple[str, ...], float] = {}
    count_dict: Dict[Tuple[str, ...], int] = {}

    # Combine scores for the same keys
    for d in list_of_dicts:
        for k, v in d.items():
            if k not in combined_scores:
                combined_scores[k] = 0
                count_dict[k] = 0
            combined_scores[k] += v
            count_dict[k] += 1

    # Calculate the mean for each key
    for k in combined_scores.keys():
        combined_scores[k] /= count_dict[k]

    # Convert back to a list of dictionaries
    list_combined_scores = [{k: v} for k, v in combined_scores.items()]

    return list_combined_scores
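
A sketch of one plausible workflow that combines the two helpers: average per-key scores across several documents, then view the averages as a tree. The workflow itself is an assumption, not a documented pipeline.

from stickler.structured_object_evaluator.utils.key_scores import (
    construct_nested_dict,
    merge_and_calculate_mean,
)

per_document_scores = [
    {("invoice", "total"): 1.0, ("invoice", "date"): 0.5},
    {("invoice", "total"): 0.0, ("invoice", "date"): 1.0},
]

averaged = merge_and_calculate_mean(per_document_scores)
# [{('invoice', 'total'): 0.5}, {('invoice', 'date'): 0.75}]

tree = construct_nested_dict(averaged)
print(tree["invoice"].children["total"].score)  # 0.5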

stickler.structured_object_evaluator.utils.pretty_print

Pretty print utilities for StructuredModelEvaluator results.

This module provides functions for displaying confusion matrix metrics in a more readable and visually appealing format.

stickler.structured_object_evaluator.utils.pretty_print.print_confusion_matrix(results, field_filter=None, sort_by='name', show_details=True, use_color=True, output_file=None, nested_detail='standard')

Pretty print confusion matrix metrics in a readable, visually appealing format.

Parameters:

Name Type Description Default
results Union[Dict[str, Any], Any]

Results from StructuredModelEvaluator.evaluate() or ProcessEvaluation from bulk evaluator

required
field_filter Optional[str]

Optional regex to filter fields to display

None
sort_by str

How to sort fields ('name', 'precision', 'recall', 'f1', etc.)

'name'
show_details bool

Whether to show detailed metrics for each field

True
use_color bool

Whether to use color in the output

True
output_file Optional[str]

Optional file path to write the output to

None
nested_detail str

Level of detail for nested objects:
'minimal' - Show only top-level fields
'standard' - Show nested fields with basic metrics (default)
'detailed' - Show comprehensive metrics for nested fields and their items

'standard'
Source code in stickler/structured_object_evaluator/utils/pretty_print.py
def print_confusion_matrix(
    results: Union[Dict[str, Any], Any],
    field_filter: Optional[str] = None,
    sort_by: str = "name",
    show_details: bool = True,
    use_color: bool = True,
    output_file: Optional[str] = None,
    nested_detail: str = "standard",
) -> None:
    """
    Pretty print confusion matrix metrics in a readable, visually appealing format.

    Args:
        results: Results from StructuredModelEvaluator.evaluate() or ProcessEvaluation from bulk evaluator
        field_filter: Optional regex to filter fields to display
        sort_by: How to sort fields ('name', 'precision', 'recall', 'f1', etc.)
        show_details: Whether to show detailed metrics for each field
        use_color: Whether to use color in the output
        output_file: Optional file path to write the output to
        nested_detail: Level of detail for nested objects:
                       'minimal' - Show only top-level fields
                       'standard' - Show nested fields with basic metrics (default)
                       'detailed' - Show comprehensive metrics for nested fields and their items
    """
    # Normalize results format
    normalized_results = _normalize_results_format(results)
    if normalized_results is None:
        print("Error: Results do not contain recognizable confusion matrix metrics")
        return

    # Use normalized results for processing
    results = normalized_results

    # Direct output to file if specified
    if output_file:
        try:
            with open(output_file, "w", encoding="utf-8") as f:
                original_stdout = sys.stdout
                sys.stdout = f
                use_color = False  # Disable color for file output

                # Print overall summary
                _print_overall_summary(results, use_color)

                # Print field-level metrics if requested
                if show_details:
                    _print_field_details(
                        results, field_filter, sort_by, use_color, nested_detail
                    )

                    # Print matrix visualization
                    _print_matrix_visualization(results, use_color)

                # Restore stdout before context manager closes
                sys.stdout = original_stdout
        except Exception as e:
            print(f"Error opening output file: {e}")
            return
    else:
        # Print to stdout
        # Print overall summary
        _print_overall_summary(results, use_color)

        # Print bulk evaluation info if available
        if "bulk_info" in results:
            _print_bulk_info_summary(results, use_color)

        # Print field-level metrics if requested
        if show_details:
            _print_field_details(
                results, field_filter, sort_by, use_color, nested_detail
            )

            # Print matrix visualization
            _print_matrix_visualization(results, use_color)
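
A call sketch (not runnable on its own): results is assumed to come from StructuredModelEvaluator.evaluate() or the bulk evaluator, and the regex and file name are illustrative.

from stickler.structured_object_evaluator.utils.pretty_print import print_confusion_matrix

# results = evaluator.evaluate(...)  # assumed StructuredModelEvaluator output
print_confusion_matrix(
    results,
    field_filter=r"^invoice\.",          # only fields under 'invoice'
    sort_by="f1",
    nested_detail="detailed",
    output_file="confusion_report.txt",  # color is disabled for file output
)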

stickler.structured_object_evaluator.utils.pretty_print.print_confusion_matrix_html(results, field_filter=None, nested_detail='standard')

Generate HTML representation of confusion matrix metrics for Jupyter notebooks.

Parameters:

Name Type Description Default
results Dict[str, Any]

Results dictionary from StructuredModelEvaluator.evaluate()

required
field_filter Optional[str]

Optional regex to filter fields to display

None
nested_detail str

Level of detail for nested objects ('minimal', 'standard', or 'detailed')

'standard'

Returns:

Name Type Description
str str

HTML string representation of the confusion matrix

Source code in stickler/structured_object_evaluator/utils/pretty_print.py
def print_confusion_matrix_html(
    results: Dict[str, Any],
    field_filter: Optional[str] = None,
    nested_detail: str = "standard",
) -> str:
    """
    Generate HTML representation of confusion matrix metrics for Jupyter notebooks.

    Args:
        results: Results dictionary from StructuredModelEvaluator.evaluate()
        field_filter: Optional regex to filter fields to display
        nested_detail: Level of detail for nested objects ('minimal', 'standard', or 'detailed')

    Returns:
        str: HTML string representation of the confusion matrix
    """
    # This is a placeholder for HTML output formatting
    # Implement this if you need to display results in Jupyter notebooks
    # with richer HTML formatting, tables, and visualizations

    # For now, use the same output as the terminal version
    from io import StringIO
    import sys

    # Capture output in a string
    old_stdout = sys.stdout
    mystdout = StringIO()
    sys.stdout = mystdout

    print_confusion_matrix(
        results, field_filter, use_color=False, nested_detail=nested_detail
    )

    sys.stdout = old_stdout

    # Return the captured output
    return f"<pre>{mystdout.getvalue()}</pre>"
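
In a Jupyter notebook the returned string can be rendered directly; IPython is assumed to be available and results comes from an evaluator run as above.

from IPython.display import HTML

from stickler.structured_object_evaluator.utils.pretty_print import print_confusion_matrix_html

html = print_confusion_matrix_html(results, nested_detail="minimal")
HTML(html)  # displays the <pre>-wrapped report in the notebook output cell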