Skip to content

evaluator

BaseEvaluator

Bases: ABC

The BaseEvaluator abstract base class defines the common interface for evaluator classes.

Attributes:

Name Type Description
test Test

The test case.

target BaseTarget

The target agent being evaluated.

conversation Conversation

Captures the interaction between a user and an agent.

trace Trace

Captures steps during evaluation.

test_result TestResult

The result of the test which is set in BaseEvaluator.run.

input_token_count int

Number of input tokens processed by the evaluator.

output_token_count int

Number of output tokens generated by the evaluator.

model_id str

The ID of the Bedrock model used to run evaluation. If provisioned_throughput_arn is provided, then this will be set to the ARN of the provisioned throughput.

bedrock_runtime_client BaseClient

A boto3 client representing Amazon Bedrock Runtime.

Source code in src/agenteval/evaluators/base_evaluator.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class BaseEvaluator(ABC):
    """The `BaseEvaluator` abstract base class defines the common interface for evaluator
    classes.

    Attributes:
        test (Test): The test case.
        target (BaseTarget): The target agent being evaluated.
        conversation (Conversation): Captures the interaction between a user and an agent.
        trace (Trace): Captures steps during evaluation.
        test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`.
        input_token_count (int): Number of input tokens processed by the evaluator.
        output_token_count (int): Number of output tokens generated by the evaluator.
        model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided,
            then this will be set to the ARN of the provisioned throughput.
        bedrock_runtime_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime.
    """

    def __init__(
        self,
        test: Test,
        target: BaseTarget,
        work_dir: str,
        model_id: str,
        provisioned_throughput_arn: Optional[str] = None,
        aws_profile: Optional[str] = None,
        aws_region: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        max_retry: int = _DEFAULT_MAX_RETRY,
    ):
        """Initialize the evaluator instance for a given `Test` and `Target`.

        Args:
            test (Test): The test case.
            target (BaseTarget): The target agent being evaluated.
            work_dir (str): The work directory.
            model_id (str): The ID of the Bedrock model used to run evaluation.
            provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput.
            aws_profile (str, optional): The AWS profile name.
            aws_region (str, optional): The AWS region.
            endpoint_url (str, optional): The endpoint URL for the AWS service.
            max_retry (int, optional): The maximum number of retry attempts.
        """
        self.test = test
        self.target = target
        self.conversation = Conversation()
        self.trace = Trace(work_dir=work_dir, test_name=test.name)
        self.test_result = None
        self.input_token_count = 0
        self.output_token_count = 0
        # When a provisioned throughput ARN is supplied, it takes precedence
        # over the base model ID for all InvokeModel calls.
        self.model_id = provisioned_throughput_arn or model_id
        self.bedrock_runtime_client = create_boto3_client(
            boto3_service_name=_BOTO3_SERVICE_NAME,
            aws_profile=aws_profile,
            aws_region=aws_region,
            endpoint_url=endpoint_url,
            max_retry=max_retry,
        )

    @abstractmethod
    def evaluate(self) -> TestResult:
        """Conduct a test.

        Returns:
            TestResult: The result of the test.
        """
        pass

    def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]:
        """Resolve an optional hook name to its `Hook` subclass.

        Args:
            hook (str, optional): The hook identifier from the test, or `None`.

        Returns:
            type[Hook] | None: The resolved hook class, or `None` when no hook
                is configured.
        """
        if not hook:
            return None
        return import_class(hook, parent_class=Hook)

    def invoke_model(self, request_body: dict) -> dict:
        """
        Invoke the Bedrock model using the `bedrock_runtime_client`. This method
        will convert a request dictionary to a JSON string before passing it to
        the `InvokeModel` API.

        Refer to the `boto3` documentation for more details.

        Args:
            request_body (dict): The request payload as a dictionary.

        Returns:
            dict: The response from the model invocation.
        """
        response = self.bedrock_runtime_client.invoke_model(
            modelId=self.model_id, body=json.dumps(request_body)
        )

        # Accumulate token usage reported in the Bedrock response headers.
        self._incr_token_counts(response)

        return response

    def _incr_token_counts(self, response: dict):
        """Accumulate input/output token counts from a Bedrock response.

        Missing usage headers default to 0, so responses without token
        metadata are handled safely.

        Args:
            response (dict): The raw `InvokeModel` response.
        """
        headers = response["ResponseMetadata"]["HTTPHeaders"]

        self.input_token_count += int(
            headers.get("x-amzn-bedrock-input-token-count", 0)
        )
        self.output_token_count += int(
            headers.get("x-amzn-bedrock-output-token-count", 0)
        )

    def run(self) -> TestResult:
        """
        Run the evaluator within a trace context manager and run hooks
        if provided.

        Returns:
            TestResult: The result of the test, also stored on `self.test_result`.
        """
        hook_cls = self._get_hook_cls(self.test.hook)

        with self.trace:
            if hook_cls:
                hook_cls.pre_evaluate(self.test, self.trace)
            self.test_result = self.evaluate()
            if hook_cls:
                hook_cls.post_evaluate(self.test, self.test_result, self.trace)

        return self.test_result

__init__(test, target, work_dir, model_id, provisioned_throughput_arn=None, aws_profile=None, aws_region=None, endpoint_url=None, max_retry=_DEFAULT_MAX_RETRY)

Initialize the evaluator instance for a given Test and Target.

Parameters:

Name Type Description Default
test Test

The test case.

required
target BaseTarget

The target agent being evaluated.

required
work_dir str

The work directory.

required
model_id str

The ID of the Bedrock model used to run evaluation.

required
provisioned_throughput_arn str

The ARN of the provisioned throughput.

None
aws_profile str

The AWS profile name.

None
aws_region str

The AWS region.

None
endpoint_url str

The endpoint URL for the AWS service.

None
max_retry int

The maximum number of retry attempts.

_DEFAULT_MAX_RETRY
Source code in src/agenteval/evaluators/base_evaluator.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def __init__(
    self,
    test: Test,
    target: BaseTarget,
    work_dir: str,
    model_id: str,
    provisioned_throughput_arn: Optional[str] = None,
    aws_profile: Optional[str] = None,
    aws_region: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    max_retry: int = _DEFAULT_MAX_RETRY,
):
    """Set up the evaluator for a single `Test` against a single `Target`.

    Args:
        test (Test): The test case.
        target (BaseTarget): The target agent being evaluated.
        work_dir (str): The work directory.
        model_id (str): The ID of the Bedrock model used to run evaluation.
        provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput.
        aws_profile (str, optional): The AWS profile name.
        aws_region (str, optional): The AWS region.
        endpoint_url (str, optional): The endpoint URL for the AWS service.
        max_retry (int, optional): The maximum number of retry attempts.
    """
    # Subjects of the evaluation.
    self.test = test
    self.target = target

    # Fresh per-run state: conversation record, trace, and result slot.
    self.conversation = Conversation()
    self.trace = Trace(work_dir=work_dir, test_name=test.name)
    self.test_result = None

    # Token usage accumulators, updated on each model invocation.
    self.input_token_count = 0
    self.output_token_count = 0

    # A provisioned throughput ARN, when given, overrides the base model ID.
    self.model_id = model_id if not provisioned_throughput_arn else provisioned_throughput_arn

    self.bedrock_runtime_client = create_boto3_client(
        boto3_service_name=_BOTO3_SERVICE_NAME,
        aws_profile=aws_profile,
        aws_region=aws_region,
        endpoint_url=endpoint_url,
        max_retry=max_retry,
    )

evaluate() abstractmethod

Conduct a test.

Returns:

Name Type Description
TestResult TestResult

The result of the test.

Source code in src/agenteval/evaluators/base_evaluator.py
78
79
80
81
82
83
84
85
@abstractmethod
def evaluate(self) -> TestResult:
    """Conduct a test.

    Concrete evaluators implement the evaluation logic here; the returned
    result is stored on `self.test_result` by `run`.

    Returns:
        TestResult: The result of the test.
    """
    pass

invoke_model(request_body)

Invoke the Bedrock model using the bedrock_runtime_client. This method will convert a request dictionary to a JSON string before passing it to the InvokeModel API.

Refer to the boto3 documentation for more details.

Parameters:

Name Type Description Default
request_body dict

The request payload as a dictionary.

required

Returns:

Name Type Description
dict dict

The response from the model invocation.

Source code in src/agenteval/evaluators/base_evaluator.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def invoke_model(self, request_body: dict) -> dict:
    """
    Invoke the Bedrock model using the `bedrock_runtime_client`. This method
    will convert a request dictionary to a JSON string before passing it to
    the `InvokeModel` API.

    Refer to the `boto3` documentation for more details.

    Args:
        request_body (dict): The request payload as a dictionary.

    Returns:
        dict: The response from the model invocation.
    """
    response = self.bedrock_runtime_client.invoke_model(
        modelId=self.model_id, body=json.dumps(request_body)
    )

    # Accumulate token usage reported in the Bedrock response headers.
    self._incr_token_counts(response)

    return response

run()

Run the evaluator within a trace context manager and run hooks if provided.

Source code in src/agenteval/evaluators/base_evaluator.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def run(self) -> TestResult:
    """Run the evaluator within a trace context manager and run hooks if provided."""
    hook = self._get_hook_cls(self.test.hook)
    with self.trace:
        if hook:
            hook.pre_evaluate(self.test, self.trace)
        self.test_result = self.evaluate()
        if hook:
            hook.post_evaluate(self.test, self.test_result, self.trace)
    return self.test_result