evaluator

`BaseEvaluator`

Bases: ABC

The BaseEvaluator abstract base class defines the common interface for evaluator classes.

Attributes:

Name	Type	Description
`test`	`Test`	The test case.
`target`	`BaseTarget`	The target agent being evaluated.
`conversation`	`Conversation`	Captures the interaction between a user and an agent.
`trace`	`Trace`	Captures steps during evaluation.
`test_result`	`TestResult`	The result of the test which is set in `BaseEvaluator.run`.
`input_token_count`	`int`	Number of input tokens processed by the evaluator.
`output_token_count`	`int`	Number of output tokens generated by the evaluator.
`model_id`	`str`	The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, then this will be set to the ARN of the provisioned throughput.
`bedrock_runtime_client`	`BaseClient`	A `boto3` client representing Amazon Bedrock Runtime.

Source code in src/agenteval/evaluators/base_evaluator.py

class BaseEvaluator(ABC):
    """The `BaseEvaluator` abstract base class defines the common interface for evaluator
    classes.

    Subclasses implement `evaluate`; callers normally invoke `run`, which wraps
    `evaluate` in the trace context manager and runs the optional hook.

    Attributes:
        test (Test): The test case.
        target (BaseTarget): The target agent being evaluated.
        conversation (Conversation): Captures the interaction between a user and an agent.
        trace (Trace): Captures steps during evaluation.
        test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`.
            It is `None` until `run` has completed an evaluation.
        input_token_count (int): Number of input tokens processed by the evaluator.
        output_token_count (int): Number of output tokens generated by the evaluator.
        model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided,
            then this will be set to the ARN of the provisioned throughput.
        bedrock_runtime_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime.
    """

    def __init__(
        self,
        test: Test,
        target: BaseTarget,
        work_dir: str,
        model_id: str,
        provisioned_throughput_arn: Optional[str] = None,
        aws_profile: Optional[str] = None,
        aws_region: Optional[str] = None,
        endpoint_url: Optional[str] = None,
        max_retry: int = _DEFAULT_MAX_RETRY,
    ):
        """Initialize the evaluator instance for a given `Test` and `Target`.

        Args:
            test (Test): The test case.
            target (BaseTarget): The target agent being evaluated.
            work_dir (str): The work directory.
            model_id (str): The ID of the Bedrock model used to run evaluation.
            provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput.
                When given (and non-empty), it is used in place of `model_id`.
            aws_profile (str, optional): The AWS profile name.
            aws_region (str, optional): The AWS region.
            endpoint_url (str, optional): The endpoint URL for the AWS service.
            max_retry (int, optional): The maximum number of retry attempts.
        """
        self.test = test
        self.target = target
        self.conversation = Conversation()
        self.trace = Trace(work_dir=work_dir, test_name=test.name)
        self.test_result = None
        self.input_token_count = 0
        self.output_token_count = 0
        # A provisioned throughput ARN takes precedence over the plain model ID.
        self.model_id = provisioned_throughput_arn or model_id
        self.bedrock_runtime_client = create_boto3_client(
            boto3_service_name=_BOTO3_SERVICE_NAME,
            aws_profile=aws_profile,
            aws_region=aws_region,
            endpoint_url=endpoint_url,
            max_retry=max_retry,
        )

    @abstractmethod
    def evaluate(self) -> TestResult:
        """Conduct a test.

        Returns:
            TestResult: The result of the test.
        """

    def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]:
        """Resolve the hook class named by `hook`.

        Args:
            hook (str, optional): Identifier of the hook class, passed to `import_class`.

        Returns:
            Optional[type[Hook]]: The imported hook class, or `None` when no hook
                is configured (`hook` is `None` or empty).
        """
        if not hook:
            return None

        return import_class(hook, parent_class=Hook)

    def invoke_model(self, request_body: dict) -> dict:
        """
        Invoke the Bedrock model using the `bedrock_runtime_client`. This method will convert
        a request dictionary to a JSON string before passing it to the `InvokeModel` API.

        The evaluator's token counters are updated from the response before it is returned.

        Refer to the `boto3` documentation for more details.

        Args:
            request_body (dict): The request payload as a dictionary.

        Returns:
            dict: The response from the model invocation.

        """
        response = self.bedrock_runtime_client.invoke_model(
            modelId=self.model_id, body=json.dumps(request_body)
        )

        self._incr_token_counts(response)

        return response

    def _incr_token_counts(self, response: dict):
        """Add the token counts reported in the response's HTTP headers to the running totals.

        Args:
            response (dict): A model invocation response; it must contain
                `response["ResponseMetadata"]["HTTPHeaders"]`.
        """
        headers = response["ResponseMetadata"]["HTTPHeaders"]

        # A missing header counts as zero tokens.
        self.input_token_count += int(
            headers.get("x-amzn-bedrock-input-token-count", 0)
        )
        self.output_token_count += int(
            headers.get("x-amzn-bedrock-output-token-count", 0)
        )

    def run(self) -> TestResult:
        """
        Run the evaluator within a trace context manager and run hooks
        if provided.

        Returns:
            TestResult: The result produced by `evaluate`, also stored in `test_result`.
        """

        hook_cls = self._get_hook_cls(self.test.hook)

        with self.trace:
            if hook_cls:
                hook_cls.pre_evaluate(self.test, self.trace)
            self.test_result = self.evaluate()
            if hook_cls:
                hook_cls.post_evaluate(self.test, self.test_result, self.trace)

        return self.test_result

`__init__(test, target, work_dir, model_id, provisioned_throughput_arn=None, aws_profile=None, aws_region=None, endpoint_url=None, max_retry=_DEFAULT_MAX_RETRY)`

Initialize the evaluator instance for a given Test and Target.

Parameters:

Name	Type	Description	Default
`test`	`Test`	The test case.	required
`target`	`BaseTarget`	The target agent being evaluated.	required
`work_dir`	`str`	The work directory.	required
`model_id`	`str`	The ID of the Bedrock model used to run evaluation.	required
`provisioned_throughput_arn`	`str`	The ARN of the provisioned throughput.	`None`
`aws_profile`	`str`	The AWS profile name.	`None`
`aws_region`	`str`	The AWS region.	`None`
`endpoint_url`	`str`	The endpoint URL for the AWS service.	`None`
`max_retry`	`int`	The maximum number of retry attempts.	`_DEFAULT_MAX_RETRY`

Source code in src/agenteval/evaluators/base_evaluator.py

def __init__(
    self,
    test: Test,
    target: BaseTarget,
    work_dir: str,
    model_id: str,
    provisioned_throughput_arn: Optional[str] = None,
    aws_profile: Optional[str] = None,
    aws_region: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    max_retry: int = _DEFAULT_MAX_RETRY,
):
    """Set up the evaluator state and Bedrock Runtime client for one `Test` and `Target`.

    Args:
        test (Test): The test case.
        target (BaseTarget): The target agent being evaluated.
        work_dir (str): The work directory, passed to the `Trace`.
        model_id (str): The ID of the Bedrock model used to run evaluation.
        provisioned_throughput_arn (str, optional): The ARN of the provisioned
            throughput; used as the model ID when supplied.
        aws_profile (str, optional): The AWS profile name.
        aws_region (str, optional): The AWS region.
        endpoint_url (str, optional): The endpoint URL for the AWS service.
        max_retry (int, optional): The maximum number of retry attempts.
    """
    # What is being evaluated.
    self.test = test
    self.target = target

    # Per-run records of the interaction and the evaluation steps.
    self.conversation = Conversation()
    self.trace = Trace(work_dir=work_dir, test_name=test.name)

    # No result and no token usage until the evaluator has run.
    self.test_result = None
    self.input_token_count = 0
    self.output_token_count = 0

    # Prefer the provisioned throughput ARN over the plain model ID.
    if provisioned_throughput_arn:
        self.model_id = provisioned_throughput_arn
    else:
        self.model_id = model_id

    client_options = {
        "boto3_service_name": _BOTO3_SERVICE_NAME,
        "aws_profile": aws_profile,
        "aws_region": aws_region,
        "endpoint_url": endpoint_url,
        "max_retry": max_retry,
    }
    self.bedrock_runtime_client = create_boto3_client(**client_options)

`evaluate()` `abstractmethod`

Conduct a test.

Returns:

Name	Type	Description
`TestResult`	`TestResult`	The result of the test.

Source code in src/agenteval/evaluators/base_evaluator.py

@abstractmethod
def evaluate(self) -> TestResult:
    """Carry out the test; concrete evaluators must override this method.

    Returns:
        TestResult: The result of the test.
    """

`invoke_model(request_body)`

Invoke the Bedrock model using the `bedrock_runtime_client`. This method will convert a request dictionary to a JSON string before passing it to the `InvokeModel` API.

Refer to the boto3 documentation for more details.

Parameters:

Name	Type	Description	Default
`request_body`	`dict`	The request payload as a dictionary.	required

Returns:

Name	Type	Description
`dict`	`dict`	The response from the model invocation.

Source code in src/agenteval/evaluators/base_evaluator.py

def invoke_model(self, request_body: dict) -> dict:
    """
    Send a request to the Bedrock model through `bedrock_runtime_client`.

    The request dictionary is serialized to a JSON string and passed to the
    `InvokeModel` API together with `model_id`. The evaluator's token
    counters are updated from the response before it is returned.

    Refer to the `boto3` documentation for more details.

    Args:
        request_body (dict): The request payload as a dictionary.

    Returns:
        dict: The response from the model invocation.

    """
    serialized_body = json.dumps(request_body)
    client = self.bedrock_runtime_client

    result = client.invoke_model(modelId=self.model_id, body=serialized_body)
    self._incr_token_counts(result)

    return result

`run()`

Run the evaluator within a trace context manager and run hooks if provided.

Source code in src/agenteval/evaluators/base_evaluator.py

def run(self) -> TestResult:
    """
    Execute the evaluation inside the trace context manager, calling the
    test's hook before and after `evaluate` when a hook is configured.

    Returns:
        TestResult: The value stored in `test_result` by this run.
    """
    hook = self._get_hook_cls(self.test.hook)

    with self.trace:
        if hook:
            hook.pre_evaluate(self.test, self.trace)

        self.test_result = self.evaluate()

        if hook:
            hook.post_evaluate(self.test, self.test_result, self.trace)

    return self.test_result

evaluator

BaseEvaluator

__init__(test, target, work_dir, model_id, provisioned_throughput_arn=None, aws_profile=None, aws_region=None, endpoint_url=None, max_retry=_DEFAULT_MAX_RETRY)

evaluate() abstractmethod

invoke_model(request_body)

run()

`BaseEvaluator`

`__init__(test, target, work_dir, model_id, provisioned_throughput_arn=None, aws_profile=None, aws_region=None, endpoint_url=None, max_retry=_DEFAULT_MAX_RETRY)`

`evaluate()` `abstractmethod`

`invoke_model(request_body)`

`run()`