Test Runs via SDK

Python

Learn how to run test runs programmatically with Maxim's SDK, using custom datasets, flexible output functions, and evaluators for your AI applications.

The SDK uses a builder pattern to configure and run tests. Here's a basic example:

Test run template
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("My First SDK Test", "your-workspace-id")
    .with_data_structure(...)     # your data structure here
    .with_data(...)               # your data here
    .yields_output(...)           # your output function here
    .with_workflow_id(...)        # or pass a workflow ID from the Maxim platform
    .with_prompt_version_id(...)  # or pass a prompt version ID from the Maxim platform
    .with_evaluators(...)         # your evaluators here
    .run()
)

To copy the workspace ID, open the workspace switcher in the left topbar and click the info icon.


Understanding Data Structure

The data structure maps your data columns to specific types that Maxim understands, keeping your test run type-safe and letting the SDK validate each column.

Basic Structure

The data structure is a dictionary where keys are your column names and values are the column types.

Example data structure
data_structure = {
    "myQuestionColumn": "INPUT",
    "expectedAnswerColumn": "EXPECTED_OUTPUT",
    "contextColumn": "CONTEXT_TO_EVALUATE",
    "additionalDataColumn": "VARIABLE"
}
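
With this structure, every row you supply (from a platform dataset, a list, or a callable) is expected to carry those same column names. A hypothetical row matching the structure above:

Example data row
row = {
    "myQuestionColumn": "What is the capital of France?",
    "expectedAnswerColumn": "Paris",
    "contextColumn": "France is a country in Western Europe...",
    "additionalDataColumn": "geography"
}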

Available Types

  • INPUT - Main input text (only one allowed)
  • EXPECTED_OUTPUT - Expected response (only one allowed)
  • CONTEXT_TO_EVALUATE - Context for evaluation (only one allowed)
  • VARIABLE - Additional data columns (multiple allowed)
  • NULLABLE_VARIABLE - Optional data columns (multiple allowed)
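
Only one column each can be marked INPUT, EXPECTED_OUTPUT, or CONTEXT_TO_EVALUATE, while VARIABLE and NULLABLE_VARIABLE can appear as many times as you need. A minimal sketch with hypothetical column names:

Data structure with multiple variable columns
data_structure = {
    "question": "INPUT",
    "reference_answer": "EXPECTED_OUTPUT",
    "retrieved_docs": "CONTEXT_TO_EVALUATE",
    "user_persona": "VARIABLE",            # extra column passed through to your output function
    "session_id": "VARIABLE",              # multiple VARIABLE columns are allowed
    "reviewer_notes": "NULLABLE_VARIABLE"  # may be empty for some rows
}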

Example

Using the data structure
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Question Answering Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE",
        "metadata": "NULLABLE_VARIABLE"
    })
    # ... rest of the configuration
)

Working with Data Sources

Maxim's SDK supports multiple ways to provide test data:

1. Callable

Using a callable
from typing import Any, Dict, Optional

from maxim import Maxim, Config

maxim = Maxim(Config(api_key="YOUR_API_KEY"))

index = 0

def get_next_row() -> Optional[Dict[str, Any]]:
    global index
    index += 1
    return db.get_row(index)  # `db` stands in for your own data source


result = (
    maxim.create_test_run("CSV Test Run", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data(get_next_row)
    # ... rest of the configuration
)
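
The Optional return type suggests that returning None signals the end of the data. As a sketch, here is a callable that streams rows from a local CSV file (the file name and column headers are placeholders for your own data):

Streaming rows from a CSV file
import csv
from typing import Any, Dict, Optional

# Hypothetical CSV whose headers match the data structure above
with open("qa_dataset.csv", newline="") as f:
    rows = list(csv.DictReader(f))

current_index = -1

def get_next_row() -> Optional[Dict[str, Any]]:
    global current_index
    current_index += 1
    if current_index >= len(rows):
        return None  # assumption: None marks the end of the data
    row = rows[current_index]
    return {
        "question": row["question"],
        "answer": row["answer"],
        "context": row["context"],
    }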

2. Manual Data Array

For smaller datasets or programmatically generated data:

Using a manual data array
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
manual_data = [
    {
        "question": "What is the capital of France?",
        "answer": "Paris",
        "context": "France is a country in Western Europe..."
    },
    {
        "question": "Who wrote Romeo and Juliet?",
        "answer": "William Shakespeare",
        "context": "William Shakespeare was an English playwright..."
    }
]

result = (
    maxim.create_test_run("Manual Data Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data(manual_data)
    # ... rest of the configuration
)

3. Platform Dataset

Use existing datasets from your Maxim workspace:

Using a platform dataset
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Platform Dataset Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data("your-dataset-id")
    # ... rest of the configuration
)

Trigger a test on a workflow stored on the Maxim platform

Trigger test run on a workflow
result = (
    maxim.create_test_run("Custom Output Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data(my_data)
    # context_to_evaluate is optional; it can be either a variable used in the
    # workflow or a column name present in the dataset
    .with_workflow_id(workflow_id_from_dashboard, context_to_evaluate)
)

To get the workflow ID, go to the Workflows tab and click Copy ID from the workflow's menu.


Trigger a test on a prompt version stored on the Maxim platform

Trigger test run on a prompt version
result = (
    maxim.create_test_run("Custom Output Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data(my_data)
    # context_to_evaluate is optional; it can be either a variable used in the
    # prompt or a column name present in the dataset
    .with_prompt_version_id(prompt_version_id_from_platform, context_to_evaluate)
)

To get the prompt version ID, go to the Prompts tab, select the version you want to test, and click Copy version ID from the menu.


Custom Output Function

The output function is where you define how to generate responses for your test cases:

Implementing a custom output function
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
def run(data):
    # ======================================================#
    # REPLACE THIS WITH YOUR ACTUAL WORKFLOW / MODEL CALL
    # ======================================================#
    response = {
        "text": "dummy response",
        "retrieved_context": "dummy context used by the model",
        "usage": {
            "prompt_tokens": 10,
            "completion_tokens": 20,
            "total_tokens": 30,
            "latency": 233
        },
        "cost": {
            "input_cost": 0.2,
            "output_cost": 0.002,
            "total_cost": 0.202
        }
    }
    # ======================================================#
    # END
    # ======================================================#
    return {
        # Required: The actual output
        "data": response["text"],

        # Optional: Context used for evaluation
        # Returning a value here will utilize this context for
        # evaluation instead of the CONTEXT_TO_EVALUATE column (if provided)
        "retrieved_context_to_evaluate": response["retrieved_context"],

        # Optional: Performance metrics
        "meta": {
            "usage": {
                "prompt_tokens": response["usage"]["prompt_tokens"],
                "completion_tokens": response["usage"]["completion_tokens"],
                "total_tokens": response["usage"]["total_tokens"],
                "latency": response["usage"]["latency"]
            },
            "cost": {
                "input_cost": response["cost"]["input_cost"],
                "output_cost": response["cost"]["output_cost"],
                "total_cost": response["cost"]["total_cost"]
            }
        }
    }

result = (
    maxim.create_test_run("Custom Output Test", workspace_id)
    .with_data_structure({
        "question": "INPUT",
        "answer": "EXPECTED_OUTPUT",
        "context": "CONTEXT_TO_EVALUATE"
    })
    .with_data(my_data)
    .yields_output(lambda data: run(data))
)

If your output function raises an exception, that entry is marked as failed and its index is included in the failed_entry_indices array returned after the run completes.
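
As a more concrete illustration, here is a sketch of an output function wrapping an OpenAI chat completion (the model name, prompt, and per-token prices are placeholders, and rows are assumed to arrive as dicts keyed by the column names used above; any provider call can be mapped the same way):

Output function wrapping a model call (sketch)
import time
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def run(data):
    start = time.time()
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model
        messages=[
            {"role": "system", "content": "Answer using only the provided context."},
            {"role": "user", "content": f"Context:\n{data['context']}\n\nQuestion: {data['question']}"},
        ],
    )
    latency_ms = (time.time() - start) * 1000
    usage = completion.usage

    # Illustrative per-million-token prices; substitute your provider's pricing
    input_cost = usage.prompt_tokens * 0.15 / 1_000_000
    output_cost = usage.completion_tokens * 0.60 / 1_000_000

    return {
        "data": completion.choices[0].message.content,
        "meta": {
            "usage": {
                "prompt_tokens": usage.prompt_tokens,
                "completion_tokens": usage.completion_tokens,
                "total_tokens": usage.total_tokens,
                "latency": latency_ms,
            },
            "cost": {
                "input_cost": input_cost,
                "output_cost": output_cost,
                "total_cost": input_cost + output_cost,
            },
        },
    }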

Adding Evaluators

Choose which evaluators to use for your test run:

Adding evaluators
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Evaluated Test", workspace_id)
    # ... previous configuration
    .with_evaluators(
        "Faithfulness",  # names of evaluators installed in your workspace
        "Semantic Similarity",
        "Answer Relevance"
    )
)

Human Evaluation

Evaluators that require human input need a human evaluation configuration, which you can set up as follows:

Setting up human evaluation
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Human Evaluated Test", workspace_id)
    # ... previous configuration
    .with_evaluators("Human Evaluator")
    .with_human_evaluation_config({
        "emails": ["[email protected]"],
        "instructions": "Please evaluate the response according to the evaluation criteria"
    })
)

Custom Evaluators

You can create custom evaluators to implement specific evaluation logic for your test runs:

Creating custom evaluators
from typing import Dict

from maxim import Maxim
from maxim.evaluators import BaseEvaluator
from maxim.models import (
    Data,
    LocalEvaluatorResultParameter,
    LocalEvaluatorReturn,
    ManualData,
    PassFailCriteria,
    PassFailCriteriaForTestrunOverall,
    PassFailCriteriaOnEachEntry,
    TestRunLogger,
    YieldedOutput,
)
 
class MyCustomEvaluator(BaseEvaluator):
    # implement evaluate function
    def evaluate(
        self, result: LocalEvaluatorResultParameter, data: ManualData
    ) -> Dict[str, LocalEvaluatorReturn]:
        # You can pass as many scores as you want in this dict
        # All of these will show up in the test run report
        return {
            "apostrophe-checker-2": LocalEvaluatorReturn(
                score="'" in result.output,
                reasoning="The output contains an apostrophe" if "'" in result.output else "The output does not contain an apostrophe"
            ),
            "contains_special_characters": LocalEvaluatorReturn(
                score=len([char for char in result.output if char in '!@#$%^&*(),.?":{}|<>']),
                reasoning="The output contains special characters"
            )
        }

Using Custom Evaluators

Once created, custom evaluators can be used alongside built-in evaluators:

Using custom evaluators
result = (
    maxim.create_test_run("Custom Evaluated Test", workspace_id)
    # ... previous configuration
    .with_evaluators(
        # Platform evaluators
        "Faithfulness",
        "Semantic Similarity",
        # Custom evaluators
        MyCustomEvaluator(
            pass_fail_criteria={
                "apostrophe-checker-2": PassFailCriteria(
                    on_each_entry_pass_if=PassFailCriteriaOnEachEntry(
                        score_should_be="=",
                        value=True
                    ),
                    for_testrun_overall_pass_if=PassFailCriteriaForTestrunOverall(
                        overall_should_be=">=",
                        value=80,
                        for_result="percentageOfPassedResults"
                    )
                ),
                "contains-special-characters": PassFailCriteria(
                    on_each_entry_pass_if=PassFailCriteriaOnEachEntry(
                        score_should_be=">",
                        value=3
                    ),
                    for_testrun_overall_pass_if=PassFailCriteriaForTestrunOverall(
                        overall_should_be=">=",
                        value=80,
                        for_result="percentageOfPassedResults"
                    )
                )
            }
        )
    )
)

Advanced Configuration

Concurrency Control

Manage how many entries are processed in parallel:

Configuring concurrency
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Concurrent Test", workspace_id)
    # ... previous configuration
    .with_concurrency(5)  # Process 5 entries at a time
)

Timeout Configuration

Set a custom timeout for long-running tests:

Configuring timeout
from maxim import Maxim, Config
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
result = (
    maxim.create_test_run("Long Test", workspace_id)
    # ... previous configuration
    .run(120)  # Wait up to 120 minutes
)

Complete Example

Here's a complete example combining all the features:

Test run example
import time

from maxim import Maxim, Config
from maxim.evaluators.evaluators import create_custom_evaluator
from maxim.models.dataset import ManualData
from maxim.models.evaluator import (
    LocalEvaluatorResultParameter,
    LocalEvaluatorReturn,
    PassFailCriteria,
    PassFailCriteriaForTestrunOverall,
    PassFailCriteriaOnEachEntry,
)
 
maxim = Maxim(Config(api_key="YOUR_API_KEY"))
 
def apostrophe_checker(result: LocalEvaluatorResultParameter, data: ManualData) -> LocalEvaluatorReturn:
    if "'" in result.output:
        return LocalEvaluatorReturn(
            score=True,
            reasoning="The output contains an apostrophe"
        )
    else:
        return LocalEvaluatorReturn(
            score=False,
            reasoning="The output does not contain an apostrophe"
        )
 
custom_evaluator = create_custom_evaluator(
    "apostrophe-checker",
    apostrophe_checker,
    PassFailCriteria(
        on_each_entry_pass_if=PassFailCriteriaOnEachEntry(
            score_should_be="=",
            value=True
        ),
        for_testrun_overall_pass_if=PassFailCriteriaForTestrunOverall(
            overall_should_be=">=",
            value=80,
            for_result="percentageOfPassedResults"
        )
    )
)
 
def run(data):
    start_time = time.time()
 
    # Your model call here
    response = your_model.generate_answer(
        data["question"],
        data["context"]
    )
 
    latency = (time.time() - start_time) * 1000  # Convert to milliseconds
 
    return {
        "data": response.answer,
        # Returning a value here will utilize this context for
        # evaluation instead of the CONTEXT_TO_EVALUATE column
        # (in this case, the `context` column)
        "retrieved_context_to_evaluate": response.retrieved_context,
        "meta": {
            "usage": {
                "prompt_tokens": response.tokens.prompt,
                "completion_tokens": response.tokens.completion,
                "total_tokens": response.tokens.total,
                "latency": latency
            },
            "cost": {
                "input_cost": response.cost.prompt,
                "output_cost": response.cost.completion,
                "total_cost": response.cost.total
            }
        }
    }
 
 
try:
    result = (
        maxim.create_test_run(f"QA Evaluation {time.time()}", "your-workspace-id")
        .with_data_structure({
            "question": "INPUT",
            "expected_answer": "EXPECTED_OUTPUT",
            "context": "CONTEXT_TO_EVALUATE",
            "metadata": "NULLABLE_VARIABLE"
        })
        .with_data(test_data)  # e.g. a list of dicts, as in the Manual Data Array section
        .yields_output(lambda data: run(data))
        .with_evaluators(
            custom_evaluator,
            "Faithfulness",
            "Answer Relevance",
            "Human Evaluator"
        )
        .with_human_evaluation_config({
            "emails": ["[email protected]"],
            "instructions": "Please evaluate the responses for accuracy and completeness. Consider both factual correctness and answer format."
        })
        .with_concurrency(10)
        .run(30)  # 30 minutes timeout
    )

    print("Test Run Link:", result.test_run_result.link)
    print("Failed Entries:", result.failed_entry_indices)
    print("Evaluation Results:", result.test_run_result.result[0])
    """
    the result.test_run_result.result[0] object looks like this (values are mock data):
    {
        cost: {
            input: 1.905419538506091,
            completion: 2.010163610111029,
            total: 3.915583148617119
        },
        latency: {
            min: 6,
            max: 484.5761906393187,
            p50: 438,
            p90: 484,
            p95: 484,
            p99: 484,
            mean: 346.2,
            standard_deviation: 179.4284,
            total: 5
        },
        name: 'sdk test run 1734931207308',
        usage: { completion: 206, input: 150, total: 356 },
        individual_evaluator_mean_score: {
            Faithfulness: { score: 0, outOf: 1, pass: False },
            'Answer Relevance': { score: 0.2, outOf: 1, pass: True },
            'apostrophe-checker': { score: 0.7, pass: False },
        }
    }
    """
except Exception as e:
    print("Test Run Failed:", e)
finally:
    maxim.cleanup()
