Testing Tools in LLM Applications¶
Guide to testing SharedTools in LLM-powered applications, including strategies for testing tool interactions, LLM responses, and end-to-end workflows.
Testing Challenges with LLMs¶
LLM applications present unique testing challenges:
- Non-determinism: LLMs may return different outputs for the same input
- Tool Selection: LLM chooses which tools to use
- Parameter Generation: LLM generates tool parameters
- Multi-step Workflows: LLM orchestrates multiple tool calls
- Cost: API calls to LLMs can be expensive
- Speed: LLM responses can be slow
Testing Strategies¶
Strategy 1: Unit Test Tools in Isolation¶
Test tools without involving the LLM:
RSpec.describe SharedTools::Tools::BrowserTool do
let(:mock_driver) { instance_double(Browser::WatirDriver) }
let(:tool) { described_class.new(driver: mock_driver) }
it "executes visit action" do
expect(mock_driver).to receive(:goto).with(url: "https://test.com")
result = tool.execute(action: "visit", url: "https://test.com")
expect(result).to include("Navigated")
end
end
Benefits: - Fast - Deterministic - No LLM costs - Easy to debug
When to use: Always, as foundation of test suite
Strategy 2: Test Tool Selection Logic¶
Mock the LLM's tool selection:
RSpec.describe "Tool selection" do
let(:tools) do
[
SharedTools::Tools::BrowserTool.new(driver: mock_browser),
SharedTools::Tools::DiskTool.new(driver: mock_disk)
]
end
it "selects BrowserTool for web scraping" do
# Simulate LLM choosing browser tool
selected_tool = select_tool_for_task(
task: "Visit https://example.com",
tools: tools
)
expect(selected_tool).to be_a(SharedTools::Tools::BrowserTool)
end
it "selects DiskTool for file operations" do
selected_tool = select_tool_for_task(
task: "Read file.txt",
tools: tools
)
expect(selected_tool).to be_a(SharedTools::Tools::DiskTool)
end
end
Strategy 3: Test with Fixed LLM Responses¶
Record LLM responses and replay them:
RSpec.describe "LLM integration" do
let(:recorded_responses) do
{
"Visit example.com" => {
tool: "browser_tool",
action: "visit",
url: "https://example.com"
},
"Read file.txt" => {
tool: "disk_tool",
action: "file_read",
path: "./file.txt"
}
}
end
let(:mock_llm) do
double("LLM").tap do |llm|
allow(llm).to receive(:call) do |prompt|
recorded_responses[prompt]
end
end
end
it "handles visit command" do
response = mock_llm.call("Visit example.com")
tool = get_tool(response[:tool])
result = tool.execute(**response.except(:tool))
expect(result).to include("Navigated")
end
end
Strategy 4: Test with Real LLM (Sparingly)¶
Use real LLM for integration tests:
RSpec.describe "Full LLM integration", :slow, :integration do
let(:agent) do
RubyLLM::Agent.new(
tools: [
SharedTools::Tools::BrowserTool.new(driver: mock_browser),
SharedTools::Tools::DiskTool.new(driver: mock_disk)
]
)
end
it "scrapes and saves data" do
# This makes a real LLM API call
response = agent.process("Visit example.com and save the title to title.txt")
# Verify the LLM used the right tools
expect(mock_browser).to have_received(:goto)
expect(mock_disk).to have_received(:file_write)
end
end
Run sparingly:
Testing Patterns¶
Pattern 1: Mock LLM Agent¶
Create a predictable fake LLM:
class MockLLMAgent
def initialize(responses:)
@responses = responses
@call_count = 0
end
def call(prompt)
@call_count += 1
@responses[prompt] || raise("Unexpected prompt: #{prompt}")
end
attr_reader :call_count
end
RSpec.describe "LLM workflow" do
let(:agent) do
MockLLMAgent.new(
responses: {
"Scrape products" => {
tool: "browser_tool",
action: "visit",
url: "https://shop.com/products"
},
"Save to database" => {
tool: "database_tool",
statements: ["INSERT INTO products..."]
}
}
)
end
it "follows the workflow" do
agent.call("Scrape products")
agent.call("Save to database")
expect(agent.call_count).to eq(2)
end
end
Pattern 2: Fixture-Based Testing¶
Store LLM responses as fixtures:
# spec/fixtures/llm_responses/scrape_workflow.json
{
"steps": [
{
"prompt": "Visit the products page",
"response": {
"tool": "browser_tool",
"action": "visit",
"url": "https://example.com/products"
}
},
{
"prompt": "Get page content",
"response": {
"tool": "browser_tool",
"action": "page_inspect",
"full_html": true
}
}
]
}
# In tests
let(:workflow) { JSON.parse(File.read('spec/fixtures/llm_responses/scrape_workflow.json')) }
it "executes workflow" do
workflow['steps'].each do |step|
response = step['response']
tool = get_tool(response['tool'])
result = tool.execute(**response.except('tool'))
expect(result).to be_truthy
end
end
Pattern 3: Assertion on Tool Usage¶
Verify the LLM used tools correctly:
RSpec.describe "Tool usage tracking" do
let(:browser) { instance_spy(SharedTools::Tools::BrowserTool) }
let(:disk) { instance_spy(SharedTools::Tools::DiskTool) }
let(:agent) do
LLMAgent.new(tools: [browser, disk])
end
it "uses browser before disk" do
agent.process("Scrape example.com and save to file.txt")
# Verify order of operations
expect(browser).to have_received(:execute).ordered
expect(disk).to have_received(:execute).ordered
end
it "passes correct parameters" do
agent.process("Visit https://example.com")
expect(browser).to have_received(:execute).with(
hash_including(
action: "visit",
url: "https://example.com"
)
)
end
end
Pattern 4: Snapshot Testing¶
Record full LLM interactions for regression testing:
RSpec.describe "LLM snapshots" do
it "matches recorded interaction" do
VCR.use_cassette("scraping_workflow") do
result = agent.process("Scrape products from example.com")
expect(result).to match_snapshot("scraping_workflow_result")
end
end
end
Testing Tool Descriptions¶
Test that tool descriptions are accurate:
RSpec.describe "Tool descriptions" do
let(:tool) { SharedTools::Tools::BrowserTool }
it "includes all actions in description" do
description = tool.description
SharedTools::Tools::BrowserTool::ACTIONS.each do |action|
expect(description).to include(action)
end
end
it "documents required parameters" do
description = tool.description
expect(description).to include("action")
expect(description).to include("url")
expect(description).to include("selector")
end
it "provides usage examples" do
description = tool.description
expect(description).to match(/"action":\s*"visit"/)
expect(description).to match(/"action":\s*"click"/)
end
end
Testing Multi-Step Workflows¶
Sequential Tool Calls¶
RSpec.describe "Multi-step workflow" do
let(:browser) { instance_double(SharedTools::Tools::BrowserTool) }
let(:database) { instance_double(SharedTools::Tools::DatabaseTool) }
let(:disk) { instance_double(SharedTools::Tools::DiskTool) }
before do
allow(browser).to receive(:execute).and_return("<html>...</html>")
allow(database).to receive(:execute).and_return([{ status: :ok }])
allow(disk).to receive(:execute).and_return("Saved")
end
it "completes workflow in order" do
# Step 1: Scrape
html = browser.execute(action: "visit", url: "https://example.com")
expect(html).to be_a(String)
# Step 2: Store
results = database.execute(statements: ["INSERT..."])
expect(results.first[:status]).to eq(:ok)
# Step 3: Report
report = disk.execute(action: "file_write", path: "./report.txt", text: "...")
expect(report).to eq("Saved")
end
end
Error Recovery in Workflows¶
RSpec.describe "Workflow error handling" do
it "handles browser errors" do
allow(browser).to receive(:execute).and_raise("Network error")
expect {
workflow.execute
}.to raise_error(/Network error/)
# Verify cleanup happened
expect(browser).to have_received(:cleanup!)
end
it "continues after recoverable errors" do
allow(browser).to receive(:execute).and_raise("Element not found")
allow(workflow).to receive(:retry_browser_action).and_return("success")
result = workflow.execute_with_retry
expect(result).to eq("success")
expect(workflow).to have_received(:retry_browser_action)
end
end
Performance Testing¶
Measure Tool Execution Time¶
RSpec.describe "Performance" do
it "completes within acceptable time" do
start_time = Time.now
tool.execute(action: "complex_operation")
elapsed = Time.now - start_time
expect(elapsed).to be < 5.0 # 5 seconds max
end
end
Test with Large Datasets¶
RSpec.describe "Scalability" do
it "handles large result sets" do
# Generate 10,000 test records
records = 10_000.times.map { |i| ["Record #{i}"] }
allow(database).to receive(:execute).and_return(records)
result = tool.process_large_dataset
expect(result).to be_truthy
end
end
Testing Authorization¶
Test human-in-the-loop confirmation:
RSpec.describe "Authorization" do
before do
SharedTools.auto_execute(false)
end
it "prompts for confirmation" do
allow(STDIN).to receive(:getch).and_return('y')
expect(SharedTools).to receive(:execute?).and_return(true)
tool.execute(action: "delete", path: "./important.txt")
end
it "skips confirmation in auto mode" do
SharedTools.auto_execute(true)
expect(SharedTools).not_to receive(:execute?)
tool.execute(action: "delete", path: "./file.txt")
end
after do
SharedTools.auto_execute(false)
end
end
Testing with Different LLM Providers¶
Provider-Agnostic Tests¶
RSpec.describe "LLM provider compatibility" do
let(:tools) { [SharedTools::Tools::BrowserTool.new] }
shared_examples "LLM provider" do |provider_name|
it "works with #{provider_name}" do
agent = create_agent_for(provider_name, tools: tools)
result = agent.process("Visit example.com")
expect(result).to be_truthy
end
end
include_examples "LLM provider", :openai
include_examples "LLM provider", :anthropic
include_examples "LLM provider", :ollama
end
Continuous Integration¶
CI Configuration¶
# .github/workflows/test.yml
name: Tests
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
ruby-version: 3.2
- name: Install dependencies
run: bundle install
- name: Run unit tests
run: bundle exec rspec --tag ~integration --tag ~slow
- name: Run integration tests
run: bundle exec rspec --tag integration
env:
LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
# Only run on main branch to save API costs
if: github.ref == 'refs/heads/main'
Test Organization¶
Directory Structure¶
spec/
├── unit/
│ ├── tools/
│ │ ├── browser_tool_spec.rb
│ │ ├── disk_tool_spec.rb
│ │ └── database_tool_spec.rb
│ └── drivers/
│ ├── watir_driver_spec.rb
│ └── sqlite_driver_spec.rb
├── integration/
│ ├── workflows/
│ │ ├── scraping_workflow_spec.rb
│ │ └── data_pipeline_spec.rb
│ └── llm/
│ ├── tool_selection_spec.rb
│ └── parameter_generation_spec.rb
├── fixtures/
│ ├── html/
│ ├── databases/
│ └── llm_responses/
└── support/
├── mock_drivers.rb
├── mock_llm.rb
└── shared_examples.rb
Test Configuration¶
# spec/spec_helper.rb
RSpec.configure do |config|
# Fast unit tests by default
config.filter_run_excluding :integration, :slow
# Tag LLM tests for special handling
config.around(:each, :llm) do |example|
skip "LLM tests disabled" unless ENV['RUN_LLM_TESTS']
example.run
end
# Setup mock drivers globally
config.before(:each) do
@mock_browser = instance_double(Browser::WatirDriver)
@mock_database = MockDatabaseDriver.new
@mock_disk = MockDiskDriver.new
end
end
Best Practices¶
DO:¶
- Test tools in isolation first
- Use mock drivers for unit tests
- Record LLM responses for deterministic tests
- Test error cases thoroughly
- Verify tool descriptions
- Test authorization system
- Use fixtures for complex data
- Tag expensive tests appropriately
DON'T:¶
- Make real LLM API calls in unit tests
- Test implementation details
- Assume LLM will always choose correct tool
- Ignore non-deterministic behavior
- Skip integration tests entirely
- Hard-code test data
- Leave slow tests untagged
- Test with real databases/browsers unless necessary
Debugging LLM Interactions¶
Log LLM Requests and Responses¶
RSpec.configure do |config|
config.around(:each, :llm) do |example|
logger = Logger.new('spec/logs/llm_interactions.log')
# Wrap LLM calls with logging
original_call = agent.method(:call)
allow(agent).to receive(:call) do |prompt|
logger.info("Prompt: #{prompt}")
response = original_call.call(prompt)
logger.info("Response: #{response.inspect}")
response
end
example.run
end
end
Inspect Tool Parameters¶
it "generates correct parameters" do
tool = spy("BrowserTool")
agent = LLMAgent.new(tools: [tool])
agent.process("Visit example.com")
# Inspect what parameters the LLM generated
expect(tool).to have_received(:execute) do |**params|
puts "LLM generated parameters: #{params.inspect}"
expect(params[:url]).to eq("https://example.com")
end
end
Next Steps¶
- Review Error Handling Guide
- Explore Testing Examples
- Read Development Guide
- Check Example Workflows