feat: Implement trace-level sampling with should_evaluate propagation#32
Conversation
There was a problem hiding this comment.
Pull request overview
This pull request implements trace-level sampling with should_evaluate propagation, introducing a mechanism to make a single sampling decision at the root span that propagates to all child spans in a trace. The feature ensures evaluators can be selectively run based on a configurable sample rate, with experiments always forcing evaluation regardless of the sample rate.
Changes:
- Adds trace-level sampling via
should_evaluateattribute that propagates from root to all child spans - Introduces global and per-trace sample rate configuration (default 0.0)
- Implements experiment override that forces evaluation regardless of sample rate
Reviewed changes
Copilot reviewed 8 out of 9 changed files in this pull request and generated 8 comments.
Show a summary per file
| File | Description |
|---|---|
| tests/observability/test_should_evaluate_propagation.py | Comprehensive test suite for should_evaluate propagation across sync contexts with various scenarios |
| basalt/observability/trace_context.py | Adds SHOULD_EVALUATE_CONTEXT_KEY and set_global_sample_rate function for managing trace-level sampling configuration |
| basalt/observability/semconv.py | Defines SHOULD_EVALUATE semantic convention attribute for span-level evaluation flags |
| basalt/observability/processors.py | Implements BasaltShouldEvaluateProcessor to apply should_evaluate from context to span attributes |
| basalt/observability/instrumentation.py | Installs BasaltShouldEvaluateProcessor and sets global sample rate from config during initialization |
| basalt/observability/context_managers.py | Implements sampling decision logic in both sync and async span creation, propagates should_evaluate through context |
| basalt/observability/config.py | Adds sample_rate field to TelemetryConfig with environment variable support (BASALT_SAMPLE_RATE) |
| basalt/observability/api.py | Passes evaluate_config and experiment parameters through to underlying context managers |
| .gitignore | Adds .env* pattern to ignore environment configuration files |
Comments suppressed due to low confidence (1)
basalt/observability/trace_context.py:137
- The configure_global_metadata function (line 136) overwrites the entire _DEFAULT_CONTEXT, which means if set_global_sample_rate is called before configure_global_metadata, the sample_rate will be lost. This creates an order-dependency bug between these two functions. Consider passing all existing fields when creating the new config: _TraceContextConfig(observe_metadata=metadata, sample_rate=_DEFAULT_CONTEXT.sample_rate, experiment=_DEFAULT_CONTEXT.experiment).
def configure_global_metadata(metadata: dict[str, Any] | None) -> None:
"""
Configure global observability metadata applied to all traces.
Args:
metadata: Dictionary of metadata key-value pairs.
"""
config = _TraceContextConfig(observe_metadata=metadata)
_set_trace_defaults(config)
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
| """Tests for should_evaluate propagation across all spans in a trace.""" | ||
|
|
||
| from collections.abc import Sequence | ||
|
|
||
| import pytest | ||
| from opentelemetry import trace | ||
| from opentelemetry.sdk.trace import ReadableSpan, TracerProvider | ||
| from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult | ||
|
|
||
| from basalt.observability import observe, start_observe | ||
| from basalt.observability.context_managers import EvaluationConfig | ||
| from basalt.observability.decorators import ObserveKind | ||
| from basalt.observability.processors import BasaltShouldEvaluateProcessor | ||
| from basalt.observability.semconv import BasaltSpan | ||
|
|
||
|
|
||
| class InMemorySpanExporter(SpanExporter): | ||
| """Simple in-memory span exporter for testing.""" | ||
|
|
||
| def __init__(self): | ||
| self.spans = [] | ||
|
|
||
| def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: | ||
| self.spans.extend(spans) | ||
| return SpanExportResult.SUCCESS | ||
|
|
||
| def shutdown(self): | ||
| pass | ||
|
|
||
| def force_flush(self, timeout_millis: int = 30000) -> bool: | ||
| return True | ||
|
|
||
| def get_finished_spans(self): | ||
| return self.spans | ||
|
|
||
| def clear(self): | ||
| self.spans = [] | ||
|
|
||
|
|
||
| @pytest.fixture(scope="function") | ||
| def setup_tracer(): | ||
| """Setup tracer with in-memory exporter for testing.""" | ||
| exporter = InMemorySpanExporter() | ||
| provider = trace.get_tracer_provider() | ||
|
|
||
| # If provider is a ProxyTracerProvider, create a real one | ||
| if type(provider).__name__ == 'ProxyTracerProvider': | ||
| provider = TracerProvider() | ||
| provider.add_span_processor(BasaltShouldEvaluateProcessor()) | ||
| trace.set_tracer_provider(provider) | ||
|
|
||
| # Ensure BasaltShouldEvaluateProcessor is installed | ||
| if not hasattr(provider, '_basalt_should_evaluate_installed'): | ||
| processor = BasaltShouldEvaluateProcessor() | ||
| provider.add_span_processor(processor) | ||
| provider._basalt_should_evaluate_installed = True # type: ignore[attr-defined] | ||
|
|
||
| # Add the exporter processor | ||
| span_processor = SimpleSpanProcessor(exporter) | ||
| provider.add_span_processor(span_processor) | ||
|
|
||
| yield exporter | ||
|
|
||
| exporter.clear() | ||
|
|
||
|
|
||
| class TestShouldEvaluatePropagation: | ||
| """Test suite for should_evaluate propagation.""" | ||
|
|
||
| def test_sample_rate_1_propagates_to_children(self, setup_tracer): | ||
| """Test that should_evaluate=True propagates to all child spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=True | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is True, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected True" | ||
|
|
||
| def test_sample_rate_0_propagates_to_children(self, setup_tracer): | ||
| """Test that should_evaluate=False propagates to all child spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=False | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is False, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected False" | ||
|
|
||
| def test_experiment_forces_true_for_all_spans(self, setup_tracer): | ||
| """Test that experiment forces should_evaluate=True for all spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| experiment="exp_123", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) # Would normally be False | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=True due to experiment | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is True, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected True due to experiment" | ||
|
|
||
| def test_experiment_overrides_sample_rate_for_all_spans(self, setup_tracer): | ||
| """Test that experiment overrides sample_rate=0.0 for entire trace.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="experiment_trace", | ||
| feature_slug="test", | ||
| experiment="exp_456" | ||
| # No evaluate_config, global default is 0.0 | ||
| ): | ||
| with observe(name="processing", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="llm_call", kind=ObserveKind.GENERATION): | ||
| with observe(name="retrieval", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # Verify all have should_evaluate=True | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True with experiment" | ||
|
|
||
| def test_deeply_nested_spans_propagate(self, setup_tracer): | ||
| """Test propagation through deeply nested span hierarchy.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="level1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level2", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level3", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level4", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level5", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have same should_evaluate value | ||
| should_evaluate_values = [ | ||
| span.attributes.get(BasaltSpan.SHOULD_EVALUATE) | ||
| for span in spans | ||
| ] | ||
| assert all(v is True for v in should_evaluate_values), \ | ||
| f"All spans should have should_evaluate=True, got: {should_evaluate_values}" | ||
|
|
||
| def test_multiple_child_branches_propagate(self, setup_tracer): | ||
| """Test propagation across multiple child branches.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| # Branch 1 | ||
| with observe(name="branch1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="branch1_child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| # Branch 2 | ||
| with observe(name="branch2", kind=ObserveKind.GENERATION): | ||
| with observe(name="branch2_child", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| # Branch 3 | ||
| with observe(name="branch3", kind=ObserveKind.TOOL): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All should be False | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False, \ | ||
| f"Span {span.name} should have should_evaluate=False" | ||
|
|
||
| def test_decorator_style_propagation(self, setup_tracer): | ||
| """Test propagation works with decorator-style usage.""" | ||
| exporter = setup_tracer | ||
|
|
||
| @start_observe( | ||
| name="decorated_root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ) | ||
| def root_function(): | ||
| with observe(name="child_in_decorator", kind=ObserveKind.FUNCTION): | ||
| nested_function() | ||
|
|
||
| @observe(name="nested_decorated", kind=ObserveKind.FUNCTION) | ||
| def nested_function(): | ||
| pass | ||
|
|
||
| root_function() | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 3, f"Expected 3 spans, got {len(spans)}" | ||
|
|
||
| # All should have should_evaluate=True | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True" | ||
|
|
||
| def test_experiment_with_decorator_propagation(self, setup_tracer): | ||
| """Test experiment forces evaluation with decorator pattern.""" | ||
| exporter = setup_tracer | ||
|
|
||
| @start_observe( | ||
| name="experiment_decorated", | ||
| feature_slug="test", | ||
| experiment="exp_789", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ) | ||
| def experiment_function(): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| experiment_function() | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 2, f"Expected 2 spans, got {len(spans)}" | ||
|
|
||
| # Both should be True due to experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True with experiment" | ||
|
|
||
| def test_mixed_span_kinds_propagation(self, setup_tracer): | ||
| """Test propagation across different span kinds.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="mixed_trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="generation", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| with observe(name="retrieval", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| with observe(name="tool", kind=ObserveKind.TOOL): | ||
| pass | ||
|
|
||
| with observe(name="function", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="event", kind=ObserveKind.EVENT): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All different kinds should have same should_evaluate | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} (kind={span.attributes.get('basalt.span.kind')}) should have should_evaluate=True" | ||
|
|
||
| def test_no_evaluate_config_uses_global_default(self, setup_tracer): | ||
| """Test that without evaluate_config, global default (0.0) is used for all spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="default_trace", | ||
| feature_slug="test" | ||
| # No evaluate_config, should use global default 0.0 | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 3, f"Expected 3 spans, got {len(spans)}" | ||
|
|
||
| # All should be False (global default is 0.0) | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False, \ | ||
| f"Span {span.name} should have should_evaluate=False with global default" | ||
|
|
||
| def test_trace_consistency(self, setup_tracer): | ||
| """Test that all spans in a trace have the SAME should_evaluate value.""" | ||
| exporter = setup_tracer | ||
|
|
||
| # Test with sample_rate=1.0 | ||
| with start_observe( | ||
| name="consistent_trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| should_evaluate_values = [ | ||
| span.attributes.get(BasaltSpan.SHOULD_EVALUATE) | ||
| for span in spans | ||
| ] | ||
|
|
||
| # All values should be identical | ||
| assert len(set(should_evaluate_values)) == 1, \ | ||
| f"All spans should have same should_evaluate value, got: {should_evaluate_values}" | ||
| assert should_evaluate_values[0] is True | ||
|
|
||
|
|
||
| class TestExperimentShouldEvaluate: | ||
| """Tests specific to experiment forcing evaluation.""" | ||
|
|
||
| def test_experiment_string_forces_evaluation(self, setup_tracer): | ||
| """Test that string experiment ID forces evaluation.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment="exp_string", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
| # Verify experiment ID is attached to root span | ||
| if span.name == "trace": | ||
| assert "basalt.experiment.id" in span.attributes | ||
| assert span.attributes["basalt.experiment.id"] == "exp_string" | ||
|
|
||
| def test_experiment_object_forces_evaluation(self, setup_tracer): | ||
| """Test that experiment object forces evaluation.""" | ||
| exporter = setup_tracer | ||
|
|
||
| class MockExperiment: | ||
| def __init__(self, id, name=None): | ||
| self.id = id | ||
| self.name = name | ||
|
|
||
| exp = MockExperiment(id="exp_obj", name="Test Experiment") | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment=exp, | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
|
|
||
| def test_experiment_without_evaluate_config(self, setup_tracer): | ||
| """Test experiment works without explicit evaluate_config.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment="exp_no_config" | ||
| # No evaluate_config at all | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| # Should all be True due to experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
|
|
||
| def test_no_experiment_respects_sample_rate_zero(self, setup_tracer): | ||
| """Test that without experiment, sample_rate=0.0 is respected.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| # No experiment | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| # Should all be False without experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| pytest.main([__file__, "-v"]) |
There was a problem hiding this comment.
The test suite does not include any tests for async contexts (AsyncStartObserve, AsyncObserve). Since the implementation includes separate async code paths in _async_with_span_handle with identical sampling logic, these should be tested to ensure should_evaluate propagation works correctly in async scenarios. Consider adding tests like test_async_sample_rate_1_propagates_to_children and test_async_experiment_forces_evaluation.
| """Tests for should_evaluate propagation across all spans in a trace.""" | ||
|
|
||
| from collections.abc import Sequence | ||
|
|
||
| import pytest | ||
| from opentelemetry import trace | ||
| from opentelemetry.sdk.trace import ReadableSpan, TracerProvider | ||
| from opentelemetry.sdk.trace.export import SimpleSpanProcessor, SpanExporter, SpanExportResult | ||
|
|
||
| from basalt.observability import observe, start_observe | ||
| from basalt.observability.context_managers import EvaluationConfig | ||
| from basalt.observability.decorators import ObserveKind | ||
| from basalt.observability.processors import BasaltShouldEvaluateProcessor | ||
| from basalt.observability.semconv import BasaltSpan | ||
|
|
||
|
|
||
| class InMemorySpanExporter(SpanExporter): | ||
| """Simple in-memory span exporter for testing.""" | ||
|
|
||
| def __init__(self): | ||
| self.spans = [] | ||
|
|
||
| def export(self, spans: Sequence[ReadableSpan]) -> SpanExportResult: | ||
| self.spans.extend(spans) | ||
| return SpanExportResult.SUCCESS | ||
|
|
||
| def shutdown(self): | ||
| pass | ||
|
|
||
| def force_flush(self, timeout_millis: int = 30000) -> bool: | ||
| return True | ||
|
|
||
| def get_finished_spans(self): | ||
| return self.spans | ||
|
|
||
| def clear(self): | ||
| self.spans = [] | ||
|
|
||
|
|
||
| @pytest.fixture(scope="function") | ||
| def setup_tracer(): | ||
| """Setup tracer with in-memory exporter for testing.""" | ||
| exporter = InMemorySpanExporter() | ||
| provider = trace.get_tracer_provider() | ||
|
|
||
| # If provider is a ProxyTracerProvider, create a real one | ||
| if type(provider).__name__ == 'ProxyTracerProvider': | ||
| provider = TracerProvider() | ||
| provider.add_span_processor(BasaltShouldEvaluateProcessor()) | ||
| trace.set_tracer_provider(provider) | ||
|
|
||
| # Ensure BasaltShouldEvaluateProcessor is installed | ||
| if not hasattr(provider, '_basalt_should_evaluate_installed'): | ||
| processor = BasaltShouldEvaluateProcessor() | ||
| provider.add_span_processor(processor) | ||
| provider._basalt_should_evaluate_installed = True # type: ignore[attr-defined] | ||
|
|
||
| # Add the exporter processor | ||
| span_processor = SimpleSpanProcessor(exporter) | ||
| provider.add_span_processor(span_processor) | ||
|
|
||
| yield exporter | ||
|
|
||
| exporter.clear() | ||
|
|
||
|
|
||
| class TestShouldEvaluatePropagation: | ||
| """Test suite for should_evaluate propagation.""" | ||
|
|
||
| def test_sample_rate_1_propagates_to_children(self, setup_tracer): | ||
| """Test that should_evaluate=True propagates to all child spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=True | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is True, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected True" | ||
|
|
||
| def test_sample_rate_0_propagates_to_children(self, setup_tracer): | ||
| """Test that should_evaluate=False propagates to all child spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=False | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is False, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected False" | ||
|
|
||
| def test_experiment_forces_true_for_all_spans(self, setup_tracer): | ||
| """Test that experiment forces should_evaluate=True for all spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="parent", | ||
| feature_slug="test", | ||
| experiment="exp_123", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) # Would normally be False | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have should_evaluate=True due to experiment | ||
| for span in spans: | ||
| assert BasaltSpan.SHOULD_EVALUATE in span.attributes, \ | ||
| f"Span {span.name} missing should_evaluate" | ||
| assert span.attributes[BasaltSpan.SHOULD_EVALUATE] is True, \ | ||
| f"Span {span.name} has should_evaluate={span.attributes[BasaltSpan.SHOULD_EVALUATE]}, expected True due to experiment" | ||
|
|
||
| def test_experiment_overrides_sample_rate_for_all_spans(self, setup_tracer): | ||
| """Test that experiment overrides sample_rate=0.0 for entire trace.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="experiment_trace", | ||
| feature_slug="test", | ||
| experiment="exp_456" | ||
| # No evaluate_config, global default is 0.0 | ||
| ): | ||
| with observe(name="processing", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="llm_call", kind=ObserveKind.GENERATION): | ||
| with observe(name="retrieval", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 4, f"Expected 4 spans, got {len(spans)}" | ||
|
|
||
| # Verify all have should_evaluate=True | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True with experiment" | ||
|
|
||
| def test_deeply_nested_spans_propagate(self, setup_tracer): | ||
| """Test propagation through deeply nested span hierarchy.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="level1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level2", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level3", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level4", kind=ObserveKind.FUNCTION): | ||
| with observe(name="level5", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All spans should have same should_evaluate value | ||
| should_evaluate_values = [ | ||
| span.attributes.get(BasaltSpan.SHOULD_EVALUATE) | ||
| for span in spans | ||
| ] | ||
| assert all(v is True for v in should_evaluate_values), \ | ||
| f"All spans should have should_evaluate=True, got: {should_evaluate_values}" | ||
|
|
||
| def test_multiple_child_branches_propagate(self, setup_tracer): | ||
| """Test propagation across multiple child branches.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| # Branch 1 | ||
| with observe(name="branch1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="branch1_child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| # Branch 2 | ||
| with observe(name="branch2", kind=ObserveKind.GENERATION): | ||
| with observe(name="branch2_child", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| # Branch 3 | ||
| with observe(name="branch3", kind=ObserveKind.TOOL): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All should be False | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False, \ | ||
| f"Span {span.name} should have should_evaluate=False" | ||
|
|
||
| def test_decorator_style_propagation(self, setup_tracer): | ||
| """Test propagation works with decorator-style usage.""" | ||
| exporter = setup_tracer | ||
|
|
||
| @start_observe( | ||
| name="decorated_root", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ) | ||
| def root_function(): | ||
| with observe(name="child_in_decorator", kind=ObserveKind.FUNCTION): | ||
| nested_function() | ||
|
|
||
| @observe(name="nested_decorated", kind=ObserveKind.FUNCTION) | ||
| def nested_function(): | ||
| pass | ||
|
|
||
| root_function() | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 3, f"Expected 3 spans, got {len(spans)}" | ||
|
|
||
| # All should have should_evaluate=True | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True" | ||
|
|
||
| def test_experiment_with_decorator_propagation(self, setup_tracer): | ||
| """Test experiment forces evaluation with decorator pattern.""" | ||
| exporter = setup_tracer | ||
|
|
||
| @start_observe( | ||
| name="experiment_decorated", | ||
| feature_slug="test", | ||
| experiment="exp_789", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ) | ||
| def experiment_function(): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| experiment_function() | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 2, f"Expected 2 spans, got {len(spans)}" | ||
|
|
||
| # Both should be True due to experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} should have should_evaluate=True with experiment" | ||
|
|
||
| def test_mixed_span_kinds_propagation(self, setup_tracer): | ||
| """Test propagation across different span kinds.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="mixed_trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="generation", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| with observe(name="retrieval", kind=ObserveKind.RETRIEVAL): | ||
| pass | ||
|
|
||
| with observe(name="tool", kind=ObserveKind.TOOL): | ||
| pass | ||
|
|
||
| with observe(name="function", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="event", kind=ObserveKind.EVENT): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 6, f"Expected 6 spans, got {len(spans)}" | ||
|
|
||
| # All different kinds should have same should_evaluate | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True, \ | ||
| f"Span {span.name} (kind={span.attributes.get('basalt.span.kind')}) should have should_evaluate=True" | ||
|
|
||
| def test_no_evaluate_config_uses_global_default(self, setup_tracer): | ||
| """Test that without evaluate_config, global default (0.0) is used for all spans.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="default_trace", | ||
| feature_slug="test" | ||
| # No evaluate_config, should use global default 0.0 | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| with observe(name="child2", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| assert len(spans) == 3, f"Expected 3 spans, got {len(spans)}" | ||
|
|
||
| # All should be False (global default is 0.0) | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False, \ | ||
| f"Span {span.name} should have should_evaluate=False with global default" | ||
|
|
||
| def test_trace_consistency(self, setup_tracer): | ||
| """Test that all spans in a trace have the SAME should_evaluate value.""" | ||
| exporter = setup_tracer | ||
|
|
||
| # Test with sample_rate=1.0 | ||
| with start_observe( | ||
| name="consistent_trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=1.0) | ||
| ): | ||
| with observe(name="child1", kind=ObserveKind.FUNCTION): | ||
| with observe(name="grandchild", kind=ObserveKind.GENERATION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
| should_evaluate_values = [ | ||
| span.attributes.get(BasaltSpan.SHOULD_EVALUATE) | ||
| for span in spans | ||
| ] | ||
|
|
||
| # All values should be identical | ||
| assert len(set(should_evaluate_values)) == 1, \ | ||
| f"All spans should have same should_evaluate value, got: {should_evaluate_values}" | ||
| assert should_evaluate_values[0] is True | ||
|
|
||
|
|
||
| class TestExperimentShouldEvaluate: | ||
| """Tests specific to experiment forcing evaluation.""" | ||
|
|
||
| def test_experiment_string_forces_evaluation(self, setup_tracer): | ||
| """Test that string experiment ID forces evaluation.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment="exp_string", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
| # Verify experiment ID is attached to root span | ||
| if span.name == "trace": | ||
| assert "basalt.experiment.id" in span.attributes | ||
| assert span.attributes["basalt.experiment.id"] == "exp_string" | ||
|
|
||
| def test_experiment_object_forces_evaluation(self, setup_tracer): | ||
| """Test that experiment object forces evaluation.""" | ||
| exporter = setup_tracer | ||
|
|
||
| class MockExperiment: | ||
| def __init__(self, id, name=None): | ||
| self.id = id | ||
| self.name = name | ||
|
|
||
| exp = MockExperiment(id="exp_obj", name="Test Experiment") | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment=exp, | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
|
|
||
| def test_experiment_without_evaluate_config(self, setup_tracer): | ||
| """Test experiment works without explicit evaluate_config.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| experiment="exp_no_config" | ||
| # No evaluate_config at all | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| # Should all be True due to experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is True | ||
|
|
||
| def test_no_experiment_respects_sample_rate_zero(self, setup_tracer): | ||
| """Test that without experiment, sample_rate=0.0 is respected.""" | ||
| exporter = setup_tracer | ||
|
|
||
| with start_observe( | ||
| name="trace", | ||
| feature_slug="test", | ||
| evaluate_config=EvaluationConfig(sample_rate=0.0) | ||
| # No experiment | ||
| ): | ||
| with observe(name="child", kind=ObserveKind.FUNCTION): | ||
| pass | ||
|
|
||
| spans = exporter.get_finished_spans() | ||
|
|
||
| # Should all be False without experiment | ||
| for span in spans: | ||
| assert span.attributes.get(BasaltSpan.SHOULD_EVALUATE) is False | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| pytest.main([__file__, "-v"]) |
There was a problem hiding this comment.
There are no tests verifying the behavior when sample_rate is set to boundary values between 0.0 and 1.0 (e.g., 0.5). While deterministic testing of probabilistic sampling is challenging, consider adding tests that verify the sampling decision is made consistently for all spans in a trace, even for intermediate sample rates. This ensures the trace-level consistency property holds for all sample_rate values, not just 0.0 and 1.0.
| """ | ||
|
|
||
| sample_rate: float = 1.0 | ||
| sample_rate: float = 0.0 |
There was a problem hiding this comment.
This is a breaking API change. The default sample_rate has been changed from 1.0 (100% sampling) to 0.0 (no sampling). This means that existing code relying on the default behavior will now skip evaluations unless explicitly configured. Consider documenting this as a breaking change in release notes or providing a migration guide for users who depend on the previous default behavior.
| @pytest.fixture(scope="function") | ||
| def setup_tracer(): | ||
| """Setup tracer with in-memory exporter for testing.""" | ||
| exporter = InMemorySpanExporter() | ||
| provider = trace.get_tracer_provider() | ||
|
|
||
| # If provider is a ProxyTracerProvider, create a real one | ||
| if type(provider).__name__ == 'ProxyTracerProvider': | ||
| provider = TracerProvider() | ||
| provider.add_span_processor(BasaltShouldEvaluateProcessor()) | ||
| trace.set_tracer_provider(provider) | ||
|
|
||
| # Ensure BasaltShouldEvaluateProcessor is installed | ||
| if not hasattr(provider, '_basalt_should_evaluate_installed'): | ||
| processor = BasaltShouldEvaluateProcessor() | ||
| provider.add_span_processor(processor) | ||
| provider._basalt_should_evaluate_installed = True # type: ignore[attr-defined] | ||
|
|
||
| # Add the exporter processor | ||
| span_processor = SimpleSpanProcessor(exporter) | ||
| provider.add_span_processor(span_processor) | ||
|
|
||
| yield exporter | ||
|
|
||
| exporter.clear() |
There was a problem hiding this comment.
The fixture does not properly clean up the tracer provider state between tests. While exporter.clear() is called, the tracer provider and its processors remain in place, potentially causing state leakage between tests. Consider adding teardown logic to remove the span processors or reset the tracer provider to its original state to ensure test isolation.
| sample_rate_env = os.getenv("BASALT_SAMPLE_RATE") | ||
| if sample_rate_env: | ||
| try: | ||
| rate = float(sample_rate_env) | ||
| if 0.0 <= rate <= 1.0: | ||
| cfg.sample_rate = rate | ||
| except ValueError: | ||
| pass # Ignore invalid values |
There was a problem hiding this comment.
When BASALT_SAMPLE_RATE environment variable contains an invalid value (line 195), the error is silently ignored. This makes debugging difficult if users misconfigure the value. Consider logging a warning when an invalid sample_rate is provided so users are aware their configuration is being ignored. Example: logger.warning("Invalid BASALT_SAMPLE_RATE value '%s', must be a float between 0.0 and 1.0", sample_rate_env).
| sample_rate_env = os.getenv("BASALT_SAMPLE_RATE") | ||
| if sample_rate_env: | ||
| try: | ||
| rate = float(sample_rate_env) | ||
| if 0.0 <= rate <= 1.0: | ||
| cfg.sample_rate = rate | ||
| except ValueError: | ||
| pass # Ignore invalid values |
There was a problem hiding this comment.
The docstring on line 165 does not list BASALT_SAMPLE_RATE as a supported environment variable, even though it is implemented in lines 189-196. This inconsistency makes the configuration option undiscoverable for users. Consider updating the docstring to include BASALT_SAMPLE_RATE in the list of supported environment variables (note: the docstring is at line 165, outside this diff region, but the omission impacts the discoverability of this feature).
| # Make trace-level sampling decision | ||
| should_evaluate_token = None | ||
| if is_root: | ||
| # Root span: make new sampling decision | ||
| # If experiment is attached, ALWAYS evaluate (should_evaluate=True) | ||
| if experiment is not None: | ||
| should_evaluate = True | ||
| else: | ||
| # Get sample_rate from evaluate_config if provided, otherwise use global default | ||
| if evaluate_config is not None: | ||
| effective_sample_rate = evaluate_config.sample_rate | ||
| else: | ||
| effective_sample_rate = defaults.sample_rate | ||
| should_evaluate = random.random() < effective_sample_rate | ||
| should_evaluate_token = attach(set_value(SHOULD_EVALUATE_CONTEXT_KEY, should_evaluate)) | ||
| else: | ||
| # Check if should_evaluate already exists in context | ||
| existing_should_evaluate = otel_context.get_value(SHOULD_EVALUATE_CONTEXT_KEY) | ||
| if existing_should_evaluate is None: | ||
| # Orphan span without root - make its own decision | ||
| # If experiment is attached, ALWAYS evaluate | ||
| if experiment is not None: | ||
| should_evaluate = True | ||
| else: | ||
| if evaluate_config is not None: | ||
| effective_sample_rate = evaluate_config.sample_rate | ||
| else: | ||
| effective_sample_rate = defaults.sample_rate | ||
| should_evaluate = random.random() < effective_sample_rate | ||
| should_evaluate_token = attach(set_value(SHOULD_EVALUATE_CONTEXT_KEY, should_evaluate)) |
There was a problem hiding this comment.
This sampling decision logic is duplicated in the async version (_async_with_span_handle) at lines 893-922. Consider extracting this into a shared helper function to avoid code duplication and ensure consistent behavior between sync and async implementations. The logic spans approximately 30 lines and is identical in both places, making it a prime candidate for refactoring.
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
No description provided.