Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ test-helm: ## Test Helm charts (lint, template, validate)
helm template test-release $(HELM_CHART_DIR)/ \
--set config.resourceType=nodepools \
--set config.pollInterval=10s \
--set config.maxAgeReady=1h > /dev/null
--set config.conditions.rules[0].maxAge=1h > /dev/null
@echo "Custom resource selector template OK"
@echo ""
@echo "All Helm chart tests passed!"
Expand Down
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ Create a configuration file based on the examples in the `configs/` directory:
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `poll_interval` | duration | `5s` | How often to poll the API for resource updates |
| `max_age_not_ready` | duration | `10s` | Max age interval for resources not ready |
| `max_age_ready` | duration | `30m` | Max age interval for ready resources |
| `conditions.reference_time` | string | `conditionTime(resource, "Ready")` | CEL expression for reference timestamp |
| `conditions.rules` | array | See below | CEL expression rules with name, expression, and max_age |
| `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls |
| `resource_selector` | array | `[]` | Label selectors for filtering resources (enables sharding) |
| `message_data` | map | `{}` | Template fields for CloudEvents data payload |
Expand Down Expand Up @@ -229,8 +229,16 @@ This uses all defaults. Broker configuration is managed via `broker.yaml` or env
```yaml
resource_type: clusters
poll_interval: 5s
max_age_not_ready: 10s
max_age_ready: 30m

conditions:
reference_time: 'conditionTime(resource, "Ready")'
rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
max_age: 30m
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
max_age: 10s

resource_selector:
- label: shard
Expand Down
4 changes: 2 additions & 2 deletions charts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ The following table lists the configurable parameters of the Sentinel chart and
|-----------|-------------|---------|
| `config.resourceType` | Resource type to watch | `clusters` |
| `config.pollInterval` | Polling interval | `5s` |
| `config.maxAgeNotReady` | Max age for not ready resources | `10s` |
| `config.maxAgeReady` | Max age for ready resources | `30m` |
| `config.conditions.referenceTime` | CEL expression for reference timestamp | `conditionTime(resource, "Ready")` |
| `config.conditions.rules` | CEL expression rules with name, expression, and maxAge | See values.yaml |
| `config.resourceSelector` | Resource selector for sharding | See values.yaml |
| `config.hyperfleetApi.baseUrl` | HyperFleet API base URL | `http://hyperfleet-api:8000` |
| `config.hyperfleetApi.timeout` | API timeout | `5s` |
Expand Down
10 changes: 8 additions & 2 deletions charts/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,14 @@ data:
# Sentinel configuration
resource_type: {{ .Values.config.resourceType }}
poll_interval: {{ .Values.config.pollInterval }}
max_age_not_ready: {{ .Values.config.maxAgeNotReady }}
max_age_ready: {{ .Values.config.maxAgeReady }}
conditions:
reference_time: {{ .Values.config.conditions.referenceTime | quote }}
rules:
{{- range .Values.config.conditions.rules }}
- name: {{ .name | quote }}
expression: {{ .expression | quote }}
max_age: {{ .maxAge }}
{{- end }}

{{- if .Values.config.resourceSelector }}
# Resource selector for horizontal sharding
Expand Down
19 changes: 14 additions & 5 deletions charts/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,20 @@ config:
# How often to poll the API for resource updates
pollInterval: 5s

# Max age interval for resources not ready
maxAgeNotReady: 10s

# Max age interval for ready resources
maxAgeReady: 30m
# Condition rules for decision making using CEL expressions
# Available CEL helper functions:
# condition(resource, type) → map - returns full condition map
# status(resource, type) → string - returns condition status
# conditionTime(resource, type) → string - returns last_updated_time (RFC3339)
conditions:
referenceTime: 'conditionTime(resource, "Ready")'
rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
maxAge: 30m
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
maxAge: 10s

# Resource selector for horizontal sharding
# Deploy multiple Sentinel instances with different shard values
Expand Down
7 changes: 6 additions & 1 deletion cmd/sentinel/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,12 @@ func runServe(cfg *config.SentinelConfig, logCfg *logger.LogConfig, healthBindAd
return fmt.Errorf("failed to initialize OpenAPI client: %w", err)

}
decisionEngine := engine.NewDecisionEngine(cfg.MaxAgeNotReady, cfg.MaxAgeReady)
compiledConditions, err := engine.CompileConditions(cfg.Conditions)
if err != nil {
log.Errorf(ctx, "Failed to compile condition expressions: %v", err)
return fmt.Errorf("failed to compile condition expressions: %w", err)
}
decisionEngine := engine.NewDecisionEngine(compiledConditions)

// Initialize broker metrics recorder
// Broker metrics (messages_published_total, errors_total, etc.) are registered
Expand Down
19 changes: 16 additions & 3 deletions configs/dev-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,22 @@ resource_type: clusters
# Faster polling for development - see changes quickly.
poll_interval: 2s

# Shorter max age intervals for development.
max_age_not_ready: 5s
max_age_ready: 2m
# Condition rules for development - shorter intervals for faster feedback.
# Uses CEL expressions for flexible condition matching.
#
# Available CEL helper functions:
# condition(resource, type) → map - returns full condition map
# status(resource, type) → string - returns condition status
# conditionTime(resource, type) → string - returns last_updated_time (RFC3339)
conditions:
reference_time: 'conditionTime(resource, "Ready")'
rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
max_age: 2m
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
max_age: 5s

# No resource selector - watch all resources in development.
# resource_selector: []
Expand Down
23 changes: 16 additions & 7 deletions configs/gcp-pubsub-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,22 @@ resource_type: clusters
# Accepts Go duration format: ns, us/µs, ms, s, m, h.
poll_interval: 5s

# Max age interval for resources that are not ready.
# Resources in transitional states are re-checked more frequently.
max_age_not_ready: 10s

# Max age interval for resources that are ready and stable.
# Stable resources are checked less frequently to reduce API load.
max_age_ready: 30m
# Condition rules for decision making.
# Uses CEL expressions for flexible condition matching.
#
# Available CEL helper functions:
# condition(resource, type) → map - returns full condition map
# status(resource, type) → string - returns condition status
# conditionTime(resource, type) → string - returns last_updated_time (RFC3339)
conditions:
reference_time: 'conditionTime(resource, "Ready")'
rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
max_age: 30m # Stable resources checked less frequently
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
max_age: 10s # Transitional resources re-checked more frequently

# Resource selector (optional) - filter resources by labels.
# If empty or omitted, all resources of the specified type are watched.
Expand Down
23 changes: 16 additions & 7 deletions configs/rabbitmq-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,22 @@ resource_type: clusters
# Accepts Go duration format: ns, us/µs, ms, s, m, h.
poll_interval: 5s

# Max age interval for resources that are not ready.
# Resources in transitional states are re-checked more frequently.
max_age_not_ready: 10s

# Max age interval for resources that are ready and stable.
# Stable resources are checked less frequently to reduce API load.
max_age_ready: 30m
# Condition rules for decision making.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't found how the conditions are combined: is it AND or OR?

# Uses CEL expressions for flexible condition matching.
#
# Available CEL helper functions:
# condition(resource, type) → map - returns full condition map
# status(resource, type) → string - returns condition status
# conditionTime(resource, type) → string - returns last_updated_time (RFC3339)
conditions:
reference_time: 'conditionTime(resource, "Ready")'
Copy link
Contributor

@rh-amarin rh-amarin Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not clear to me what reference_time is used for; I guess it is compared against the max_age property?

  • But in this case, these are really a mix between CEL expressions and some hardcoded rules
  • How does the expression and max_age combine in the same rule?

In my mind, the CEL engine could be more generic.

  • Instead of the hardcoded reference_time there could be a params section to define custom variables to use later in other rules
  • `conditionTime(resource, "Ready")` does too much magic here
    • Traverses the resource object and decides between 2 properties

One idea, but I think we should brainstorm this:

rules:
  params:
    - name: max_age
      expression: 30m
    - name: ready
      expression: resource.conditions.filter(c, c.type=="Ready")[0]
    - name: ref_time
      expression: has(ready.last_updated_time ) 
                         ? ready.last_updated_time
                         : ready.created_time
  conditions:
    - operator: AND
      - name: state-strategy
        expression: ready != "True"
      - name: time-based-strategy
        expression: ref_time > max_age
      - operator: OR
        - name: xxx
          expression: expr1
        - name: yy  
          expression: expr2

rules:
  params:
    - name: max_age
      expression: 30m
    - name: ready
      expression: resource.conditions.filter(c, c.type=="Ready")[0]
    - name: ref_time
      expression: has(ready.last_updated_time ) 
                         ? ready.last_updated_time
                         : ready.created_time
    - name: operand1
        expression: ready != "True"
  conditions:
      - name: state-strategy
        expression: operand1 AND operand2 AND (operand3 OR operand4)

trigger_decision:
    - name: max_age
      expression: 30m
    - name: ready
      expression: resource.conditions.filter(c, c.type=="Ready")[0]
    - name: ref_time
      expression: has(ready.last_updated_time ) 
                         ? ready.last_updated_time
                         : ready.created_time
    - name: operand1
        expression: ready != "True"
     - name: result
       expression: operand1 AND operand2 AND (operand3 OR operand4)
      - name: reason
       expression: xxxx

Copy link
Contributor Author

@rafabene rafabene Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My proposal:

trigger_decision:
    - name: max_age
      expression: 30m
    - name: ready
      expression: resource.conditions.filter(c, c.type=="Ready")[0]
    - name: ref_time
      expression: has(ready.last_updated_time ) 
                         ? ready.last_updated_time
                         : ready.created_time
    - name: operand1
        expression: ready != "True"
    result: 
        expression: operand1 AND operand2 AND (operand3 OR operand4)
    reason:  
        expression: xxxx

OR

message_decision:
    params:
      max_age: 30m
      ready: resource.conditions.filter(c, c.type=="Ready")[0]
      ref_time: has(ready.last_updated_time ) 
                           ? ready.last_updated_time
                           : ready.created_time
      operand1:  ready != "True"
    result: operand1 AND operand2 AND (operand3 OR operand4)

rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
max_age: 30m # Stable resources checked less frequently
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
max_age: 10s # Transitional resources re-checked more frequently

# Resource selector (optional) - filter resources by labels.
# If empty or omitted, all resources of the specified type are watched.
Expand Down
51 changes: 30 additions & 21 deletions docs/sentinel-operator-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Time-based reconciliation ensures **eventual consistency** by publishing events

**How It Works:**

Sentinel uses two configurable max age intervals based on the resource's status (`Ready` condition):
Sentinel uses configurable **CEL expressions** to evaluate conditions and determine max age intervals. The default configuration uses the `Ready` condition:

| Resource State | Default Interval | Rationale |
|----------------|------------------|----------------------------------------------------------------------------------------|
Expand All @@ -150,13 +150,14 @@ Sentinel uses two configurable max age intervals based on the resource's status

When the resource's `generation` matches the `Ready` condition's `ObservedGeneration` (indicating the condition reflects the current state), Sentinel checks if enough time has elapsed:

1. Calculate reference timestamp:
- If `status.last_updated` exists → use it (adapter has processed resource)
- Otherwise → use `created_time` (new resource never processed)
1. Calculate reference timestamp using the `reference_time` CEL expression:
- Default: `conditionTime(resource, "Ready")` returns the condition's `last_updated_time`
- If evaluation fails or returns zero → falls back to `created_time`

2. Determine max age interval:
- If resource is ready (`Ready` condition status == True) → use `max_age_ready` (default: 30m)
- If resource is not ready (`Ready` condition status == False) → use `max_age_not_ready` (default: 10s)
2. Determine max age interval using `conditions.rules`:
- Evaluate each rule's CEL `expression` in order against the resource
- First matching rule's `max_age` is used
- If no rule matches, uses the smallest `max_age` (most conservative fallback)

3. Calculate next event time:
```text
Expand All @@ -171,13 +172,13 @@ When the resource's `generation` matches the `Ready` condition's `ObservedGenera

```mermaid
graph TD
A[Determine Reference Time] --> B{last_updated exists?}
B -->|Yes| C[Use last_updated]
B -->|No| D[Use created_time]
C --> E{Resource Ready?}
A[Evaluate reference_time CEL] --> B{Evaluation succeeded?}
B -->|Yes| C[Use result as reference time]
B -->|No| D[Use created_time as fallback]
C --> E{Evaluate rules in order}
D --> E
E -->|Yes| F[Max Age = 30m]
E -->|No| G[Max Age = 10s]
E -->|First match| F[Use matching rule's max_age]
E -->|No match| G[Use smallest max_age as fallback]
F --> H{now >= reference + max_age?}
G --> H
H -->|Yes| I[Publish: max age exceeded]
Expand Down Expand Up @@ -273,10 +274,19 @@ Sentinel uses YAML-based configuration with environment variable overrides for s
# Required: Resource type to watch
resource_type: clusters

# Optional: Polling and age intervals
# Optional: Polling interval
poll_interval: 5s
max_age_not_ready: 10s
max_age_ready: 30m

# Optional: Condition rules for decision making (CEL expressions)
conditions:
reference_time: 'conditionTime(resource, "Ready")'
rules:
- name: isReady
expression: 'status(resource, "Ready") == "True"'
max_age: 30m
- name: isNotReady
expression: 'status(resource, "Ready") == "False"'
max_age: 10s

# Optional: Resource filtering
resource_selector:
Expand Down Expand Up @@ -318,8 +328,8 @@ These fields have sensible defaults and can be omitted:
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `poll_interval` | duration | `5s` | How often to poll the API for resource updates |
| `max_age_not_ready` | duration | `10s` | Max age interval for not-ready resources |
| `max_age_ready` | duration | `30m` | Max age interval for ready resources |
| `conditions.reference_time` | string | `conditionTime(resource, "Ready")` | CEL expression for reference timestamp |
| `conditions.rules` | array | See defaults | CEL expression rules with name, expression, and max_age |
| `hyperfleet_api.timeout` | duration | `5s` | Request timeout for API calls |
| `resource_selector` | array | `[]` | Label selectors for filtering (empty = all resources) |
| `message_data` | map | `{}` | CEL expressions for CloudEvents payload |
Expand Down Expand Up @@ -563,8 +573,7 @@ Follow this checklist to ensure successful Sentinel deployment and operation.

- [ ] Review and adjust polling intervals:
- `poll_interval` - How often to poll the HyperFleet API (default: `5s`)
- `max_age_not_ready` - Reconciliation interval for not-ready resources (default: `10s`)
- `max_age_ready` - Reconciliation interval for ready resources (default: `30m`)
- `conditions` - CEL-based condition rules (default: Ready condition, True→30m, False→10s)
- Reference: [Optional Fields](#optional-fields)

**Design CloudEvents Payload**
Expand Down Expand Up @@ -678,7 +687,7 @@ For detailed deployment guidance, see [docs/running-sentinel.md](running-sentine
| Symptom | Likely Cause | Solution |
|------------------------------------------------------------------------------------------------------------------------------|--------------|----------|
| **Events not published, resources not found** | Resource selector mismatch | Verify `resource_selector` matches resource labels. Empty selector watches ALL resources. Check logs: `kubectl logs -n hyperfleet-system -l app.kubernetes.io/name=sentinel` |
| **Events not published, resources found but skipped** | Max age not exceeded | Normal behavior. Events publish when `generation > observed_generation` OR max age interval elapsed (`max_age_ready`: 30m, `max_age_not_ready`: 10s). |
| **Events not published, resources found but skipped** | Max age not exceeded | Normal behavior. Events publish when `generation > observed_generation` OR max age interval elapsed (configured via `conditions`). |
| **API connection errors, DNS lookup fails** | Wrong service name or namespace | Verify endpoint format: `http://<service>.<namespace>.svc.cluster.local:8080`. Check API is running: `kubectl get pods -n hyperfleet-system -l app=hyperfleet-api` |
| **API returns 401 Unauthorized** | Missing authentication | Add auth headers to `hyperfleet_api` config if API requires authentication. |
| **API returns 404 Not Found** | Wrong API version in path | Verify endpoint uses correct API version: `/api/v1/clusters` or `/api/hyperfleet/v1/clusters` |
Expand Down
47 changes: 38 additions & 9 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,24 @@ type LabelSelector struct {
// LabelSelectorList is a list of label selectors
type LabelSelectorList []LabelSelector

// ConditionRule defines a CEL expression that, when matched, determines the max age for a resource.
// According to the operator guide, rules are evaluated in order and the first
// matching rule's MaxAge is used.
type ConditionRule struct {
// Name identifies the rule (e.g. "isReady"); Validate rejects a blank name.
Name string `mapstructure:"name"`
// Expression is the CEL expression evaluated against the resource, e.g.
// `status(resource, "Ready") == "True"`; Validate rejects a blank expression.
Expression string `mapstructure:"expression"`
// MaxAge is the max age interval applied when Expression matches;
// Validate requires it to be positive.
MaxAge time.Duration `mapstructure:"max_age"`
}

// Conditions configures CEL-based condition evaluation for the decision engine.
type Conditions struct {
// ReferenceTime is the CEL expression that yields the reference timestamp,
// e.g. `conditionTime(resource, "Ready")`; Validate rejects a blank value.
ReferenceTime string `mapstructure:"reference_time"`
// Rules lists the condition rules that select a max age;
// Validate requires at least one rule.
Rules []ConditionRule `mapstructure:"rules"`
}

// SentinelConfig represents the Sentinel configuration
type SentinelConfig struct {
ResourceType string `mapstructure:"resource_type"`
PollInterval time.Duration `mapstructure:"poll_interval"`
MaxAgeNotReady time.Duration `mapstructure:"max_age_not_ready"`
MaxAgeReady time.Duration `mapstructure:"max_age_ready"`
Conditions Conditions `mapstructure:"conditions"`
ResourceSelector LabelSelectorList `mapstructure:"resource_selector"`
HyperFleetAPI *HyperFleetAPIConfig `mapstructure:"hyperfleet_api"`
MessageData map[string]interface{} `mapstructure:"message_data"`
Expand Down Expand Up @@ -57,9 +69,14 @@ func (ls LabelSelectorList) ToMap() map[string]string {
func NewSentinelConfig() *SentinelConfig {
return &SentinelConfig{
// ResourceType is required and must be set in config file
PollInterval: 5 * time.Second,
MaxAgeNotReady: 10 * time.Second,
MaxAgeReady: 30 * time.Minute,
PollInterval: 5 * time.Second,
Conditions: Conditions{
ReferenceTime: `conditionTime(resource, "Ready")`,
Rules: []ConditionRule{
{Name: "isReady", Expression: `status(resource, "Ready") == "True"`, MaxAge: 30 * time.Minute},
{Name: "isNotReady", Expression: `status(resource, "Ready") == "False"`, MaxAge: 10 * time.Second},
},
},
ResourceSelector: []LabelSelector{}, // Empty means watch all resources
HyperFleetAPI: &HyperFleetAPIConfig{
// Endpoint is required and must be set in config file
Expand Down Expand Up @@ -138,12 +155,24 @@ func (c *SentinelConfig) Validate() error {
return fmt.Errorf("poll_interval must be positive")
}

if c.MaxAgeNotReady <= 0 {
return fmt.Errorf("max_age_not_ready must be positive")
if strings.TrimSpace(c.Conditions.ReferenceTime) == "" {
return fmt.Errorf("conditions.reference_time is required")
}

if c.MaxAgeReady <= 0 {
return fmt.Errorf("max_age_ready must be positive")
if len(c.Conditions.Rules) == 0 {
return fmt.Errorf("conditions.rules must have at least one rule")
}

for i, rule := range c.Conditions.Rules {
if strings.TrimSpace(rule.Name) == "" {
return fmt.Errorf("conditions.rules[%d].name is required", i)
}
if strings.TrimSpace(rule.Expression) == "" {
return fmt.Errorf("conditions.rules[%d].expression is required", i)
}
if rule.MaxAge <= 0 {
return fmt.Errorf("conditions.rules[%d].max_age must be positive", i)
}
}

if c.MessageData == nil {
Expand Down
Loading