Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 21 additions & 29 deletions R/frs_classify.R
Original file line number Diff line number Diff line change
Expand Up @@ -252,42 +252,34 @@ frs_classify <- function(conn, table, label,
mus <- .frs_opt("measure_us_col")

# Segments are accessible if no break is downstream of them.
# A segment is BLOCKED (upstream of break) when:
# 1. Same BLK: segment measure >= break measure (break is downstream)
# 2. Different BLK: fwa_upstream(break_pos, segment) = TRUE
# (segment is upstream of the break on the network)
# Split into two separate NOT EXISTS with AND (not OR) so PG can
# use indexes on both independently.
#
# 1. Same BLK: pure measure comparison — instant with btree index
# 2. Cross BLK: ltree comparison using enriched wscode/localcode
# on the breaks table — fast with GIST index, no join to
# fwa_stream_networks_sp (4.9M rows)
sql <- sprintf(
"UPDATE %s s SET %s = %s
WHERE NOT EXISTS (
-- Same BLK: break is downstream if measure <= segment measure
SELECT 1 FROM %s b
WHERE (
-- Same BLK: segment is upstream of break if its measure >= break measure
b.%s = s.%s
WHERE b.%s = s.%s
AND b.%s <= s.%s
)
OR (
-- Different BLK: is segment upstream of the break?
b.%s != s.%s
AND EXISTS (
SELECT 1 FROM whse_basemapping.fwa_stream_networks_sp f
WHERE f.%s = b.%s
AND b.%s >= f.%s
AND b.%s < f.%s
AND fwa_upstream(
f.wscode_ltree, f.localcode_ltree,
s.%s, s.%s
)
)
AND NOT EXISTS (
-- Cross BLK: break is downstream on the network (ltree)
SELECT 1 FROM %s b
WHERE b.%s != s.%s
AND b.wscode_ltree IS NOT NULL
AND fwa_upstream(
b.wscode_ltree, b.localcode_ltree,
s.%s, s.%s
)
)
)",
table, label, ifelse(value, "TRUE", "FALSE"), breaks,
blk, blk, # same BLK check
mds, mds, # measure comparison
blk, blk, # different BLK check
blk, blk, # join to FWA
mds, mds, # measure range check (ds)
mds, mus, # measure range check (us)
wsc, loc # ltree columns on working table
table, label, ifelse(value, "TRUE", "FALSE"),
breaks, blk, blk, mds, mds, # same BLK
breaks, blk, blk, wsc, loc # cross BLK
)

# Append user-supplied where filter to scope which rows get classified
Expand Down
31 changes: 31 additions & 0 deletions R/frs_habitat.R
Original file line number Diff line number Diff line change
Expand Up @@ -463,12 +463,43 @@ frs_habitat_access <- function(conn, table, threshold,
}
}

# Enrich breaks with ltree codes for fast cross-BLK classification
.frs_enrich_breaks(conn, to)

.frs_index_working(conn, to)

invisible(conn)
}


#' Attach FWA ltree codes to the rows of a breaks table
#'
#' Makes sure the breaks table carries `wscode_ltree` and `localcode_ltree`
#' columns (added as type `ltree` when missing), then fills them from
#' `whse_basemapping.fwa_stream_networks_sp`. A break is paired with the
#' stream segment that shares its `blue_line_key` and whose measure range
#' `[downstream_route_measure, upstream_route_measure)` contains the break's
#' `downstream_route_measure`. Breaks with no such segment are not touched
#' by the `UPDATE`.
#'
#' Storing the codes on the breaks table lets cross-BLK classification
#' compare ltree values directly instead of joining back to the 4.9M row
#' `fwa_stream_networks_sp` table at classify time.
#'
#' @param conn DBI connection.
#' @param breaks Schema-qualified breaks table name. It is interpolated
#'   into the SQL text as-is.
#' @return Whatever `.frs_db_execute()` returns for the final `UPDATE`.
#' @noRd
.frs_enrich_breaks <- function(conn, breaks) {
  # Add the two ltree columns; IF NOT EXISTS leaves an existing column alone.
  for (ltree_col in c("wscode_ltree", "localcode_ltree")) {
    add_col_sql <- sprintf(
      "ALTER TABLE %s ADD COLUMN IF NOT EXISTS %s ltree", breaks, ltree_col
    )
    .frs_db_execute(conn, add_col_sql)
  }

  # One statement copies both codes from the segment containing each break.
  # The upper bound is exclusive, so a measure sitting exactly on a segment
  # boundary pairs with the segment that starts there.
  update_template <- paste(
    "UPDATE %s b SET",
    "wscode_ltree = f.wscode_ltree,",
    "localcode_ltree = f.localcode_ltree",
    "FROM whse_basemapping.fwa_stream_networks_sp f",
    "WHERE b.blue_line_key = f.blue_line_key",
    "AND b.downstream_route_measure >= f.downstream_route_measure",
    "AND b.downstream_route_measure < f.upstream_route_measure",
    sep = "\n"
  )

  .frs_db_execute(conn, sprintf(update_template, breaks))
}


#' Scope break sources to a WSG code
#'
#' When AOI is a 4-letter WSG code, appends a `watershed_group_code`
Expand Down
18 changes: 9 additions & 9 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@ services:
db:
image: imresamu/postgis:17-3.5-bullseye
container_name: fresh-db
shm_size: 16gb
shm_size: 36gb
command: >
postgres
-c default_statistics_target=100
-c max_connections=40
-c max_locks_per_transaction=64
-c checkpoint_timeout=30min
-c maintenance_work_mem=1GB
-c effective_cache_size=16GB
-c work_mem=1GB
-c maintenance_work_mem=4GB
-c effective_cache_size=96GB
-c work_mem=2GB
-c max_wal_size=10GB
-c wal_buffers=16MB
-c shared_buffers=8GB
-c max_parallel_workers_per_gather=4
-c max_parallel_workers=8
-c max_worker_processes=12
-c wal_buffers=64MB
-c shared_buffers=32GB
-c max_parallel_workers_per_gather=8
-c max_parallel_workers=14
-c max_worker_processes=16
volumes:
- ./postgres-data:/var/lib/postgresql/data
ports:
Expand Down
68 changes: 68 additions & 0 deletions docker/tuning.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# PostgreSQL Tuning

Tuning notes for the local Docker fwapg instance. Settings in `docker-compose.yml` are tuned for the development machine — adjust for your hardware.

## Current machine

- Apple M4 Max
- 128 GB RAM
- 16 cores

## Settings rationale

| Setting | Value | Rule of thumb |
|---------|-------|---------------|
| `shared_buffers` | 32GB | ~25% of RAM |
| `effective_cache_size` | 96GB | ~75% of RAM (tells planner about OS cache) |
| `work_mem` | 2GB | Per-operation sort/hash budget. High because fresh queries do large joins and ltree comparisons |
| `maintenance_work_mem` | 4GB | Faster index builds, VACUUM ANALYZE |
| `wal_buffers` | 64MB | Scales with shared_buffers |
| `max_parallel_workers_per_gather` | 8 | Cores per query — aggressive for single-user dev |
| `max_parallel_workers` | 14 | Leave 2 cores for OS/Docker |
| `max_worker_processes` | 16 | Match core count |
| `shm_size` | 36gb | Must exceed shared_buffers (Docker constraint) |

## Scaling for other machines

**General formula:**

```
shared_buffers = RAM * 0.25
effective_cache_size = RAM * 0.75
work_mem = 1-2GB (fresh workload is join-heavy)
maintenance_work_mem = 2-4GB
max_parallel_workers = cores - 2
max_parallel_workers_per_gather = cores / 2
max_worker_processes = cores
shm_size = shared_buffers + 4GB
```

**Smaller machine (32GB RAM, 8 cores):**

```yaml
shm_size: 12gb
shared_buffers: 8GB
effective_cache_size: 24GB
work_mem: 1GB
maintenance_work_mem: 2GB
max_parallel_workers_per_gather: 4
max_parallel_workers: 6
max_worker_processes: 8
```

## Verifying settings

```bash
docker compose exec db psql -U postgres -d fwapg -c "SHOW shared_buffers; SHOW work_mem; SHOW effective_cache_size;"
```

## Applying changes

Settings live in `docker-compose.yml` command args. Restart to apply:

```bash
docker compose down
docker compose up -d db
```

Data persists in `postgres-data/` — no reload needed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Classify benchmark — ADMS optimized query
## Date: 2026-04-04
## DB: local Docker fwapg (port 5432)
## Branch: optimize-classify
## Issue: #72

## Setup
# WSG: ADMS (11,520 segments)
# Break sources: gradient (28,960) + falls (7) + crossings (3,597)
# Total breaks: 32,564 (all enriched with ltree codes)
# Indexes: blue_line_key, (blue_line_key, downstream_route_measure),
# wscode_ltree GIST+BTREE, localcode_ltree GIST+BTREE

## Changes
# 1. Enrich breaks with wscode_ltree + localcode_ltree from FWA base
# (one-time UPDATE ... FROM fwa_stream_networks_sp after all breaks combined)
# 2. Split classify OR into two separate NOT EXISTS with AND
# - Same BLK: pure measure comparison (btree index)
# - Cross BLK: ltree comparison using enriched codes (GIST index)
# No join to fwa_stream_networks_sp (4.9M rows) at classify time

## Result
# Classify: 0.2s
# Accessible: 1,060 / 11,520 segments (9.2%) — matches baseline

## Comparison
# Before (v0.7.0, indexes only): 742.1s
# After (optimized query): 0.2s
# Speedup: 3,700x
37 changes: 37 additions & 0 deletions scripts/habitat/logs/20260404_habitat_benchmark-adms-optimized.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
## Habitat pipeline benchmark — ADMS optimized
## Date: 2026-04-04
## DB: local Docker fwapg (port 5432)
## Branch: optimize-classify
## Issue: #72

## Setup
# WSG: ADMS (11,520 segments, 5 species: BT, CH, CO, RB, SK)
# Break sources: gradient + falls (7 ADMS) + crossings (3,597 ADMS)
# Optimizations: ltree-enriched breaks, split NOT EXISTS query, indexes

## Results
# Phase 1 (partition prep): 23.3s
# Base extract + indexes: 4.2s
# Access 15% (gradient + falls + crossings + enrich + index): 5.5s
# Access 25%: 4.6s
# Habitat breaks (3 thresholds): 9s
#
# Phase 2 (species classification): 67.2s
# BT: 13.6s, CH: 13.2s, CO: 13.2s, RB: 13.7s, SK: 13.5s
#
# Total: 90.6s

## Comparison
# v0.6.0 remote DB, falls only, no indexes: 88s
# v0.7.0 local Docker, falls only, no indexes: 88s
# v0.7.0 local Docker, falls+crossings, unscoped: 5,888s (bug — 533k province breaks)
# v0.7.0 local Docker, falls+crossings, indexes only: ~840s per species
# Optimized: 90s total (13s/species)

## Breakdown per species (Phase 2)
# ~0.5s: copy base table
# ~3s: frs_col_generate (recompute gradient from geometry)
# ~0.2s: frs_break_apply (split geometry at breaks)
# ~0.2s: frs_classify accessible (optimized — was 742s)
# ~6s: frs_classify spawning/rearing (attribute ranges)
# ~3s: frs_classify lake rearing + categorize
Loading