Skip to content

Commit ee09ea1

Browse files
authored
Merge pull request #601 from Police-Data-Accessibility-Project/issue-566-optimize-annotation-load-time
feat(db): improve `GET /annotate/all` performance
2 parents b52c99e + 9e5c124 commit ee09ea1

19 files changed

Lines changed: 851 additions & 15 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
name: Annotation Benchmark
2+
3+
on:
4+
workflow_dispatch:
5+
pull_request:
6+
branches:
7+
- main
8+
9+
jobs:
10+
benchmark:
11+
runs-on: ubuntu-latest
12+
timeout-minutes: 30
13+
container: python:3.11.9
14+
15+
services:
16+
postgres:
17+
image: postgres:15
18+
env:
19+
POSTGRES_PASSWORD: postgres
20+
options: >-
21+
--health-cmd pg_isready
22+
--health-interval 10s
23+
--health-timeout 5s
24+
--health-retries 5
25+
26+
env:
27+
POSTGRES_PASSWORD: postgres
28+
POSTGRES_USER: postgres
29+
POSTGRES_DB: postgres
30+
POSTGRES_HOST: postgres
31+
POSTGRES_PORT: 5432
32+
GOOGLE_API_KEY: TEST
33+
GOOGLE_CSE_ID: TEST
34+
PROFILE_DIR: profiles
35+
36+
steps:
37+
- name: Checkout repository
38+
uses: actions/checkout@v4
39+
40+
- name: Install uv and set the python version
41+
uses: astral-sh/setup-uv@v5
42+
43+
- name: Install the project
44+
run: uv sync --locked --group dev
45+
46+
- name: Create profiles directory
47+
run: mkdir -p profiles
48+
49+
- name: Run benchmark tests
50+
run: |
51+
uv run pytest tests/automated/integration/benchmark \
52+
-m "manual and benchmark" \
53+
--benchmark-json=benchmark-results.json \
54+
-v
55+
56+
- name: Post benchmark summary
57+
run: uv run python scripts/post_benchmark_summary.py
58+
59+
- name: Upload benchmark results
60+
uses: actions/upload-artifact@v4
61+
with:
62+
name: benchmark-results-${{ github.sha }}
63+
path: |
64+
benchmark-results.json
65+
profiles/
66+
retention-days: 90
Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
"""Materialize url_annotation_count_view and url_annotation_flags
2+
3+
Revision ID: c8e4f1a2b3d5
4+
Revises: 1fb2286a016c
5+
Create Date: 2026-02-26 00:00:00.000000
6+
7+
"""
8+
from typing import Optional, Sequence, Union
9+
10+
from alembic import op
11+
12+
13+
# revision identifiers, used by Alembic.
14+
revision: str = 'c8e4f1a2b3d5'
15+
down_revision: Optional[str] = '1fb2286a016c'
16+
branch_labels: Union[str, Sequence[str], None] = None
17+
depends_on: Union[str, Sequence[str], None] = None
18+
19+
_URL_ANNOTATION_COUNT_VIEW_SQL = """
20+
WITH
21+
auto_location_count AS (
22+
SELECT
23+
u_1.id,
24+
count(anno.url_id) AS cnt
25+
FROM
26+
urls u_1
27+
JOIN annotation__location__auto__subtasks anno
28+
ON u_1.id = anno.url_id
29+
GROUP BY
30+
u_1.id
31+
)
32+
, auto_agency_count AS (
33+
SELECT
34+
u_1.id,
35+
count(anno.url_id) AS cnt
36+
FROM
37+
urls u_1
38+
JOIN annotation__agency__auto__subtasks anno
39+
ON u_1.id = anno.url_id
40+
GROUP BY
41+
u_1.id
42+
)
43+
, auto_url_type_count AS (
44+
SELECT
45+
u_1.id,
46+
count(anno.url_id) AS cnt
47+
FROM
48+
urls u_1
49+
JOIN annotation__url_type__auto anno
50+
ON u_1.id = anno.url_id
51+
GROUP BY
52+
u_1.id
53+
)
54+
, auto_record_type_count AS (
55+
SELECT
56+
u_1.id,
57+
count(anno.url_id) AS cnt
58+
FROM
59+
urls u_1
60+
JOIN annotation__record_type__auto anno
61+
ON u_1.id = anno.url_id
62+
GROUP BY
63+
u_1.id
64+
)
65+
, user_location_count AS (
66+
SELECT
67+
u_1.id,
68+
count(anno.url_id) AS cnt
69+
FROM
70+
urls u_1
71+
JOIN annotation__location__user anno
72+
ON u_1.id = anno.url_id
73+
GROUP BY
74+
u_1.id
75+
)
76+
, user_agency_count AS (
77+
SELECT
78+
u_1.id,
79+
count(anno.url_id) AS cnt
80+
FROM
81+
urls u_1
82+
JOIN annotation__agency__user anno
83+
ON u_1.id = anno.url_id
84+
GROUP BY
85+
u_1.id
86+
)
87+
, user_url_type_count AS (
88+
SELECT
89+
u_1.id,
90+
count(anno.url_id) AS cnt
91+
FROM
92+
urls u_1
93+
JOIN annotation__url_type__user anno
94+
ON u_1.id = anno.url_id
95+
GROUP BY
96+
u_1.id
97+
)
98+
, user_record_type_count AS (
99+
SELECT
100+
u_1.id,
101+
count(anno.url_id) AS cnt
102+
FROM
103+
urls u_1
104+
JOIN annotation__record_type__user anno
105+
ON u_1.id = anno.url_id
106+
GROUP BY
107+
u_1.id
108+
)
109+
, anon_location_count AS (
110+
SELECT
111+
u_1.id,
112+
count(anno.url_id) AS cnt
113+
FROM
114+
urls u_1
115+
JOIN annotation__location__anon anno
116+
ON u_1.id = anno.url_id
117+
GROUP BY
118+
u_1.id
119+
)
120+
, anon_agency_count AS (
121+
SELECT
122+
u_1.id,
123+
count(anno.url_id) AS cnt
124+
FROM
125+
urls u_1
126+
JOIN annotation__agency__anon anno
127+
ON u_1.id = anno.url_id
128+
GROUP BY
129+
u_1.id
130+
)
131+
, anon_url_type_count AS (
132+
SELECT
133+
u_1.id,
134+
count(anno.url_id) AS cnt
135+
FROM
136+
urls u_1
137+
JOIN annotation__url_type__anon anno
138+
ON u_1.id = anno.url_id
139+
GROUP BY
140+
u_1.id
141+
)
142+
, anon_record_type_count AS (
143+
SELECT
144+
u_1.id,
145+
count(anno.url_id) AS cnt
146+
FROM
147+
urls u_1
148+
JOIN annotation__record_type__anon anno
149+
ON u_1.id = anno.url_id
150+
GROUP BY
151+
u_1.id
152+
)
153+
SELECT
154+
u.id AS url_id,
155+
COALESCE(auto_ag.cnt, 0::bigint) AS auto_agency_count,
156+
COALESCE(auto_loc.cnt, 0::bigint) AS auto_location_count,
157+
COALESCE(auto_rec.cnt, 0::bigint) AS auto_record_type_count,
158+
COALESCE(auto_typ.cnt, 0::bigint) AS auto_url_type_count,
159+
COALESCE(user_ag.cnt, 0::bigint) AS user_agency_count,
160+
COALESCE(user_loc.cnt, 0::bigint) AS user_location_count,
161+
COALESCE(user_rec.cnt, 0::bigint) AS user_record_type_count,
162+
COALESCE(user_typ.cnt, 0::bigint) AS user_url_type_count,
163+
COALESCE(anon_ag.cnt, 0::bigint) AS anon_agency_count,
164+
COALESCE(anon_loc.cnt, 0::bigint) AS anon_location_count,
165+
COALESCE(anon_rec.cnt, 0::bigint) AS anon_record_type_count,
166+
COALESCE(anon_typ.cnt, 0::bigint) AS anon_url_type_count,
167+
COALESCE(auto_ag.cnt, 0::bigint) + COALESCE(auto_loc.cnt, 0::bigint) + COALESCE(auto_rec.cnt, 0::bigint) +
168+
COALESCE(auto_typ.cnt, 0::bigint) + COALESCE(user_ag.cnt, 0::bigint) + COALESCE(user_loc.cnt, 0::bigint) +
169+
COALESCE(user_rec.cnt, 0::bigint) + COALESCE(user_typ.cnt, 0::bigint) + COALESCE(anon_ag.cnt, 0::bigint) +
170+
COALESCE(anon_loc.cnt, 0::bigint) + COALESCE(anon_rec.cnt, 0::bigint) + COALESCE(anon_typ.cnt, 0::bigint) AS total_anno_count
171+
172+
FROM
173+
urls u
174+
LEFT JOIN auto_agency_count auto_ag
175+
ON auto_ag.id = u.id
176+
LEFT JOIN auto_location_count auto_loc
177+
ON auto_loc.id = u.id
178+
LEFT JOIN auto_record_type_count auto_rec
179+
ON auto_rec.id = u.id
180+
LEFT JOIN auto_url_type_count auto_typ
181+
ON auto_typ.id = u.id
182+
LEFT JOIN user_agency_count user_ag
183+
ON user_ag.id = u.id
184+
LEFT JOIN user_location_count user_loc
185+
ON user_loc.id = u.id
186+
LEFT JOIN user_record_type_count user_rec
187+
ON user_rec.id = u.id
188+
LEFT JOIN user_url_type_count user_typ
189+
ON user_typ.id = u.id
190+
LEFT JOIN anon_agency_count anon_ag
191+
ON anon_ag.id = u.id
192+
LEFT JOIN anon_location_count anon_loc
193+
ON anon_loc.id = u.id
194+
LEFT JOIN anon_record_type_count anon_rec
195+
ON anon_rec.id = u.id
196+
LEFT JOIN anon_url_type_count anon_typ
197+
ON anon_typ.id = u.id
198+
"""
199+
200+
_URL_ANNOTATION_FLAGS_SQL = """
201+
SELECT u.id as url_id,
202+
EXISTS (SELECT 1 FROM public.annotation__record_type__auto a WHERE a.url_id = u.id) AS has_auto_record_type_suggestion,
203+
EXISTS (SELECT 1 FROM public.annotation__url_type__auto a WHERE a.url_id = u.id) AS has_auto_relevant_suggestion,
204+
EXISTS (SELECT 1 FROM public.annotation__agency__auto__subtasks a WHERE a.url_id = u.id) AS has_auto_agency_suggestion,
205+
EXISTS (SELECT 1 FROM public.annotation__location__auto__subtasks a WHERE a.url_id = u.id) AS has_auto_location_suggestion,
206+
EXISTS (SELECT 1 FROM public.annotation__record_type__user a WHERE a.url_id = u.id) AS has_user_record_type_suggestion,
207+
EXISTS (SELECT 1 FROM public.annotation__url_type__user a WHERE a.url_id = u.id) AS has_user_relevant_suggestion,
208+
EXISTS (SELECT 1 FROM public.annotation__agency__user a WHERE a.url_id = u.id) AS has_user_agency_suggestion,
209+
EXISTS (SELECT 1 FROM public.annotation__location__user a WHERE a.url_id = u.id) AS has_user_location_suggestion,
210+
EXISTS (SELECT 1 FROM public.link_agencies__urls a WHERE a.url_id = u.id) AS has_confirmed_agency,
211+
EXISTS (SELECT 1 FROM public.reviewing_user_url a WHERE a.url_id = u.id) AS was_reviewed
212+
FROM urls u
213+
"""
214+
215+
216+
def upgrade() -> None:
217+
"""Convert url_annotation_count_view and url_annotation_flags to materialized views."""
# Both views are dropped and then re-created from the module-level SQL
# constants, so the materialized definitions use the same queries.
# NOTE(review): a materialized view holds a snapshot of its query result and
# only changes when REFRESH MATERIALIZED VIEW is run; nothing in this
# migration schedules a refresh -- confirm the application does so.
218+
# Drop regular views
# IF EXISTS keeps the statement from failing when a view is already absent.
219+
op.execute("DROP VIEW IF EXISTS url_annotation_count_view")
220+
op.execute("DROP VIEW IF EXISTS url_annotation_flags")
221+
222+
# Recreate as materialized views
223+
op.execute(
224+
f"CREATE MATERIALIZED VIEW url_annotation_count_view AS {_URL_ANNOTATION_COUNT_VIEW_SQL}"
225+
)
226+
op.execute(
227+
f"CREATE MATERIALIZED VIEW url_annotation_flags AS {_URL_ANNOTATION_FLAGS_SQL}"
228+
)
229+
230+
# Unique indexes required for REFRESH MATERIALIZED VIEW CONCURRENTLY
# The indexes are created without explicit names, so the database assigns them.
231+
op.execute("CREATE UNIQUE INDEX ON url_annotation_count_view (url_id)")
232+
op.execute("CREATE UNIQUE INDEX ON url_annotation_flags (url_id)")
233+
234+
235+
def downgrade() -> None:
236+
"""Revert url_annotation_count_view and url_annotation_flags to regular views."""
# The materialized views are removed first so the same names can be reused
# for the regular views created below.
237+
op.execute("DROP MATERIALIZED VIEW IF EXISTS url_annotation_count_view")
238+
op.execute("DROP MATERIALIZED VIEW IF EXISTS url_annotation_flags")
239+
240+
# Recreate as regular views
# NOTE(review): the two statements below use different forms (plain
# CREATE VIEW vs. CREATE OR REPLACE VIEW with a parenthesised query). Both
# names were just dropped, so OR REPLACE looks unnecessary -- consider
# making the two statements uniform.
241+
op.execute(
242+
f"CREATE VIEW url_annotation_count_view AS {_URL_ANNOTATION_COUNT_VIEW_SQL}"
243+
)
244+
op.execute(
245+
f"CREATE OR REPLACE VIEW url_annotation_flags AS ({_URL_ANNOTATION_FLAGS_SQL})"
246+
)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""merge_heads
2+
3+
Revision ID: f831e447b1cb
4+
Revises: c8e4f1a2b3d5, 94e2b850fb30, c2f46d1af640
5+
Create Date: 2026-03-09 17:09:11.129775
6+
7+
"""
8+
from typing import Optional, Sequence
9+
10+
11+
# revision identifiers, used by Alembic.
12+
revision: str = 'f831e447b1cb'
13+
down_revision: Optional[tuple[str, ...]] = ('c8e4f1a2b3d5', '94e2b850fb30', 'c2f46d1af640')
14+
branch_labels: Optional[str | Sequence[str]] = None
15+
depends_on: Optional[str | Sequence[str]] = None
16+
17+
18+
def upgrade() -> None:
19+
"""Merge multiple heads."""
# Intentionally empty: this revision performs no schema changes and only
# joins the parent revisions listed in down_revision.
20+
pass
21+
22+
23+
def downgrade() -> None:
24+
"""Downgrade merge."""
# Intentionally empty: the merge revision made no schema changes, so there
# is nothing to undo here.
25+
pass

docs/development.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@ At minimum, you need the database connection variables:
3535

3636
```dotenv
3737
POSTGRES_USER=test_source_collector_user
38-
POSTGRES_PASSWORD=HanviliciousHamiltonHilltops
38+
POSTGRES_PASSWORD=<see local_database/docker-compose.yml>
3939
POSTGRES_DB=source_collector_test_db
4040
POSTGRES_HOST=127.0.0.1
4141
POSTGRES_PORT=5432
4242
DEV=true
4343
```
4444

45-
These match the defaults in `local_database/docker-compose.yml`.
45+
The password and other defaults are defined in `local_database/docker-compose.yml`.
4646

4747
### API Keys
4848

pyproject.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,14 @@ dev = [
5656
"pytest>=7.2.2",
5757
"pytest-asyncio~=0.25.2",
5858
"pytest-mock==3.12.0",
59+
"pytest-benchmark~=5.2",
5960
"pytest-timeout~=2.3.1",
6061
"vulture>=2.14",
62+
"flake8>=7.3.0",
63+
"flake8-docstrings>=1.7.0",
64+
"flake8-simplify>=0.30.0",
65+
"flake8-unused-arguments>=0.0.14",
66+
"flake8-annotations>=3.2.0",
67+
"pyinstrument>=4.6.0",
6168
]
6269

pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ timeout = 300
33
asyncio_default_fixture_loop_scope=function
44
markers =
55
manual: mark test as manual-only (excluded from default test runs)
6+
benchmark: mark test as a performance benchmark (subset of manual)
67
asyncio_mode = auto

0 commit comments

Comments
 (0)