Skip to content

Commit d744613

Browse files
committed
Align Bosnia fetcher with design docs
1 parent e810269 commit d744613

5 files changed

Lines changed: 181 additions & 75 deletions

rivretrieve/bosnia_herzegovina.py

Lines changed: 78 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,26 @@
1414

1515

1616
class BosniaHerzegovinaFetcher(base.RiverDataFetcher):
17-
"""Fetches river gauge data from vodostaji.voda.ba.
18-
19-
Data Source: [Federal Hydrometeorological Institute portal](https://vodostaji.voda.ba/#2031)
20-
21-
Supported Variables:
22-
- ``constants.DISCHARGE_DAILY_MEAN`` (m3/s)
23-
- ``constants.DISCHARGE_INSTANT`` (m3/s)
24-
- ``constants.STAGE_DAILY_MEAN`` (m)
25-
- ``constants.STAGE_INSTANT`` (m)
26-
- ``constants.WATER_TEMPERATURE_DAILY_MEAN`` (degC)
27-
- ``constants.WATER_TEMPERATURE_INSTANT`` (degC)
17+
"""Fetches river gauge data from the Federal Hydrometeorological Institute portal.
18+
19+
Data source:
20+
- https://vodostaji.voda.ba/
21+
22+
Supported variables:
23+
- constants.DISCHARGE_DAILY_MEAN (m³/s)
24+
- constants.DISCHARGE_INSTANT (m³/s)
25+
- constants.STAGE_DAILY_MEAN (m)
26+
- constants.STAGE_INSTANT (m)
27+
- constants.WATER_TEMPERATURE_DAILY_MEAN (°C)
28+
- constants.WATER_TEMPERATURE_INSTANT (°C)
29+
30+
Data description and API:
31+
- live station metadata snapshot: https://vodostaji.voda.ba/data/internet/layers/20/index.json
32+
- annual station workbooks:
33+
https://vodostaji.voda.ba/data/internet/stations/<group>/<gauge_id>/<parameter>/<file>
34+
35+
Terms of use:
36+
- see https://vodostaji.voda.ba/
2837
"""
2938

3039
METADATA_URL = "https://vodostaji.voda.ba/data/internet/layers/20/index.json"
@@ -52,6 +61,16 @@ class BosniaHerzegovinaFetcher(base.RiverDataFetcher):
5261
},
5362
}
5463

64+
@staticmethod
65+
def _empty_result(variable: str) -> pd.DataFrame:
66+
"""Returns an empty standardized RivRetrieve time series frame."""
67+
return pd.DataFrame(columns=[constants.TIME_INDEX, variable]).set_index(constants.TIME_INDEX)
68+
69+
@staticmethod
70+
def _empty_metadata() -> pd.DataFrame:
71+
"""Returns an empty metadata frame indexed by gauge ID."""
72+
return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID)
73+
5574
@staticmethod
5675
def get_cached_metadata() -> pd.DataFrame:
5776
"""Retrieves cached Bosnia and Herzegovina gauge metadata."""
@@ -62,7 +81,11 @@ def get_available_variables() -> tuple[str, ...]:
6281
return tuple(BosniaHerzegovinaFetcher.VARIABLE_MAP.keys())
6382

6483
def get_metadata(self) -> pd.DataFrame:
65-
"""Downloads and normalizes station metadata from the live JSON snapshot."""
84+
"""Downloads and normalizes station metadata from the live JSON snapshot.
85+
86+
Keeps provider-specific metadata columns, standardizes the key RivRetrieve
87+
metadata fields, and returns a DataFrame indexed by ``constants.GAUGE_ID``.
88+
"""
6689
session = utils.requests_retry_session()
6790

6891
try:
@@ -77,7 +100,7 @@ def get_metadata(self) -> pd.DataFrame:
77100
raise
78101

79102
if not isinstance(data, list) or not data:
80-
return pd.DataFrame().set_index(constants.GAUGE_ID)
103+
return self._empty_metadata()
81104

82105
df = pd.json_normalize(data)
83106
rename_map = {
@@ -163,21 +186,29 @@ def _parse_data(
163186
"""Parses the Excel bytes into the standard RivRetrieve data frame layout."""
164187
content, station_group = raw_data
165188
if not content:
166-
return pd.DataFrame(columns=[constants.TIME_INDEX, variable]).set_index(constants.TIME_INDEX)
189+
return self._empty_result(variable)
167190

168191
try:
169-
df = pd.read_excel(BytesIO(content), skiprows=8, names=[constants.TIME_INDEX, variable])
192+
df = pd.read_excel(
193+
BytesIO(content),
194+
skiprows=8,
195+
header=None,
196+
names=[constants.TIME_INDEX, variable],
197+
)
170198
except Exception as exc:
171199
logger.error(f"Failed to parse Bosnia and Herzegovina data for {gauge_id}: {exc}")
172-
return pd.DataFrame(columns=[constants.TIME_INDEX, variable]).set_index(constants.TIME_INDEX)
200+
return self._empty_result(variable)
173201

174202
if df.empty:
175-
return pd.DataFrame(columns=[constants.TIME_INDEX, variable]).set_index(constants.TIME_INDEX)
203+
return self._empty_result(variable)
176204

177205
df[constants.TIME_INDEX] = pd.to_datetime(df[constants.TIME_INDEX], dayfirst=True, errors="coerce")
178206
df[variable] = pd.to_numeric(df[variable], errors="coerce")
179207
df = df.dropna(subset=[constants.TIME_INDEX, variable])
180208

209+
if variable in {constants.STAGE_DAILY_MEAN, constants.STAGE_INSTANT}:
210+
df[variable] = df[variable] / 100.0
211+
181212
if variable in {
182213
constants.DISCHARGE_DAILY_MEAN,
183214
constants.STAGE_DAILY_MEAN,
@@ -199,18 +230,44 @@ def get_data(
199230
start_date: Optional[str] = None,
200231
end_date: Optional[str] = None,
201232
) -> pd.DataFrame:
202-
"""Fetches and parses time series data for a specific gauge and variable."""
233+
"""Fetches and parses time series data for a specific gauge and variable.
234+
235+
This method retrieves the requested data from the provider's workbook endpoint,
236+
parses it, and returns it in a standardized pandas DataFrame format.
237+
238+
Args:
239+
gauge_id: The site-specific identifier for the gauge.
240+
variable: The variable to fetch. Must be one of the strings listed
241+
in the fetcher's ``get_available_variables()`` output.
242+
These are typically defined in ``rivretrieve.constants``.
243+
start_date: Optional start date for the data retrieval in 'YYYY-MM-DD' format.
244+
If None, data is fetched from the earliest available date in the workbook.
245+
end_date: Optional end date for the data retrieval in 'YYYY-MM-DD' format.
246+
If None, data is fetched up to the latest available date in the workbook.
247+
248+
Returns:
249+
pd.DataFrame: A pandas DataFrame indexed by datetime objects (``constants.TIME_INDEX``)
250+
with a single column named after the requested ``variable``. The DataFrame
251+
will be empty if no data is found for the given parameters.
252+
253+
Raises:
254+
ValueError: If the requested ``variable`` is not supported by this fetcher.
255+
"""
203256
start_date = utils.format_start_date(start_date)
204257
end_date = utils.format_end_date(end_date)
205258

206259
if variable not in self.get_available_variables():
207260
raise ValueError(f"Unsupported variable: {variable}")
208261

209-
raw_data = self._download_data(gauge_id, variable, start_date, end_date)
210-
df = self._parse_data(gauge_id, raw_data, variable)
262+
try:
263+
raw_data = self._download_data(gauge_id, variable, start_date, end_date)
264+
df = self._parse_data(gauge_id, raw_data, variable)
265+
except Exception as exc:
266+
logger.error(f"Failed to fetch Bosnia and Herzegovina data for {gauge_id} ({variable}): {exc}")
267+
return self._empty_result(variable)
211268

212269
if df.empty:
213-
return df
270+
return self._empty_result(variable)
214271

215272
start_dt = pd.to_datetime(start_date)
216273
end_dt = pd.to_datetime(end_date) + pd.Timedelta(days=1)

tests/test_bosnia_herzegovina.py

Lines changed: 103 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
import os
32
import unittest
43
from pathlib import Path
54
from unittest.mock import MagicMock, patch
@@ -14,111 +13,161 @@
1413
class TestBosniaHerzegovinaFetcher(unittest.TestCase):
1514
def setUp(self):
1615
self.fetcher = BosniaHerzegovinaFetcher()
17-
self.test_data_dir = Path(os.path.dirname(__file__)) / "test_data"
16+
self.test_data_dir = Path(__file__).parent / "test_data"
1817

1918
def _load_json(self, filename):
20-
with open(self.test_data_dir / filename, "r", encoding="utf-8") as f:
21-
return json.load(f)
19+
with (self.test_data_dir / filename).open("r", encoding="utf-8") as file_handle:
20+
return json.load(file_handle)
21+
22+
def _load_bytes(self, filename):
23+
return (self.test_data_dir / filename).read_bytes()
24+
25+
@staticmethod
26+
def _build_response(status_code=200, content=b"", json_data=None):
27+
response = MagicMock()
28+
response.status_code = status_code
29+
response.content = content
30+
response.json.return_value = json_data
31+
response.raise_for_status = MagicMock()
32+
return response
2233

2334
@patch("rivretrieve.utils.requests_retry_session")
2435
def test_get_metadata(self, mock_requests_session):
2536
mock_session = MagicMock()
2637
mock_requests_session.return_value = mock_session
2738

28-
mock_response = MagicMock()
29-
mock_response.json.return_value = self._load_json("bosnia_herzegovina_metadata_sample.json")
30-
mock_response.raise_for_status = MagicMock()
39+
mock_response = self._build_response(
40+
json_data=self._load_json("bosnia_herzegovina_metadata_sample.json")
41+
)
3142
mock_session.get.return_value = mock_response
3243

3344
result_df = self.fetcher.get_metadata()
3445

46+
self.assertEqual(result_df.index.name, constants.GAUGE_ID)
3547
self.assertEqual(list(result_df.index), ["4510", "4121"])
3648
self.assertEqual(result_df.loc["4510", constants.STATION_NAME], "HS Kaloševići")
3749
self.assertEqual(result_df.loc["4510", constants.RIVER], "Usora")
3850
self.assertAlmostEqual(result_df.loc["4510", constants.LATITUDE], 44.64680728070949)
3951
self.assertAlmostEqual(result_df.loc["4510", constants.LONGITUDE], 17.90406242892678)
52+
self.assertIn("metadata_station_carteasting", result_df.columns)
53+
self.assertIn("catchment", result_df.columns)
4054
self.assertEqual(result_df.loc["4510", constants.COUNTRY], "Bosnia and Herzegovina")
4155
self.assertEqual(result_df.loc["4510", constants.SOURCE], "vodostaji.voda.ba")
4256
self.assertAlmostEqual(result_df.loc["4121", constants.AREA], 123.4)
57+
mock_session.get.assert_called_once_with(self.fetcher.METADATA_URL, timeout=30)
4358

44-
@patch("pandas.read_excel")
4559
@patch("rivretrieve.utils.requests_retry_session")
46-
def test_get_data_instant_discharge_detects_station_group(self, mock_requests_session, mock_read_excel):
60+
def test_get_data_instant_discharge_detects_station_group(self, mock_requests_session):
4761
mock_session = MagicMock()
4862
mock_requests_session.return_value = mock_session
49-
mock_read_excel.return_value = pd.DataFrame(
50-
{
51-
constants.TIME_INDEX: [
52-
"01.01.2025 00:00",
53-
"01.01.2025 01:00",
54-
"01.01.2025 02:00",
55-
"02.01.2025 00:00",
56-
],
57-
constants.DISCHARGE_INSTANT: [1.0, 2.0, 3.0, 4.0],
58-
}
59-
)
6063

61-
missing_response = MagicMock(status_code=404, content=b"")
62-
success_response = MagicMock(status_code=200, content=b"fake-xlsx-content")
63-
mock_session.get.side_effect = [missing_response, missing_response, success_response]
64+
missing_response = self._build_response(status_code=404)
65+
success_response = self._build_response(
66+
status_code=200,
67+
content=self._load_bytes("bosnia_herzegovina_4510_discharge_20250323.xlsx"),
68+
)
69+
mock_session.get.side_effect = [missing_response, missing_response, missing_response, success_response]
6470

6571
result_df = self.fetcher.get_data(
6672
gauge_id="4510",
6773
variable=constants.DISCHARGE_INSTANT,
68-
start_date="2025-01-01",
69-
end_date="2025-01-01",
74+
start_date="2025-03-23",
75+
end_date="2025-03-23",
7076
)
7177

7278
expected_df = pd.DataFrame(
7379
{
74-
constants.TIME_INDEX: pd.to_datetime(
75-
["2025-01-01 00:00:00", "2025-01-01 01:00:00", "2025-01-01 02:00:00"]
76-
),
77-
constants.DISCHARGE_INSTANT: [1.0, 2.0, 3.0],
80+
constants.TIME_INDEX: pd.date_range("2025-03-23 00:00:00", periods=24, freq="h"),
81+
constants.DISCHARGE_INSTANT: [
82+
8.304,
83+
7.958,
84+
8.105,
85+
8.007,
86+
7.909,
87+
7.762,
88+
7.958,
89+
7.665,
90+
7.713,
91+
8.205,
92+
8.007,
93+
7.328,
94+
7.860,
95+
8.105,
96+
7.568,
97+
7.811,
98+
7.958,
99+
7.762,
100+
7.665,
101+
7.280,
102+
7.568,
103+
7.472,
104+
7.472,
105+
7.280,
106+
],
78107
}
79108
).set_index(constants.TIME_INDEX)
80109

81-
assert_frame_equal(result_df, expected_df)
82-
self.assertEqual(result_df.attrs["station_group"], 3)
83-
self.assertEqual(mock_session.get.call_count, 3)
110+
assert_frame_equal(result_df, expected_df, check_dtype=False)
111+
self.assertEqual(result_df.index.name, constants.TIME_INDEX)
112+
self.assertEqual(result_df.attrs["station_group"], 4)
113+
self.assertEqual(mock_session.get.call_count, 4)
84114
self.assertIn("/1/4510/Q/Q_1Y.xlsx", mock_session.get.call_args_list[0].args[0])
85-
self.assertIn("/3/4510/Q/Q_1Y.xlsx", mock_session.get.call_args_list[2].args[0])
115+
self.assertIn("/4/4510/Q/Q_1Y.xlsx", mock_session.get.call_args_list[3].args[0])
116+
self.assertTrue(all(call.kwargs["timeout"] == 20 for call in mock_session.get.call_args_list))
86117

87-
@patch("pandas.read_excel")
88118
@patch("rivretrieve.utils.requests_retry_session")
89-
def test_get_data_daily_temperature(self, mock_requests_session, mock_read_excel):
119+
def test_get_data_daily_stage_converts_centimeters_to_meters(self, mock_requests_session):
90120
mock_session = MagicMock()
91121
mock_requests_session.return_value = mock_session
92-
mock_read_excel.return_value = pd.DataFrame(
93-
{
94-
constants.TIME_INDEX: [
95-
"01.01.2025 00:00",
96-
"01.01.2025 12:00",
97-
"02.01.2025 00:00",
98-
"02.01.2025 12:00",
99-
],
100-
constants.WATER_TEMPERATURE_DAILY_MEAN: [10.0, 11.0, 12.0, 12.0],
101-
}
122+
missing_response = self._build_response(status_code=404)
123+
success_response = self._build_response(
124+
status_code=200,
125+
content=self._load_bytes("bosnia_herzegovina_4510_stage_20250323.xlsx"),
102126
)
103-
104-
success_response = MagicMock(status_code=200, content=b"fake-xlsx-content")
105-
mock_session.get.return_value = success_response
127+
mock_session.get.side_effect = [missing_response, missing_response, missing_response, success_response]
106128

107129
result_df = self.fetcher.get_data(
108130
gauge_id="4510",
109-
variable=constants.WATER_TEMPERATURE_DAILY_MEAN,
110-
start_date="2025-01-01",
111-
end_date="2025-01-02",
131+
variable=constants.STAGE_DAILY_MEAN,
132+
start_date="2025-03-23",
133+
end_date="2025-03-24",
112134
)
113135

114136
expected_df = pd.DataFrame(
115137
{
116-
constants.TIME_INDEX: pd.to_datetime(["2025-01-01", "2025-01-02"]),
117-
constants.WATER_TEMPERATURE_DAILY_MEAN: [10.5, 12.0],
138+
constants.TIME_INDEX: pd.to_datetime(["2025-03-23", "2025-03-24"]),
139+
constants.STAGE_DAILY_MEAN: [0.8113333333333334, 0.9504166666666667],
118140
}
119141
).set_index(constants.TIME_INDEX)
120142

121-
assert_frame_equal(result_df, expected_df)
143+
assert_frame_equal(result_df, expected_df, check_dtype=False)
144+
self.assertIn("/4/4510/H/H_1Y.xlsx", mock_session.get.call_args_list[3].args[0])
145+
146+
@patch("rivretrieve.utils.requests_retry_session")
147+
def test_get_data_returns_standardized_empty_frame_for_empty_temperature_workbook(self, mock_requests_session):
148+
mock_session = MagicMock()
149+
mock_requests_session.return_value = mock_session
150+
missing_response = self._build_response(status_code=404)
151+
success_response = self._build_response(
152+
status_code=200,
153+
content=self._load_bytes("bosnia_herzegovina_4510_water_temperature_20250323.xlsx"),
154+
)
155+
mock_session.get.side_effect = [missing_response, missing_response, missing_response, success_response]
156+
157+
result_df = self.fetcher.get_data(
158+
gauge_id="4510",
159+
variable=constants.WATER_TEMPERATURE_INSTANT,
160+
start_date="2025-03-23",
161+
end_date="2025-03-23",
162+
)
163+
164+
expected_df = pd.DataFrame(
165+
columns=[constants.TIME_INDEX, constants.WATER_TEMPERATURE_INSTANT]
166+
).set_index(constants.TIME_INDEX)
167+
168+
assert_frame_equal(result_df, expected_df, check_dtype=False)
169+
self.assertEqual(result_df.index.name, constants.TIME_INDEX)
170+
self.assertIn("/4/4510/WT/Tvode_1Y.xlsx", mock_session.get.call_args_list[3].args[0])
122171

123172

124173
if __name__ == "__main__":
122 KB
Binary file not shown.
119 KB
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)