Coverage for brodata/gm.py: 66% (217 statements)
« prev ^ index » next — coverage.py v7.13.5, created at 2026-03-20 14:37 +0000
1import os
2from zipfile import ZipFile
3import logging
4import requests
5import urllib.request
6import json
7import pandas as pd
8import geopandas as gpd
9from shapely.geometry import MultiPolygon, Polygon
10from . import gmw, gld, gar, util
# Base URL of the PDOK OGC API serving BRO "grondwatermonitoring in samenhang"
# (coherent groundwater monitoring) characteristics.
base_url = "https://api.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/ogc/v1"

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
def conformance():
    """Return the conformance declaration of the OGC API as a dict.

    Raises
    ------
    Exception
        If the HTTP request does not succeed.
    """
    url = f"{base_url}/conformance"
    response = requests.get(url, params={"f": "json"})
    if not response.ok:
        raise Exception(f"Retrieving data from {url} failed")
    return response.json()
def collections():
    """Return the available collections of the OGC API as a dict.

    NOTE: the function name shadows the stdlib ``collections`` module name,
    but the module is not imported here, so there is no conflict in this file.

    Raises
    ------
    Exception
        If the HTTP request does not succeed.
    """
    url = f"{base_url}/collections"
    response = requests.get(url, params={"f": "json"})
    if not response.ok:
        raise Exception(f"Retrieving data from {url} failed")
    return response.json()
def gm_gld_collection():
    """Return the description of the ``gm_gld`` collection as a dict.

    Raises
    ------
    Exception
        If the HTTP request does not succeed.
    """
    url = f"{base_url}/collections/gm_gld"
    response = requests.get(url, params={"f": "json"})
    if not response.ok:
        raise Exception(f"Retrieving data from {url} failed")
    return response.json()
def _gm_items(
    url,
    extent=None,
    crs="http://www.opengis.net/def/crs/EPSG/0/28992",
    limit=1000,
    time_columns=None,
    to_file=None,
    zipfile=None,
    redownload=False,
    **kwargs,
):
    """
    Fetches and parses geospatial features from a GeoJSON endpoint, with optional
    filtering, pagination support, and time column localization.

    Retrieves data from a remote URL, a local file, or within a zip archive. Supports
    bounding box filtering, CRS specification, and conversion of datetime columns to
    Dutch winter time (UTC+1).

    Parameters
    ----------
    url : str
        The base URL to request the GeoJSON data from.
    extent : list, tuple, shapely.geometry.Polygon or shapely.geometry.MultiPolygon, optional
        The spatial extent ([xmin, xmax, ymin, ymax]) or polygon geometry to filter the data.
        When a polygon is provided, its bounding box is used for the spatial query.
    crs : string, optional
        The coordinate reference system of the requested extent and the geometries in
        the response. Possible values are:
        http://www.opengis.net/def/crs/OGC/1.3/CRS84
        http://www.opengis.net/def/crs/EPSG/0/28992
        http://www.opengis.net/def/crs/EPSG/0/3857
        http://www.opengis.net/def/crs/EPSG/0/4258
        The default is "http://www.opengis.net/def/crs/EPSG/0/28992".
    limit : int, optional
        Limits the number of items that are presented in the response document. The
        maximum allowed value is 1000. The default is 1000.
    time_columns : list of str, optional
        Names of columns containing datetime values to convert to Dutch winter time.
        If None, columns ending with '_time' are automatically selected.
    to_file : str, optional
        Path to save the downloaded GeoJSON file. If the file exists and
        `redownload` is False, it will be reused.
    zipfile : ZipFile, optional
        A `zipfile.ZipFile` object from which to read the `to_file` if provided.
    redownload : bool, optional
        If True, forces redownload of the data even if `to_file` exists.
    **kwargs : dict
        Additional query parameters to include in the request.

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        A GeoDataFrame containing the parsed geospatial features, or None when the
        response contains no features (a warning is logged in that case).
    """
    # Three data sources, in order of precedence: a zip archive, a fresh
    # download, or a previously saved file on disk.
    if zipfile is not None:
        with zipfile.open(to_file) as f:
            json_data = json.load(f)
    elif redownload or to_file is None or not os.path.isfile(to_file):
        params = {"f": "json", "crs": crs, "limit": limit}
        if extent is not None:
            if isinstance(extent, (Polygon, MultiPolygon)):
                # shapely bounds order is (xmin, ymin, xmax, ymax)
                xmin, ymin, xmax, ymax = extent.bounds
            else:
                # list/tuple extents use the (xmin, xmax, ymin, ymax) convention
                xmin, xmax, ymin, ymax = extent
            bbox = f"{xmin},{ymin},{xmax},{ymax}"
            params["bbox-crs"] = crs
            params["bbox"] = bbox

        # Extra query parameters (e.g. attribute filters) are forwarded as-is.
        for key in kwargs:
            params[key] = kwargs[key]
        r = requests.get(url, params=params)

        if not r.ok:
            # NOTE(review): assumes the error response body is JSON with a
            # "detail" key — a non-JSON error page would raise here instead.
            detail = r.json()["detail"]
            raise Exception(f"Retrieving data from {url} failed: {detail}")
        if to_file is not None:
            # Cache the raw response so subsequent calls can skip the download.
            with open(to_file, "w") as f:
                f.write(r.text)
        json_data = r.json()
    else:
        # Reuse the previously downloaded file.
        with open(to_file) as f:
            json_data = json.load(f)
    if len(json_data["features"]) == 0:
        msg = "No data found"
        if extent is not None:
            msg = "%s for extent=%s" % (msg, extent)
        msg = "%s on %s" % (msg, url)
        logger.warning(msg)
        # Implicitly returns None when there is nothing to parse.
        return
    gdf = gpd.GeoDataFrame.from_features(json_data["features"], crs=crs)
    # Follow OGC API pagination: keep requesting the "next" link until absent.
    # NOTE(review): next pages are always fetched over the network, even when
    # the first page was read from a cached file — confirm this is intended.
    url = _get_next_url(json_data)
    if url is not None:
        gdfs = [gdf]
        while url is not None:
            r = requests.get(url)
            if not r.ok:
                raise Exception(f"Retrieving data from {url} failed")
            json_data = r.json()
            gdfs.append(gpd.GeoDataFrame.from_features(json_data["features"], crs=crs))
            url = _get_next_url(json_data)
        gdf = pd.concat(gdfs, ignore_index=True)
    if time_columns is None:
        # By convention, datetime columns in these collections end in "_time".
        time_columns = gdf.columns[gdf.columns.str.endswith("_time")]
    one_hour = pd.Timedelta(1, "hour")
    for column in time_columns:
        # transform date to dutch winter time (UTC+1, no DST)
        gdf[column] = (
            pd.to_datetime(gdf[column], utc=True).dt.tz_localize(None) + one_hour
        )
    # Filter results to polygon if polygon extent was provided (the request
    # itself only used the polygon's bounding box).
    if extent is not None and isinstance(extent, (Polygon, MultiPolygon)):
        gdf = gdf[gdf.intersects(extent)]
    return gdf
159def _get_next_url(json_data):
160 links = pd.DataFrame(json_data["links"])
161 next_mask = links["rel"] == "next"
162 if next_mask.any():
163 if next_mask.sum() > 1:
164 raise (ValueError("More than 1 'next' page"))
165 url = links.loc[next_mask, "href"].iloc[0]
166 return url
167 else:
168 return None
def gar_items(*args, **kwargs):
    """Retrieve groundwater analysis report (GAR) characteristics.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gar/items"
    return _gm_items(items_url, *args, **kwargs)
def gld_items(*args, tmin=None, tmax=None, **kwargs):
    """Retrieve groundwater level dossier (GLD) characteristics.

    Parameters
    ----------
    tmin : str or datetime, optional
        Only keep dossiers whose last research date is on or after `tmin`.
        The default is None (no filtering).
    tmax : str or datetime, optional
        Only keep dossiers whose first research date is on or before `tmax`.
        The default is None (no filtering).
    *args, **kwargs
        Passed on to ``_gm_items``; see that function for details.

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        The (optionally filtered) dossier characteristics, or None when no
        data was found.
    """
    url = f"{base_url}/collections/gm_gld/items"
    gdf = _gm_items(url, *args, **kwargs)
    # Bug fix: tmin/tmax used to be accepted but silently ignored. Apply the
    # same research-period overlap filters that get_data_in_extent uses.
    if gdf is not None:
        if tmin is not None:
            gdf = gdf[gdf["research_last_date"] >= tmin]
        if tmax is not None:
            gdf = gdf[gdf["research_first_date"] <= tmax]
    return gdf
def gmn_items(*args, **kwargs):
    """Retrieve groundwater monitoring network (GMN) characteristics.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gmn/items"
    return _gm_items(items_url, *args, **kwargs)
def gmn_measuringpoint_items(*args, **kwargs):
    """Retrieve measuring-point characteristics of monitoring networks.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gmn_measuringpoint/items"
    return _gm_items(items_url, *args, **kwargs)
def gmn_reference_items(*args, **kwargs):
    """Retrieve reference characteristics of monitoring networks.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gmn_reference/items"
    return _gm_items(items_url, *args, **kwargs)
def gmw_items(*args, **kwargs):
    """Retrieve groundwater monitoring well (GMW) characteristics.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gmw/items"
    return _gm_items(items_url, *args, **kwargs)
def gmw_monitoringtube_items(*args, **kwargs):
    """Retrieve monitoring-tube characteristics of groundwater wells.

    All positional and keyword arguments are passed on to ``_gm_items``;
    see that function for the supported parameters.
    """
    items_url = f"{base_url}/collections/gm_gmw_monitoringtube/items"
    return _gm_items(items_url, *args, **kwargs)
def get_data_in_extent(
    extent,
    kind="gld",
    tmin=None,
    tmax=None,
    silent=False,
    combine=True,
    index=None,
    as_csv=False,
    status=None,
    observation_type=None,
    qualifier=None,
    to_path=None,
    to_zip=None,
    redownload=False,
    continue_on_error=False,
    sort=True,
    drop_duplicates=True,
    progress_callback=None,
):
    """
    Retrieve metadata and observations within a specified spatial extent.

    This function fetches monitoring well characteristics and groundwater observations
    within the given spatial extent. It can combine the data for specific observation
    types and return either individual dataframes or a combined dataframe.

    Parameters
    ----------
    extent : object
        The spatial extent ([xmin, xmax, ymin, ymax]) to filter the data. When a
        string is passed, it is interpreted as the path of a previously saved
        zip-file to read all data from.
    kind : str, optional
        The type of observations to retrieve. Valid values are {'gld', 'gar'} for
        groundwater level dossier or groundwater analysis report. When kind is None, no
        observations are downloaded. Defaults to 'gld'.
    tmin : str or datetime, optional
        The minimum time for filtering observations. Defaults to None.
    tmax : str or datetime, optional
        The maximum time for filtering observations. Defaults to None.
    silent : bool, optional
        If True, suppresses progress logging. Defaults to False.
    combine : bool, optional
        If True, combines the tube properties, and observations into a single
        dataframe. Defaults to True.
    index : str, optional
        The column to use for indexing in the resulting dataframe. If None, the index
        will be set to a MultiIndex of the columns "gmw_bro_id" and "tube_number".
        Defaults to None.
    as_csv : bool, optional
        If True, the measurement data is requested as CSV files instead of XML files
        (only supported for 'gld'). Defaults to False.
    status : str, optional
        A status string for additional filtering. Possible values are
        "volledigBeoordeeld", "voorlopig" and "onbekend" Only valid if `kind` is 'gld'.
        Defaults to None.
    observation_type : str, optional
        An observation type string for additional filtering. Possible values are
        "reguliereMeting" and "controleMeting". Only valid if `kind` is 'gld'. Defaults
        to None.
    qualifier : str or list of str, optional
        A string or list of strings used to filter the observations. Only valid if
        `kind` is 'gld'. Defaults to None.
    to_path : str, optional
        If not None, save the downloaded files in the directory named to_path. The
        default is None.
    to_zip : str, optional
        If not None, save the downloaded files in a zip-file named to_zip. The default
        is None.
    redownload : bool, optional
        When downloaded files exist in to_path or to_zip, read from these files when
        redownload is False. If redownload is True, download the data again from the
        BRO-servers. The default is False.
    continue_on_error : bool, optional
        If True, continue after an error occurs during downloading or processing of
        individual observation data. Defaults to False.
    sort : bool, optional
        If True, sort the observations. Only used if `kind` is 'gld'. Defaults to True.
    drop_duplicates : bool, optional
        If True, drop duplicate observations based on their timestamp. Only used if
        `kind` is 'gld'. Defaults to True.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.

    Returns
    -------
    gdf : pd.DataFrame
        A dataframe containing tube properties and metadata within the specified extent.
    obs_df : pd.DataFrame, optional
        A dataframe containing the observations for the specified wells. Returned only if
        `combine` is False.

    Raises
    ------
    FileNotFoundError
        When `extent` is a string and the referenced zip-file does not exist.
    ValueError
        When `kind` is not one of the supported values.
    """
    if isinstance(extent, str):
        # A string extent is interpreted as the path of a previously saved
        # zip-file: read everything from that file instead of downloading.
        if to_zip is not None:
            raise Exception("When extent is a string, do not supply to_zip")
        to_zip = extent
        if not os.path.isfile(to_zip):
            # Bug fix: this used to raise FileExistsError, which signals the
            # opposite condition (file already present). The file is missing.
            raise FileNotFoundError(f"The file {to_zip} is not present")
        extent = None
        redownload = False

    zipfile = None
    _files = None
    if to_zip is not None:
        if not redownload and os.path.isfile(to_zip):
            logger.info("Reading data from %s", to_zip)
            zipfile = ZipFile(to_zip)
        else:
            # Download first into a directory; it is zipped (and possibly
            # removed again) at the end.
            if to_path is None:
                to_path = os.path.splitext(to_zip)[0]
            remove_path_again = not os.path.isdir(to_path)
            _files = []

    if to_path is not None and not os.path.isdir(to_path):
        os.makedirs(to_path)

    to_file = util._get_to_file("gm_gmw_monitoringtube.json", zipfile, to_path, _files)
    tubes = gmw_monitoringtube_items(
        extent, to_file=to_file, redownload=redownload, zipfile=zipfile
    )

    if index is None:
        index = ["gmw_bro_id", "tube_number"]
    tubes = tubes.set_index(index)

    if kind is None:
        # Only the tube properties were requested.
        return tubes

    if kind == "gar":
        to_file = util._get_to_file("gm_gar.json", zipfile, to_path, _files)
        meas_gdf = gar_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] >= tmin]
        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] <= tmax]
        meas_cl = gar.GroundwaterAnalysisReport
    elif kind == "gld":
        to_file = util._get_to_file("gm_gld.json", zipfile, to_path, _files)
        meas_gdf = gld_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        # Keep dossiers whose research period overlaps [tmin, tmax].
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["research_last_date"] >= tmin]
        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["research_first_date"] <= tmax]
        meas_cl = gld.GroundwaterLevelDossier
    else:
        raise ValueError(f"kind='{kind}' not supported")

    gld_kwargs = gmw._get_gld_kwargs(
        kind, tmin, tmax, qualifier, status, observation_type, sort, drop_duplicates
    )

    meas_gdf = meas_gdf.set_index("bro_id")
    measurement_objects = []
    if zipfile is None:
        desc = f"Downloading {kind}-observations"
    else:
        desc = f"Reading {kind}-observations from {to_zip}"
    # Validate option combinations before starting the (slow) download loop.
    if as_csv and kind != "gld":
        raise Exception("as_csv=True is only supported for kind=='gld'")
    if qualifier is not None and kind != "gld":
        raise Exception("A qualifier is only supported for kind=='gld'")
    datcol = gmw._get_data_column(kind)
    for bro_id in util.tqdm(meas_gdf.index, disable=silent, desc=desc):
        obsdata = gmw._download_observations_for_bro_id(
            bro_id,
            meas_cl,
            as_csv,
            zipfile,
            to_path,
            _files,
            gld_kwargs,
            redownload=redownload,
            continue_on_error=continue_on_error,
        )

        if as_csv:
            meas_dict = {"broId": bro_id, datcol: obsdata}
        else:
            meas_dict = obsdata.to_dict()
        # Remember which monitoring tube each observation belongs to.
        meas_dict["gm_gmw_monitoringtube_fk"] = meas_gdf.at[
            bro_id, "gm_gmw_monitoringtube_fk"
        ]
        measurement_objects.append(meas_dict)

        if progress_callback is not None:
            progress_callback(len(measurement_objects), len(meas_gdf.index))
    obs_df = pd.DataFrame(measurement_objects)

    if zipfile is not None:
        zipfile.close()
    if zipfile is None and to_zip is not None:
        util._save_data_to_zip(to_zip, _files, remove_path_again, to_path)

    # only keep tubes with active measurements
    mask = tubes["gm_gmw_monitoringtube_pk"].isin(meas_gdf["gm_gmw_monitoringtube_fk"])
    tubes = tubes[mask]

    if combine and kind in ["gld", "gar"]:
        logger.info("Adding observations to tube-properties")

        if kind == "gld":
            idcol = "groundwaterLevelDossier"
        elif kind == "gar":
            idcol = "groundwaterAnalysisReport"

        data = {}
        ids = {}
        # Bug fix: the loop variable used to shadow the `index` parameter.
        for tube_index in tubes.index:
            mask = (
                obs_df["gm_gmw_monitoringtube_fk"]
                == tubes.at[tube_index, "gm_gmw_monitoringtube_pk"]
            )
            data[tube_index] = gmw._combine_observations(
                obs_df.loc[mask, datcol], kind=kind
            )
            ids[tube_index] = list(obs_df.loc[mask, "broId"])
        tubes[datcol] = data
        tubes[idcol] = ids
        return tubes
    else:
        return tubes, obs_df
def get_kenset_geopackage(to_file=None, layer=None, redownload=False, index="bro_id"):
    """
    Download or read data from a geopackage-file for the whole of the Netherlands.

    Parameters
    ----------
    to_file : str, optional
        Path to save the downloaded GeoPackage file (with the extension `.gpkg`). If the
        file exists and `redownload` is False, it will be reused. The default is None.
    layer : str, optional
        The layer within the geopackage. Possible values are 'gm_gmw',
        'gm_gmw_monitoringtube', 'gm_gld', 'gm_gar', 'gm_gmn', 'gm_gmn_measuringpoint'
        and 'gm_gmn_reference'. The default is None, which read data from the layer
        "gm_gmw".
    redownload : bool, optional
        If True, forces redownload of the data even if `to_file` exists. The default is
        False.
    index : str, optional
        The column to use for indexing in the resulting GeoDataFrame. The default is
        "bro_id".

    Returns
    -------
    gdf : gpd.GeoDataFrame
        A GeoDataFrame containing the resulting objects.
    """
    source = "https://service.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/atom/downloads/brogmkenset.gpkg"
    if to_file is not None:
        # Download (or re-download) the geopackage to disk and read from there.
        if redownload or not os.path.isfile(to_file):
            urllib.request.urlretrieve(source, to_file)
        source = to_file
    gdf = gpd.read_file(source, layer=layer)
    return gdf.set_index(index) if index in gdf.columns else gdf