Coverage for brodata / gm.py: 66%

217 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-20 14:37 +0000

1import os 

2from zipfile import ZipFile 

3import logging 

4import requests 

5import urllib.request 

6import json 

7import pandas as pd 

8import geopandas as gpd 

9from shapely.geometry import MultiPolygon, Polygon 

10from . import gmw, gld, gar, util 

11 

12 

# Base URL of the PDOK "BRO grondwatermonitoring in samenhang - karakteristieken"
# OGC API; all item/collection endpoints below are built on top of this.
base_url = "https://api.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/ogc/v1"

# Module-level logger, named after this module per stdlib logging convention.
logger = logging.getLogger(__name__)

16 

17 

def conformance():
    """Return the conformance document of the OGC API as a dict.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/conformance"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

24 

25 

def collections():
    """Return the available collections of the OGC API as a dict.

    NOTE: this function shadows the stdlib ``collections`` module name
    within this module; the public name is kept for API compatibility.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/collections"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

32 

33 

def gm_gld_collection():
    """Return the description of the ``gm_gld`` collection as a dict.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/collections/gm_gld"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

40 

41 

def _gm_items(
    url,
    extent=None,
    crs="http://www.opengis.net/def/crs/EPSG/0/28992",
    limit=1000,
    time_columns=None,
    to_file=None,
    zipfile=None,
    redownload=False,
    **kwargs,
):
    """
    Fetches and parses geospatial features from a GeoJSON endpoint, with optional
    filtering, pagination support, and time column localization.

    Retrieves data from a remote URL, a local file, or within a zip archive. Supports
    bounding box filtering, CRS specification, and conversion of datetime columns to
    Dutch winter time (UTC+1).

    Parameters
    ----------
    url : str
        The base URL to request the GeoJSON data from.
    extent : list, tuple, shapely.geometry.Polygon or shapely.geometry.MultiPolygon, optional
        The spatial extent ([xmin, xmax, ymin, ymax]) or polygon geometry to filter the data.
        When a polygon is provided, its bounding box is used for the spatial query.
    crs : string, optional
        The coordinate reference system of the requested extent and the geometries in
        the response. Possible values are:
            http://www.opengis.net/def/crs/OGC/1.3/CRS84
            http://www.opengis.net/def/crs/EPSG/0/28992
            http://www.opengis.net/def/crs/EPSG/0/3857
            http://www.opengis.net/def/crs/EPSG/0/4258
        The default is "http://www.opengis.net/def/crs/EPSG/0/28992".
    limit : int, optional
        Limits the number of items that are presented in the response document. The
        maximum allowed value is 1000. The default is 1000.
    time_columns : list of str, optional
        Names of columns containing datetime values to convert to Dutch winter time.
        If None, columns ending with '_time' are automatically selected.
    to_file : str, optional
        Path to save the downloaded GeoJSON file. If the file exists and
        `redownload` is False, it will be reused.
    zipfile : ZipFile, optional
        A `zipfile.ZipFile` object from which to read the `to_file` if provided.
    redownload : bool, optional
        If True, forces redownload of the data even if `to_file` exists.
    **kwargs : dict
        Additional query parameters to include in the request.

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        A GeoDataFrame containing the parsed geospatial features, or None when the
        response contains no features (a warning is logged in that case).
    """
    if zipfile is not None:
        # Read a previously downloaded response from inside the zip archive.
        with zipfile.open(to_file) as f:
            json_data = json.load(f)
    elif redownload or to_file is None or not os.path.isfile(to_file):
        params = {"f": "json", "crs": crs, "limit": limit}
        if extent is not None:
            if isinstance(extent, (Polygon, MultiPolygon)):
                # shapely bounds order is (xmin, ymin, xmax, ymax)
                xmin, ymin, xmax, ymax = extent.bounds
            else:
                # list/tuple extent uses the [xmin, xmax, ymin, ymax] convention
                xmin, xmax, ymin, ymax = extent
            bbox = f"{xmin},{ymin},{xmax},{ymax}"
            params["bbox-crs"] = crs
            params["bbox"] = bbox

        # Merge any extra query parameters supplied by the caller.
        params.update(kwargs)
        r = requests.get(url, params=params)

        if not r.ok:
            # Error responses are not guaranteed to be JSON with a "detail" key;
            # fall back to the raw response text so the original error is not lost.
            try:
                detail = r.json()["detail"]
            except Exception:
                detail = r.text
            raise Exception(f"Retrieving data from {url} failed: {detail}")
        if to_file is not None:
            # Cache the raw response so later calls can reuse it.
            with open(to_file, "w") as f:
                f.write(r.text)
        json_data = r.json()
    else:
        # Reuse the previously downloaded file on disk.
        with open(to_file) as f:
            json_data = json.load(f)
    if len(json_data["features"]) == 0:
        msg = "No data found"
        if extent is not None:
            msg = "%s for extent=%s" % (msg, extent)
        msg = "%s on %s" % (msg, url)
        logger.warning(msg)
        return
    gdf = gpd.GeoDataFrame.from_features(json_data["features"], crs=crs)
    # Follow 'next' links until all pages have been retrieved.
    url = _get_next_url(json_data)
    if url is not None:
        gdfs = [gdf]
        while url is not None:
            r = requests.get(url)
            if not r.ok:
                raise Exception(f"Retrieving data from {url} failed")
            json_data = r.json()
            gdfs.append(gpd.GeoDataFrame.from_features(json_data["features"], crs=crs))
            url = _get_next_url(json_data)
        gdf = pd.concat(gdfs, ignore_index=True)
    if time_columns is None:
        time_columns = gdf.columns[gdf.columns.str.endswith("_time")]
    one_hour = pd.Timedelta(1, "hour")
    for column in time_columns:
        # transform date to dutch winter time (UTC+1, naive timestamps)
        gdf[column] = (
            pd.to_datetime(gdf[column], utc=True).dt.tz_localize(None) + one_hour
        )
    # Filter results to polygon if polygon extent was provided (the request
    # itself only filtered on the polygon's bounding box).
    if extent is not None and isinstance(extent, (Polygon, MultiPolygon)):
        gdf = gdf[gdf.intersects(extent)]
    return gdf

157 

158 

159def _get_next_url(json_data): 

160 links = pd.DataFrame(json_data["links"]) 

161 next_mask = links["rel"] == "next" 

162 if next_mask.any(): 

163 if next_mask.sum() > 1: 

164 raise (ValueError("More than 1 'next' page")) 

165 url = links.loc[next_mask, "href"].iloc[0] 

166 return url 

167 else: 

168 return None 

169 

170 

def gar_items(*args, **kwargs):
    """Retrieve groundwater analysis report (gar) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gar/items", *args, **kwargs)

176 

177 

def gld_items(*args, tmin=None, tmax=None, **kwargs):
    """Retrieve groundwater level dossier (gld) items, optionally filtered in time.

    All positional and extra keyword arguments are forwarded to
    :func:`_gm_items`; see its docstring for the supported parameters.

    Parameters
    ----------
    tmin : str or datetime, optional
        Keep only dossiers whose last research date is at or after tmin.
        Previously this parameter was accepted but silently ignored.
        The default is None (no filtering).
    tmax : str or datetime, optional
        Keep only dossiers whose first research date is at or before tmax.
        Previously this parameter was accepted but silently ignored.
        The default is None (no filtering).

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        The (filtered) gld items, or None when no data was found.
    """
    url = f"{base_url}/collections/gm_gld/items"

    gdf = _gm_items(url, *args, **kwargs)

    # _gm_items returns None when no features were found; only filter real data.
    # The column choice matches the gld filtering in get_data_in_extent: a
    # dossier overlaps [tmin, tmax] when its research period intersects it.
    if gdf is not None:
        if tmin is not None:
            gdf = gdf[gdf["research_last_date"] >= tmin]
        if tmax is not None:
            gdf = gdf[gdf["research_first_date"] <= tmax]
    return gdf

184 

185 

def gmn_items(*args, **kwargs):
    """Retrieve groundwater monitoring network (gmn) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gmn/items", *args, **kwargs)

192 

193 

def gmn_measuringpoint_items(*args, **kwargs):
    """Retrieve gmn measuring point items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmn_measuringpoint/items"
    return _gm_items(endpoint, *args, **kwargs)

200 

201 

def gmn_reference_items(*args, **kwargs):
    """Retrieve gmn reference items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmn_reference/items"
    return _gm_items(endpoint, *args, **kwargs)

208 

209 

def gmw_items(*args, **kwargs):
    """Retrieve groundwater monitoring well (gmw) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gmw/items", *args, **kwargs)

216 

217 

def gmw_monitoringtube_items(*args, **kwargs):
    """Retrieve gmw monitoring tube items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmw_monitoringtube/items"
    return _gm_items(endpoint, *args, **kwargs)

224 

225 

def get_data_in_extent(
    extent,
    kind="gld",
    tmin=None,
    tmax=None,
    silent=False,
    combine=True,
    index=None,
    as_csv=False,
    status=None,
    observation_type=None,
    qualifier=None,
    to_path=None,
    to_zip=None,
    redownload=False,
    continue_on_error=False,
    sort=True,
    drop_duplicates=True,
    progress_callback=None,
):
    """
    Retrieve metadata and observations within a specified spatial extent.

    This function fetches monitoring well characteristics and groundwater observations
    within the given spatial extent. It can combine the data for specific observation
    types and return either individual dataframes or a combined dataframe.

    Parameters
    ----------
    extent : object
        The spatial extent ([xmin, xmax, ymin, ymax]) to filter the data. When a
        string is supplied, it is interpreted as the path of a previously saved
        zip-file to read the data from (see `to_zip`).
    kind : str, optional
        The type of observations to retrieve. Valid values are {'gld', 'gar'} for
        groundwater level dossier or groundwater analysis report. When kind is None, no
        observations are downloaded. Defaults to 'gld'.
    tmin : str or datetime, optional
        The minimum time for filtering observations. Defaults to None.
    tmax : str or datetime, optional
        The maximum time for filtering observations. Defaults to None.
    silent : bool, optional
        If True, suppresses progress logging. Defaults to False.
    combine : bool, optional
        If True, combines the tube properties, and observations into a single
        dataframe. Defaults to True.
    index : str, optional
        The column to use for indexing in the resulting dataframe. If None, the index
        will be set to a MultiIndex of the columns "gmw_bro_id" and "tube_number".
        Defaults to None.
    as_csv : bool, optional
        If True, the measurement data is requested as CSV files instead of XML files
        (only supported for 'gld'). Defaults to False.
    status : str, optional
        A status string for additional filtering. Possible values are
        "volledigBeoordeeld", "voorlopig" and "onbekend" Only valid if `kind` is 'gld'.
        Defaults to None.
    observation_type : str, optional
        An observation type string for additional filtering. Possible values are
        "reguliereMeting" and "controleMeting". Only valid if `kind` is 'gld'. Defaults
        to None.
    qualifier : str or list of str, optional
        A string or list of strings used to filter the observations. Only valid if
        `kind` is 'gld'. Defaults to None.
    to_path : str, optional
        If not None, save the downloaded files in the directory named to_path. The
        default is None.
    to_zip : str, optional
        If not None, save the downloaded files in a zip-file named to_zip. The default
        is None.
    redownload : bool, optional
        When downloaded files exist in to_path or to_zip, read from these files when
        redownload is False. If redownload is True, download the data again from the
        BRO-servers. The default is False.
    continue_on_error : bool, optional
        If True, continue after an error occurs during downloading or processing of
        individual observation data. Defaults to False.
    sort : bool, optional
        If True, sort the observations. Only used if `kind` is 'gld'. Defaults to True.
    drop_duplicates : bool, optional
        If True, drop duplicate observations based on their timestamp. Only used if
        `kind` is 'gld'. Defaults to True.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.

    Returns
    -------
    gdf : pd.DataFrame
        A dataframe containing tube properties and metadata within the specified extent.

    obs_df : pd.DataFrame, optional
        A dataframe containing the observations for the specified wells. Returned only if
        `combine` is False.
    """

    # A string extent means: read everything from an existing zip-file instead
    # of querying the BRO servers.
    if isinstance(extent, str):
        if to_zip is not None:
            raise (Exception("When extent is a string, do not supply to_zip"))
        to_zip = extent
        if not os.path.isfile(to_zip):
            raise (FileExistsError(f"The file {to_zip} is not present"))
        extent = None
        redownload = False

    # zipfile is the open archive to read from; _files collects the paths of
    # freshly downloaded files so they can be zipped afterwards.
    zipfile = None
    _files = None
    if to_zip is not None:
        if not redownload and os.path.isfile(to_zip):
            logger.info("Reading data from %s", to_zip)
            zipfile = ZipFile(to_zip)
        else:
            # Download into a directory derived from the zip name, then zip it
            # at the end. Remember whether we created the directory ourselves,
            # so it can be removed again after zipping.
            if to_path is None:
                to_path = os.path.splitext(to_zip)[0]
            remove_path_again = not os.path.isdir(to_path)
            _files = []

    if to_path is not None and not os.path.isdir(to_path):
        os.makedirs(to_path)

    # Monitoring tube properties within the extent (from server, path or zip).
    to_file = util._get_to_file("gm_gmw_monitoringtube.json", zipfile, to_path, _files)
    tubes = gmw_monitoringtube_items(
        extent, to_file=to_file, redownload=redownload, zipfile=zipfile
    )

    if index is None:
        index = ["gmw_bro_id", "tube_number"]
    tubes = tubes.set_index(index)

    # Without a kind there are no observations to download: return metadata only.
    if kind is None:
        return tubes

    # Fetch the measurement characteristics for the requested kind and apply
    # the time filters. meas_cl is the class used to parse individual dossiers.
    if kind == "gar":
        to_file = util._get_to_file("gm_gar.json", zipfile, to_path, _files)
        meas_gdf = gar_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] >= tmin]

        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] <= tmax]
        meas_cl = gar.GroundwaterAnalysisReport
    elif kind == "gld":
        to_file = util._get_to_file("gm_gld.json", zipfile, to_path, _files)
        meas_gdf = gld_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        # Keep dossiers whose research period overlaps [tmin, tmax].
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["research_last_date"] >= tmin]

        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["research_first_date"] <= tmax]
        meas_cl = gld.GroundwaterLevelDossier
    else:
        raise (ValueError(f"kind='{kind}' not supported"))

    gld_kwargs = gmw._get_gld_kwargs(
        kind, tmin, tmax, qualifier, status, observation_type, sort, drop_duplicates
    )

    meas_gdf = meas_gdf.set_index("bro_id")
    measurement_objects = []
    if zipfile is None:
        desc = f"Downloading {kind}-observations"
    else:
        desc = f"Reading {kind}-observations from {to_zip}"
    if as_csv and kind != "gld":
        raise (Exception("as_csv=True is only supported for kind=='gld'"))
    if qualifier is not None and kind != "gld":
        raise (Exception("A qualifier is only supported for kind=='gld'"))
    # Name of the column that holds the measurement data for this kind.
    datcol = gmw._get_data_column(kind)
    # Download (or read from zip) the observations of every dossier.
    for bro_id in util.tqdm(meas_gdf.index, disable=silent, desc=desc):
        obsdata = gmw._download_observations_for_bro_id(
            bro_id,
            meas_cl,
            as_csv,
            zipfile,
            to_path,
            _files,
            gld_kwargs,
            redownload=redownload,
            continue_on_error=continue_on_error,
        )

        if as_csv:
            meas_dict = {"broId": bro_id, datcol: obsdata}
        else:
            meas_dict = obsdata.to_dict()
        # The foreign key links each observation record back to its tube.
        meas_dict["gm_gmw_monitoringtube_fk"] = meas_gdf.at[
            bro_id, "gm_gmw_monitoringtube_fk"
        ]
        measurement_objects.append(meas_dict)

        if progress_callback is not None:
            progress_callback(len(measurement_objects), len(meas_gdf.index))
    obs_df = pd.DataFrame(measurement_objects)

    if zipfile is not None:
        zipfile.close()
    # When data was freshly downloaded and a zip was requested, archive the
    # downloaded files (and possibly remove the temporary directory again).
    if zipfile is None and to_zip is not None:
        util._save_data_to_zip(to_zip, _files, remove_path_again, to_path)

    # only keep tubes with active measurements
    mask = tubes["gm_gmw_monitoringtube_pk"].isin(meas_gdf["gm_gmw_monitoringtube_fk"])
    tubes = tubes[mask]

    if combine and kind in ["gld", "gar"]:
        logger.info("Adding observations to tube-properties")

        if kind == "gld":
            idcol = "groundwaterLevelDossier"
        elif kind == "gar":
            idcol = "groundwaterAnalysisReport"

        # Combine per tube: all observations whose foreign key matches the
        # tube's primary key, plus the list of contributing BRO-ids.
        # NOTE(review): the loop variable below shadows the `index` parameter,
        # which has already been consumed by set_index above — harmless, but
        # worth renaming in a future refactor.
        data = {}
        ids = {}
        for index in tubes.index:
            mask = (
                obs_df["gm_gmw_monitoringtube_fk"]
                == tubes.at[index, "gm_gmw_monitoringtube_pk"]
            )
            data[index] = gmw._combine_observations(obs_df.loc[mask, datcol], kind=kind)
            ids[index] = list(obs_df.loc[mask, "broId"])
        tubes[datcol] = data
        tubes[idcol] = ids
        return tubes
    else:
        return tubes, obs_df

453 

454 

455def get_kenset_geopackage(to_file=None, layer=None, redownload=False, index="bro_id"): 

456 """ 

457 Download or read data from a geopackage-file for the whole of the Netherlands. 

458 

459 Parameters 

460 ---------- 

461 to_file : str, optional 

462 Path to save the downloaded GeoPackage file (with the extension `.gpkg`). If the 

463 file exists and `redownload` is False, it will be reused. The default is None. 

464 layer : str, optional 

465 The layer within the geopackage. Possible values are 'gm_gmw', 

466 'gm_gmw_monitoringtube', 'gm_gld', 'gm_gar', 'gm_gmn', 'gm_gmn_measuringpoint' 

467 and 'gm_gmn_reference'. The default is None, which read data from the layer 

468 "gm_gmw". 

469 redownload : bool, optional 

470 If True, forces redownload of the data even if `to_file` exists. The default is 

471 False. 

472 index : str, optional 

473 The column to use for indexing in the resulting GeoDataFrame. The default is 

474 "bro_id". 

475 

476 Returns 

477 ------- 

478 gdf : gpd.GeoDataFrame 

479 A GeoDataFrame containing the resulting objects. 

480 

481 """ 

482 url = "https://service.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/atom/downloads/brogmkenset.gpkg" 

483 if to_file is not None: 

484 if redownload or not os.path.isfile(to_file): 

485 urllib.request.urlretrieve(url, to_file) 

486 url = to_file 

487 gdf = gpd.read_file(url, layer=layer) 

488 if index in gdf.columns: 

489 gdf = gdf.set_index(index) 

490 return gdf