Coverage for brodata / gm.py: 66%

217 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-20 14:37 +0000

1import os 

2from zipfile import ZipFile 

3import logging 

4import requests 

5import urllib.request 

6import json 

7import pandas as pd 

8import geopandas as gpd 

9from shapely.geometry import MultiPolygon, Polygon 

10from . import gmw, gld, gar, util 

11 

12 

# Base URL of the PDOK "BRO grondwatermonitoring in samenhang - karakteristieken"
# OGC API; all item/collection endpoints below are built on top of this.
base_url = "https://api.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/ogc/v1"

# Module-level logger, named after this module per stdlib logging convention.
logger = logging.getLogger(__name__)

16 

17 

def conformance():
    """Return the conformance document of the OGC API as a dict.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/conformance"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

24 

25 

def collections():
    """Return the available collections of the OGC API as a dict.

    NOTE: this function shadows the stdlib ``collections`` module name
    within this module; the public name is kept for API compatibility.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/collections"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

32 

33 

def gm_gld_collection():
    """Return the description of the ``gm_gld`` collection as a dict.

    Raises
    ------
    Exception
        When the server does not return a successful response.
    """
    endpoint = f"{base_url}/collections/gm_gld"
    response = requests.get(endpoint, params={"f": "json"})
    if response.ok:
        return response.json()
    raise Exception(f"Retrieving data from {endpoint} failed")

40 

41 

def _gm_items(
    url,
    extent=None,
    crs="http://www.opengis.net/def/crs/EPSG/0/28992",
    limit=1000,
    time_columns=None,
    to_file=None,
    zipfile=None,
    redownload=False,
    **kwargs,
):
    """
    Fetches and parses geospatial features from a GeoJSON endpoint, with optional
    filtering, pagination support, and time column localization.

    Retrieves data from a remote URL, a local file, or within a zip archive. Supports
    bounding box filtering, CRS specification, and conversion of datetime columns to
    Dutch winter time (UTC+1).

    Parameters
    ----------
    url : str
        The base URL to request the GeoJSON data from.
    extent : list, tuple, shapely.geometry.Polygon or shapely.geometry.MultiPolygon, optional
        The spatial extent ([xmin, xmax, ymin, ymax]) or polygon geometry to filter the data.
        When a polygon is provided, its bounding box is used for the spatial query.
    crs : string, optional
        The coordinate reference system of the requested extent and the geometries in
        the response. Possible values are:
            http://www.opengis.net/def/crs/OGC/1.3/CRS84
            http://www.opengis.net/def/crs/EPSG/0/28992
            http://www.opengis.net/def/crs/EPSG/0/3857
            http://www.opengis.net/def/crs/EPSG/0/4258
        The default is "http://www.opengis.net/def/crs/EPSG/0/28992".
    limit : int, optional
        Limits the number of items that are presented in the response document. The
        maximum allowed value is 1000. The default is 1000.
    time_columns : list of str, optional
        Names of columns containing datetime values to convert to Dutch winter time.
        If None, columns ending with '_time' are automatically selected.
    to_file : str, optional
        Path to save the downloaded GeoJSON file. If the file exists and
        `redownload` is False, it will be reused.
    zipfile : ZipFile, optional
        A `zipfile.ZipFile` object from which to read the `to_file` if provided.
    redownload : bool, optional
        If True, forces redownload of the data even if `to_file` exists.
    **kwargs : dict
        Additional query parameters to include in the request.

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        A GeoDataFrame containing the parsed geospatial features, or None when the
        response contains no features (a warning is logged in that case).
    """
    if zipfile is not None:
        # Read a previously downloaded response from inside the zip archive.
        with zipfile.open(to_file) as f:
            json_data = json.load(f)
    elif redownload or to_file is None or not os.path.isfile(to_file):
        params = {"f": "json", "crs": crs, "limit": limit}
        if extent is not None:
            if isinstance(extent, (Polygon, MultiPolygon)):
                # shapely bounds order is (xmin, ymin, xmax, ymax)
                xmin, ymin, xmax, ymax = extent.bounds
            else:
                # list/tuple extent uses the [xmin, xmax, ymin, ymax] convention
                xmin, xmax, ymin, ymax = extent
            bbox = f"{xmin},{ymin},{xmax},{ymax}"
            params["bbox-crs"] = crs
            params["bbox"] = bbox

        # Merge any extra query parameters supplied by the caller.
        params.update(kwargs)
        r = requests.get(url, params=params)

        if not r.ok:
            # Error responses are not guaranteed to be JSON with a "detail" key;
            # fall back to the raw response text so the original error is not lost.
            try:
                detail = r.json()["detail"]
            except Exception:
                detail = r.text
            raise Exception(f"Retrieving data from {url} failed: {detail}")
        if to_file is not None:
            # Cache the raw response so later calls can reuse it.
            with open(to_file, "w") as f:
                f.write(r.text)
        json_data = r.json()
    else:
        # Reuse the previously downloaded file on disk.
        with open(to_file) as f:
            json_data = json.load(f)
    if len(json_data["features"]) == 0:
        msg = "No data found"
        if extent is not None:
            msg = "%s for extent=%s" % (msg, extent)
        msg = "%s on %s" % (msg, url)
        logger.warning(msg)
        return
    gdf = gpd.GeoDataFrame.from_features(json_data["features"], crs=crs)
    # Follow 'next' links until all pages have been retrieved.
    url = _get_next_url(json_data)
    if url is not None:
        gdfs = [gdf]
        while url is not None:
            r = requests.get(url)
            if not r.ok:
                raise Exception(f"Retrieving data from {url} failed")
            json_data = r.json()
            gdfs.append(gpd.GeoDataFrame.from_features(json_data["features"], crs=crs))
            url = _get_next_url(json_data)
        gdf = pd.concat(gdfs, ignore_index=True)
    if time_columns is None:
        time_columns = gdf.columns[gdf.columns.str.endswith("_time")]
    one_hour = pd.Timedelta(1, "hour")
    for column in time_columns:
        # transform date to dutch winter time (UTC+1, naive timestamps)
        gdf[column] = (
            pd.to_datetime(gdf[column], utc=True).dt.tz_localize(None) + one_hour
        )
    # Filter results to polygon if polygon extent was provided (the request
    # itself only filtered on the polygon's bounding box).
    if extent is not None and isinstance(extent, (Polygon, MultiPolygon)):
        gdf = gdf[gdf.intersects(extent)]
    return gdf

157 

158 

159def _get_next_url(json_data): 

160 links = pd.DataFrame(json_data["links"]) 

161 next_mask = links["rel"] == "next" 

162 if next_mask.any(): 

163 if next_mask.sum() > 1: 

164 raise (ValueError("More than 1 'next' page")) 

165 url = links.loc[next_mask, "href"].iloc[0] 

166 return url 

167 else: 

168 return None 

169 

170 

def gar_items(*args, **kwargs):
    """Retrieve groundwater analysis report (gar) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gar/items", *args, **kwargs)

176 

177 

def gld_items(*args, tmin=None, tmax=None, **kwargs):
    """Retrieve groundwater level dossier (gld) items, optionally filtered in time.

    All positional and extra keyword arguments are forwarded to
    :func:`_gm_items`; see its docstring for the supported parameters.

    Parameters
    ----------
    tmin : str or datetime, optional
        Keep only dossiers whose last research date is at or after tmin.
        Previously this parameter was accepted but silently ignored.
        The default is None (no filtering).
    tmax : str or datetime, optional
        Keep only dossiers whose first research date is at or before tmax.
        Previously this parameter was accepted but silently ignored.
        The default is None (no filtering).

    Returns
    -------
    gdf : geopandas.GeoDataFrame or None
        The (filtered) gld items, or None when no data was found.
    """
    url = f"{base_url}/collections/gm_gld/items"

    gdf = _gm_items(url, *args, **kwargs)

    # _gm_items returns None when no features were found; only filter real data.
    # The column choice matches the gld filtering in get_data_in_extent: a
    # dossier overlaps [tmin, tmax] when its research period intersects it.
    if gdf is not None:
        if tmin is not None:
            gdf = gdf[gdf["research_last_date"] >= tmin]
        if tmax is not None:
            gdf = gdf[gdf["research_first_date"] <= tmax]
    return gdf

184 

185 

def gmn_items(*args, **kwargs):
    """Retrieve groundwater monitoring network (gmn) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gmn/items", *args, **kwargs)

192 

193 

def gmn_measuringpoint_items(*args, **kwargs):
    """Retrieve gmn measuring point items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmn_measuringpoint/items"
    return _gm_items(endpoint, *args, **kwargs)

200 

201 

def gmn_reference_items(*args, **kwargs):
    """Retrieve gmn reference items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmn_reference/items"
    return _gm_items(endpoint, *args, **kwargs)

208 

209 

def gmw_items(*args, **kwargs):
    """Retrieve groundwater monitoring well (gmw) items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    return _gm_items(f"{base_url}/collections/gm_gmw/items", *args, **kwargs)

216 

217 

def gmw_monitoringtube_items(*args, **kwargs):
    """Retrieve gmw monitoring tube items.

    All arguments are forwarded to :func:`_gm_items`; see its docstring for
    the supported parameters and return value.
    """
    endpoint = f"{base_url}/collections/gm_gmw_monitoringtube/items"
    return _gm_items(endpoint, *args, **kwargs)

224 

225 

def get_data_in_extent(
    extent,
    kind="gld",
    tmin=None,
    tmax=None,
    silent=False,
    combine=True,
    index=None,
    as_csv=False,
    status=None,
    observation_type=None,
    qualifier=None,
    to_path=None,
    to_zip=None,
    redownload=False,
    continue_on_error=False,
    sort=True,
    drop_duplicates=True,
    progress_callback=None,
):
    """
    Retrieve metadata and observations within a specified spatial extent.

    This function fetches monitoring well characteristics and groundwater observations
    within the given spatial extent. It can combine the data for specific observation
    types and return either individual dataframes or a combined dataframe.

    Parameters
    ----------
    extent : object
        The spatial extent ([xmin, xmax, ymin, ymax]) to filter the data. When a
        string is supplied, it is interpreted as the path of a previously saved
        zip-file to read the data from (see `to_zip`).
    kind : str, optional
        The type of observations to retrieve. Valid values are {'gld', 'gar'} for
        groundwater level dossier or groundwater analysis report. When kind is None, no
        observations are downloaded. Defaults to 'gld'.
    tmin : str or datetime, optional
        The minimum time for filtering observations. Defaults to None.
    tmax : str or datetime, optional
        The maximum time for filtering observations. Defaults to None.
    silent : bool, optional
        If True, suppresses progress logging. Defaults to False.
    combine : bool, optional
        If True, combines the tube properties, and observations into a single
        dataframe. Defaults to True.
    index : str, optional
        The column to use for indexing in the resulting dataframe. If None, the index
        will be set to a MultiIndex of the columns "gmw_bro_id" and "tube_number".
        Defaults to None.
    as_csv : bool, optional
        If True, the measurement data is requested as CSV files instead of XML files
        (only supported for 'gld'). Defaults to False.
    status : str, optional
        A status string for additional filtering. Possible values are
        "volledigBeoordeeld", "voorlopig" and "onbekend" Only valid if `kind` is 'gld'.
        Defaults to None.
    observation_type : str, optional
        An observation type string for additional filtering. Possible values are
        "reguliereMeting" and "controleMeting". Only valid if `kind` is 'gld'. Defaults
        to None.
    qualifier : str or list of str, optional
        A string or list of strings used to filter the observations. Only valid if
        `kind` is 'gld'. Defaults to None.
    to_path : str, optional
        If not None, save the downloaded files in the directory named to_path. The
        default is None.
    to_zip : str, optional
        If not None, save the downloaded files in a zip-file named to_zip. The default
        is None.
    redownload : bool, optional
        When downloaded files exist in to_path or to_zip, read from these files when
        redownload is False. If redownload is True, download the data again from the
        BRO-servers. The default is False.
    continue_on_error : bool, optional
        If True, continue after an error occurs during downloading or processing of
        individual observation data. Defaults to False.
    sort : bool, optional
        If True, sort the observations. Only used if `kind` is 'gld'. Defaults to True.
    drop_duplicates : bool, optional
        If True, drop duplicate observations based on their timestamp. Only used if
        `kind` is 'gld'. Defaults to True.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.

    Returns
    -------
    gdf : pd.DataFrame
        A dataframe containing tube properties and metadata within the specified extent.

    obs_df : pd.DataFrame, optional
        A dataframe containing the observations for the specified wells. Returned only if
        `combine` is False.
    """

    # A string extent means: read everything from an existing zip-file instead
    # of querying the BRO servers.
    if isinstance(extent, str):
        if to_zip is not None:
            raise (Exception("When extent is a string, do not supply to_zip"))
        to_zip = extent
        if not os.path.isfile(to_zip):
            raise (FileExistsError(f"The file {to_zip} is not present"))
        extent = None
        redownload = False

    # zipfile is the open archive to read from; _files collects the paths of
    # freshly downloaded files so they can be zipped afterwards.
    zipfile = None
    _files = None
    if to_zip is not None:
        if not redownload and os.path.isfile(to_zip):
            logger.info("Reading data from %s", to_zip)
            zipfile = ZipFile(to_zip)
        else:
            # Download into a directory derived from the zip name, then zip it
            # at the end. Remember whether we created the directory ourselves,
            # so it can be removed again after zipping.
            if to_path is None:
                to_path = os.path.splitext(to_zip)[0]
            remove_path_again = not os.path.isdir(to_path)
            _files = []

    if to_path is not None and not os.path.isdir(to_path):
        os.makedirs(to_path)

    # Monitoring tube properties within the extent (from server, path or zip).
    to_file = util._get_to_file("gm_gmw_monitoringtube.json", zipfile, to_path, _files)
    tubes = gmw_monitoringtube_items(
        extent, to_file=to_file, redownload=redownload, zipfile=zipfile
    )

    if index is None:
        index = ["gmw_bro_id", "tube_number"]
    tubes = tubes.set_index(index)

    # Without a kind there are no observations to download: return metadata only.
    if kind is None:
        return tubes

    # Fetch the measurement characteristics for the requested kind and apply
    # the time filters. meas_cl is the class used to parse individual dossiers.
    if kind == "gar":
        to_file = util._get_to_file("gm_gar.json", zipfile, to_path, _files)
        meas_gdf = gar_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] >= tmin]

        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["sampling_date_time"] <= tmax]
        meas_cl = gar.GroundwaterAnalysisReport
    elif kind == "gld":
        to_file = util._get_to_file("gm_gld.json", zipfile, to_path, _files)
        meas_gdf = gld_items(
            extent, to_file=to_file, redownload=redownload, zipfile=zipfile
        )
        # Keep dossiers whose research period overlaps [tmin, tmax].
        if tmin is not None:
            meas_gdf = meas_gdf[meas_gdf["research_last_date"] >= tmin]

        if tmax is not None:
            meas_gdf = meas_gdf[meas_gdf["research_first_date"] <= tmax]
        meas_cl = gld.GroundwaterLevelDossier
    else:
        raise (ValueError(f"kind='{kind}' not supported"))

    gld_kwargs = gmw._get_gld_kwargs(
        kind, tmin, tmax, qualifier, status, observation_type, sort, drop_duplicates
    )

    meas_gdf = meas_gdf.set_index("bro_id")
    measurement_objects = []
    if zipfile is None:
        desc = f"Downloading {kind}-observations"
    else:
        desc = f"Reading {kind}-observations from {to_zip}"
    if as_csv and kind != "gld":
        raise (Exception("as_csv=True is only supported for kind=='gld'"))
    if qualifier is not None and kind != "gld":
        raise (Exception("A qualifier is only supported for kind=='gld'"))
    # Name of the column that holds the measurement data for this kind.
    datcol = gmw._get_data_column(kind)
    # Download (or read from zip) the observations of every dossier.
    for bro_id in util.tqdm(meas_gdf.index, disable=silent, desc=desc):
        obsdata = gmw._download_observations_for_bro_id(
            bro_id,
            meas_cl,
            as_csv,
            zipfile,
            to_path,
            _files,
            gld_kwargs,
            redownload=redownload,
            continue_on_error=continue_on_error,
        )

        if as_csv:
            meas_dict = {"broId": bro_id, datcol: obsdata}
        else:
            meas_dict = obsdata.to_dict()
        # The foreign key links each observation record back to its tube.
        meas_dict["gm_gmw_monitoringtube_fk"] = meas_gdf.at[
            bro_id, "gm_gmw_monitoringtube_fk"
        ]
        measurement_objects.append(meas_dict)

        if progress_callback is not None:
            progress_callback(len(measurement_objects), len(meas_gdf.index))
    obs_df = pd.DataFrame(measurement_objects)

    if zipfile is not None:
        zipfile.close()
    # When data was freshly downloaded and a zip was requested, archive the
    # downloaded files (and possibly remove the temporary directory again).
    if zipfile is None and to_zip is not None:
        util._save_data_to_zip(to_zip, _files, remove_path_again, to_path)

    # only keep tubes with active measurements
    mask = tubes["gm_gmw_monitoringtube_pk"].isin(meas_gdf["gm_gmw_monitoringtube_fk"])
    tubes = tubes[mask]

    if combine and kind in ["gld", "gar"]:
        logger.info("Adding observations to tube-properties")

        if kind == "gld":
            idcol = "groundwaterLevelDossier"
        elif kind == "gar":
            idcol = "groundwaterAnalysisReport"

        # Combine per tube: all observations whose foreign key matches the
        # tube's primary key, plus the list of contributing BRO-ids.
        # NOTE(review): the loop variable below shadows the `index` parameter,
        # which has already been consumed by set_index above — harmless, but
        # worth renaming in a future refactor.
        data = {}
        ids = {}
        for index in tubes.index:
            mask = (
                obs_df["gm_gmw_monitoringtube_fk"]
                == tubes.at[index, "gm_gmw_monitoringtube_pk"]
            )
            data[index] = gmw._combine_observations(obs_df.loc[mask, datcol], kind=kind)
            ids[index] = list(obs_df.loc[mask, "broId"])
        tubes[datcol] = data
        tubes[idcol] = ids
        return tubes
    else:
        return tubes, obs_df

453 

454 

455def get_kenset_geopackage(to_file=None, layer=None, redownload=False, index="bro_id"): 

456 """ 

457 Download or read data from a geopackage-file for the whole of the Netherlands. 

458 

459 Parameters 

460 ---------- 

461 to_file : str, optional 

462 Path to save the downloaded GeoPackage file (with the extension `.gpkg`). If the 

463 file exists and `redownload` is False, it will be reused. The default is None. 

464 layer : str, optional 

465 The layer within the geopackage. Possible values are 'gm_gmw', 

466 'gm_gmw_monitoringtube', 'gm_gld', 'gm_gar', 'gm_gmn', 'gm_gmn_measuringpoint' 

467 and 'gm_gmn_reference'. The default is None, which read data from the layer 

468 "gm_gmw". 

469 redownload : bool, optional 

470 If True, forces redownload of the data even if `to_file` exists. The default is 

471 False. 

472 index : str, optional 

473 The column to use for indexing in the resulting GeoDataFrame. The default is 

474 "bro_id". 

475 

476 Returns 

477 ------- 

478 gdf : gpd.GeoDataFrame 

479 A GeoDataFrame containing the resulting objects. 

480 

481 """ 

482 url = "https://service.pdok.nl/bzk/bro-gminsamenhang-karakteristieken/atom/downloads/brogmkenset.gpkg" 

483 if to_file is not None: 

484 if redownload or not os.path.isfile(to_file): 

485 urllib.request.urlretrieve(url, to_file) 

486 url = to_file 

487 gdf = gpd.read_file(url, layer=layer) 

488 if index in gdf.columns: 

489 gdf = gdf.set_index(index) 

490 return gdf