Coverage for brodata / bro.py: 80%

596 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-13 12:57 +0000

1import logging 

2import os 

3import types 

4from xml.etree import ElementTree 

5from abc import ABC, abstractmethod 

6from io import StringIO 

7from zipfile import ZipFile 

8 

9from shapely.geometry import MultiPolygon, Point, Polygon 

10import shapely 

11import numpy as np 

12import geopandas as gpd 

13import pandas as pd 

14import requests 

15from pyproj import Transformer 

16 

17from . import util, gml 

18 

19logger = logging.getLogger(__name__) 

20 

21 

22# %% 

def _get_bro_ids_of_bronhouder(cl, bronhouder):
    """
    Request the BRO-IDs that are registered for one bronhouder.

    A GET request is sent to ``{cl._rest_url}/bro-ids?`` with ``bronhouder`` as
    query parameter. When the status code of the response is larger than 200, the
    message of the first error in the JSON response is logged and nothing is
    returned.

    Parameters
    ----------
    cl : class
        Object that provides the ``_rest_url`` attribute of the REST service.
    bronhouder : str
        The identifier of the bronhouder to retrieve the BRO-IDs for.

    Returns
    -------
    list or None
        The ``broIds`` entry of the JSON response, or None when the request failed.
    """
    response = util.get_with_rate_limit(
        f"{cl._rest_url}/bro-ids?", params={"bronhouder": bronhouder}
    )
    if response.status_code <= 200:
        return response.json()["broIds"]
    logger.error(response.json()["errors"][0]["message"])
    return None

50 

51 

def _get_characteristics(
    cl,
    extent=None,
    tmin=None,
    tmax=None,
    x=None,
    y=None,
    radius=1000.0,
    epsg=28992,
    to_file=None,
    redownload=False,
    use_all_corners_of_extent=True,
    timeout=5,
    zipfile=None,
):
    """
    Get characteristics of a set of registered objects for a given object class.

    The maximum number of objects that can be retrieved is 2000 for a single request.

    Parameters
    ----------
    cl : class
        brodata class that provides the attributes ``_rest_url``, ``_xmlns`` and
        ``_char`` used to build the request and to find the result elements.
    extent : list, tuple, shapely.geometry.Polygon or shapely.geometry.MultiPolygon, optional
        Download the characteristics within extent ([xmin, xmax, ymin, ymax]) or
        within the bounds of a polygon. The default is None.
    tmin : str or pd.Timestamp, optional
        The minimum registrationPeriod of the requested characteristics. The default is
        None.
    tmax : str or pd.Timestamp, optional
        The maximum registrationPeriod of the requested characteristics. The default is
        None.
    x : float, optional
        The x-coordinate of the center of the circle in which the characteristics are
        requested. The default is None.
    y : float, optional
        The y-coordinate of the center of the circle in which the characteristics are
        requested. The default is None.
    radius : float, optional
        The radius in meters of the circle in which the characteristics are requested.
        The value is divided by 1000 before it is sent. The default is 1000.0.
    epsg : int, optional
        The coordinate reference system of the specified extent, x or y. The default is
        28992, which is the Dutch RD-system.
    to_file : str, optional
        When not None, save the characteristics to a file with a name as specified in
        to_file. When zipfile is not None, to_file is the name of the member to read
        from the zipfile. The default is None.
    redownload : bool, optional
        When the downloaded file exists in to_file, read from this file when redownload
        is False. If redownload is True, download the data again from the BRO-servers.
        The default is False.
    use_all_corners_of_extent : bool, optional
        Because the extent by default is given in epsg 28992, some locations along the
        border of a requested extent will not be returned in the result. To solve this
        issue, when use_all_corners_of_extent is True, all four corners of the extent
        are used to calculate the minimum and maximum lat and lon values. The default is
        True.
    timeout : int or float, optional
        The timeout in seconds that is passed to the POST request. The default is 5.
    zipfile : zipfile.ZipFile, optional
        A zipfile-object. When not None, zipfile is used to read previously downloaded
        data from, and the result is filtered by extent afterwards. The default is None.

    Returns
    -------
    gpd.GeoDataFrame
        The result of ``objects_to_gdf`` for the characteristics that were read.

    Raises
    ------
    ValueError
        When the server answers with a rejection message.
    Exception
        When neither extent nor x and y are given, or when the request fails without
        a readable rejection message.

    Notes
    -----
    Retrieves the characteristics of a set of registration objects, given a set of
    characteristics (kenset).

    The characteristics summarise an object, so that a further selection can be made.
    This fits a two-step approach, in which the first step is retrieving the
    characteristics and the second step is retrieving the desired registration
    objects. The result of this operation is capped at 2000.
    """
    if zipfile is None and (
        redownload or to_file is None or not os.path.isfile(to_file)
    ):
        url = f"{cl._rest_url}/characteristics/searches?"

        data = {}
        if tmin is not None or tmax is not None:
            data["registrationPeriod"] = {}
        if tmin is not None:
            beginDate = pd.to_datetime(tmin).strftime("%Y-%m-%d")
            data["registrationPeriod"]["beginDate"] = beginDate
        if tmax is not None:
            endDate = pd.to_datetime(tmax).strftime("%Y-%m-%d")
            data["registrationPeriod"]["endDate"] = endDate
        if (x is None or y is None) and extent is None:
            raise Exception("Please specify either extent or x, y and radius")

        transformer = Transformer.from_crs(epsg, 4326)
        data["area"] = {}
        if x is not None and y is not None:
            lat, lon = transformer.transform(x, y)
            data["area"]["enclosingCircle"] = {
                "center": {"lat": lat, "lon": lon},
                "radius": radius / 1000,
            }
        if extent is not None:
            if isinstance(extent, (Polygon, MultiPolygon)):
                xmin, ymin, xmax, ymax = extent.bounds
            else:
                xmin, xmax, ymin, ymax = extent

            lat_ll, lon_ll = transformer.transform(xmin, ymin)
            lat_ur, lon_ur = transformer.transform(xmax, ymax)
            if use_all_corners_of_extent:
                # a rectangle in the source crs is not a rectangle in lat/lon, so
                # take the extremes over all four corners
                lat_ul, lon_ul = transformer.transform(xmin, ymax)
                lat_lr, lon_lr = transformer.transform(xmax, ymin)
                lat_ll = min(lat_ll, lat_lr)
                lon_ll = min(lon_ll, lon_ul)
                lat_ur = max(lat_ul, lat_ur)
                lon_ur = max(lon_lr, lon_ur)

            data["area"]["boundingBox"] = {
                "lowerCorner": {"lat": lat_ll, "lon": lon_ll},
                "upperCorner": {"lat": lat_ur, "lon": lon_ur},
            }
        req = util.post_with_rate_limit(url, json=data, timeout=timeout)
        if req.status_code > 200:
            try:
                # raises a ValueError with the reason when this is a rejection
                FileOrUrl._check_for_rejection(ElementTree.fromstring(req.text))
            except (ElementTree.ParseError, AttributeError):
                # the body is not a readable rejection document, fall through to
                # the general error below
                pass
            # if reading of the rejection message failed, raise a more general error
            raise Exception(f"Retieving data from {url} failed")

        if to_file is not None:
            # write utf-8 explicitly, so the file does not depend on the default
            # encoding of the platform
            with open(to_file, "w", encoding="utf-8") as f:
                f.write(req.text)

        # read results
        tree = ElementTree.fromstring(req.text)
    else:
        if zipfile is not None:
            with zipfile.open(to_file) as f:
                tree = ElementTree.parse(f).getroot()
        else:
            tree = ElementTree.parse(to_file).getroot()

    ns = {"xmlns": cl._xmlns}
    # container elements of which only one nested value is stored, under the name of
    # the container
    nested_value_tags = {
        "licence": "identificationLicence",
        "realisedInstallation": "installationFunction",
    }
    data = []
    for gmw in tree.findall(f".//xmlns:{cl._char}", ns):
        d = {}
        for key in gmw.attrib:
            # strip the namespace from the attribute name
            d[key.split("}", 1)[1]] = gmw.attrib[key]
        for child in gmw:
            key = util._get_tag(child)
            if len(child) == 0:
                d[key] = child.text
            elif key in ("standardizedLocation", "deliveredLocation"):
                d[key] = FileOrUrl._read_geometry(child)
            elif key.endswith(("Date", "Overview")) or key in ("startTime", "endTime"):
                d[key] = child[0].text
            elif key in ("diameterRange", "screenPositionRange"):
                for grandchild in child:
                    d[util._get_tag(grandchild)] = grandchild.text
            elif key in nested_value_tags:
                for grandchild in child:
                    for greatgrandchild in grandchild:
                        key3 = greatgrandchild.tag.split("}", 1)[1]
                        if key3 == nested_value_tags[key]:
                            d[key] = greatgrandchild.text
                        else:
                            # report the tag that is actually unknown
                            logger.warning(f"Unknown key: {key3}")
            else:
                logger.warning(f"Unknown key: {key}")
        data.append(d)

    gdf = objects_to_gdf(data)
    if zipfile is not None and extent is not None and isinstance(gdf, gpd.GeoDataFrame):
        # a zipfile can contain more objects than requested now, so select by extent
        if isinstance(extent, (Polygon, MultiPolygon)):
            gdf = gdf[gdf.intersects(extent)]
        else:
            gdf = gdf.cx[extent[0] : extent[1], extent[2] : extent[3]]
    return gdf

250 

251 

def _get_data_in_extent(
    bro_cl,
    extent=None,
    timeout=5,
    silent=False,
    to_path=None,
    to_zip=None,
    redownload=False,
    geometry=None,
    to_gdf=True,
    index="broId",
    continue_on_error=False,
    progress_callback=None,
):
    """
    Retrieve data within a specified extent for a certain bro-class.

    First the characteristics within the extent are requested, after which the data
    of every object in the characteristics is retrieved.

    Parameters
    ----------
    bro_cl : class
        brodata class.
    extent : str, list, tuple or shapely polygon, optional
        Spatial extent to query. If a string, interpreted as a zip file path.
    timeout : int, default=5
        Timeout in seconds for data retrieval requests.
    silent : bool, default=False
        If True, disables progress bars and reduces logging output.
    to_path : str, optional
        Directory path to save downloaded files. When to_zip is given and data is
        downloaded, the default is to_zip without its extension.
    to_zip : str, optional
        Path to a zip file to read from or save data to.
    redownload : bool, default=False
        If True, forces redownload of data even if files exist.
    geometry : str or object, optional
        Geometry specification for the output GeoDataFrame.
    to_gdf : bool, default=True
        If True, converts the output to a GeoDataFrame.
    index : str, default="broId"
        Column name to use as index in the output GeoDataFrame.
    continue_on_error : bool, default=False
        If True, continues processing other items if an error occurs.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.

    Returns
    -------
    gdf : GeoDataFrame
        The result of ``objects_to_gdf`` for the retrieved data objects.

    Raises
    ------
    Exception
        If invalid arguments are provided or data retrieval fails (unless
        continue_on_error is True).

    Notes
    -----
    - If `extent` is a string, it is treated as a zip file path and `to_zip` must not
      be provided.
    - Data can be read from or saved to zip archives or directories, depending on the
      provided arguments.
    - Progress is displayed unless `silent` is True.
    """
    if isinstance(extent, str):
        if to_zip is not None:
            raise Exception("When extent is a string, do not supply to_zip")
        to_zip = extent
        extent = None
        redownload = False
    zipfile = None
    _files = None
    if to_zip is not None:
        if not redownload and os.path.isfile(to_zip):
            logger.info(f"Reading data from {to_zip}")
            zipfile = ZipFile(to_zip)
        else:
            if to_path is None:
                to_path = os.path.splitext(to_zip)[0]
            # only remove the directory afterwards when it is created by this call
            remove_path_again = not os.path.isdir(to_path)
            _files = []

    try:
        # get the characteristics
        logger.info(f"Getting characteristics in extent: {extent}")
        to_file = None
        if zipfile is not None or to_path is not None:
            to_file = "characteristics.xml"
            if zipfile is None:
                to_file = os.path.join(to_path, to_file)
                if _files is not None:
                    _files.append(to_file)
        if to_path is not None and not os.path.isdir(to_path):
            os.makedirs(to_path)

        char = _get_characteristics(
            bro_cl,
            extent=extent,
            to_file=to_file,
            redownload=redownload,
            timeout=timeout,
            zipfile=zipfile,
        )

        data = _get_data_for_bro_ids(
            bro_cl,
            char.index,
            timeout=timeout,
            silent=silent,
            to_path=to_path,
            zipfile=zipfile,
            redownload=redownload,
            continue_on_error=continue_on_error,
            progress_callback=progress_callback,
            _files=_files,
        )
    finally:
        # also close the archive when reading fails
        if zipfile is not None:
            zipfile.close()
    if zipfile is None and to_zip is not None:
        util._save_data_to_zip(to_zip, _files, remove_path_again, to_path)

    return objects_to_gdf(data, geometry, to_gdf, index)

371 

372 

def _get_data_for_bro_ids(
    bro_cl,
    bro_ids,
    timeout=5,
    silent=False,
    to_path=None,
    zipfile=None,
    redownload=False,
    continue_on_error=False,
    desc=None,
    progress_callback=None,
    _files=None,
):
    """
    Retrieve the data of one or more bro-ids for a certain bro-class.

    Every bro-id is read from the zipfile when one is supplied, otherwise from an
    existing file in to_path (unless redownload is True), otherwise it is requested
    with ``bro_cl.from_bro_id``.

    Parameters
    ----------
    bro_cl : class
        brodata class.
    bro_ids : str or list of str
        The bro-ids to download data for. A single string is treated as a list with
        one bro-id.
    timeout : int, default=5
        Timeout in seconds for data retrieval requests.
    silent : bool, default=False
        If True, disables the progress bar.
    to_path : str, optional
        Directory path in which ``{bro_id}.xml`` is read or saved.
    zipfile : zipfile.ZipFile, optional
        A zipfile-object. When not None, ``{bro_id}.xml`` is read from the zipfile.
        The default is None.
    redownload : bool, default=False
        If True, forces redownload of data even if files exist.
    continue_on_error : bool, default=False
        If True, an error during the download of a bro-id is logged, and the bro-id
        is left out of the result.
    desc : str, optional
        Description for the progress bar. The default is None.
    progress_callback : function, optional
        A callback function that is called with (current, total) before every bro-id
        is processed. Defaults to None.
    _files : list, optional
        When not None, the name of every file in to_path is appended to this list.

    Returns
    -------
    data : dictionary
        A dictionary with the bro-ids as keys, and the data as values.

    Raises
    ------
    Exception
        If data retrieval fails (unless continue_on_error is True).
    """
    if isinstance(bro_ids, str):
        bro_ids = [bro_ids]
    n_ids = len(bro_ids)
    objects = {}
    progress = util.tqdm(enumerate(bro_ids), total=n_ids, disable=silent, desc=desc)
    for counter, bro_id in progress:
        if progress_callback is not None:
            progress_callback(counter, n_ids)

        # previously downloaded data in an archive
        if zipfile is not None:
            objects[bro_id] = bro_cl(f"{bro_id}.xml", zipfile=zipfile)
            continue

        # previously downloaded data in a directory
        target = None
        if to_path is not None:
            target = os.path.join(to_path, f"{bro_id}.xml")
            if _files is not None:
                _files.append(target)
            if not redownload and os.path.isfile(target):
                objects[bro_id] = bro_cl(target)
                continue

        # download from the REST service
        try:
            objects[bro_id] = bro_cl.from_bro_id(
                bro_id, to_file=target, timeout=timeout
            )
        except Exception as e:
            if not continue_on_error:
                raise
            logger.error("Error retrieving %s: %s", bro_id, e)

    return objects

462 

463 

def objects_to_gdf(
    data,
    geometry=None,
    to_gdf=True,
    index="broId",
    from_crs=None,
    to_crs=28992,
):
    """
    Convert a collection of objects or records to a (Geo)DataFrame.

    Parameters
    ----------
    data : list or dict
        A list of records that is passed to pd.DataFrame, or a dictionary of which
        every value has a ``to_dict`` method.
    geometry : str or object, optional
        The geometry of the GeoDataFrame. When None, the column "deliveredLocation"
        is used, or otherwise "standardizedLocation". When neither of these columns
        is present, a DataFrame is returned. The default is None.
    to_gdf : bool, optional
        When False, data is returned unchanged. The default is True.
    index : str or list of str, optional
        The column(s) to set as the index, when present in the (non-empty) data. The
        default is "broId".
    from_crs : optional
        The crs of the geometry. When None and geometry is None as well, 28992 is
        used for "deliveredLocation" and 4258 for "standardizedLocation". The
        default is None.
    to_crs : optional
        The crs of the result. The geometry is transformed when to_crs differs from
        from_crs. The default is 28992.

    Returns
    -------
    gpd.GeoDataFrame, pd.DataFrame or the type of data
        The converted data.
    """
    if not to_gdf:
        return data

    if isinstance(data, list):
        records = data
    else:
        records = [data[name].to_dict() for name in data]
    frame = pd.DataFrame(records)

    if index is not None and not frame.empty:
        if isinstance(index, str):
            if index in frame.columns:
                frame = frame.set_index(index)
        elif all(column in frame.columns for column in index):
            # we assume index is an iterable (list), to form a MultiIndex
            frame = frame.set_index(index)

    if geometry is None:
        # location columns in order of preference, with the crs they are assumed in
        candidates = (("deliveredLocation", 28992), ("standardizedLocation", 4258))
        for column, default_crs in candidates:
            if column in frame:
                geometry = column
                if from_crs is None:
                    from_crs = default_crs
                break
        else:
            return frame

    result = gpd.GeoDataFrame(frame, geometry=geometry, crs=from_crs)
    reproject = to_crs is not None and from_crs is not None and to_crs != from_crs
    return result.to_crs(to_crs) if reproject else result

501 

502 

503class FileOrUrl(ABC): 

504 """ 

505 A class for parsing and handling XML data from files, URLs, or zipped files. 

506 

507 Supports fetching XML data from local files or remote URLs. It also handles 

508 rejection checks and extracts data into object attributes. Data is parsed 

509 recursively and can be converted to a dictionary. 

510 

511 Attributes: 

512 Instance variables are dynamically set based on the XML content. 

513 

514 Methods: 

515 __init__(url_or_file, zipfile=None, timeout=5, to_file=None, **kwargs): 

516 Parses XML from a URL, file, or zipped file, and initializes the object. 

517 

518 from_bro_id(bro_id, **kwargs): 

519 Fetches XML data from a REST service based on a given 'bro_id'. 

520 

521 to_dict(): 

522 Converts instance attributes to a dictionary, excluding methods and 

523 private attributes. 

524 """ 

525 

526 def __init__( 

527 self, 

528 url_or_file, 

529 zipfile=None, 

530 timeout=5, 

531 to_file=None, 

532 redownload=True, 

533 max_retries=2, 

534 **kwargs, 

535 ): 

536 # CSV 

537 if url_or_file.endswith(".csv"): 

538 if zipfile is not None: 

539 self._read_csv(StringIO(zipfile.read(url_or_file)), **kwargs) 

540 else: 

541 self._read_csv(url_or_file, **kwargs) 

542 # XML or URL 

543 else: 

544 if zipfile is not None: 

545 root = ElementTree.fromstring(zipfile.read(url_or_file)) 

546 elif url_or_file.startswith("http"): 

547 if redownload or to_file is None or not os.path.isfile(to_file): 

548 params = {} 

549 if "tmin" in kwargs and kwargs["tmin"] is not None: 

550 tmin = kwargs.pop("tmin") 

551 tmin = pd.to_datetime(tmin).strftime("%Y-%m-%d") 

552 params["observationPeriodBeginDate"] = tmin 

553 if "tmax" in kwargs and kwargs["tmax"] is not None: 

554 tmax = kwargs.pop("tmax") 

555 tmax = pd.to_datetime(tmax).strftime("%Y-%m-%d") 

556 params["observationPeriodEndDate"] = tmax 

557 if max_retries > 1: 

558 adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) 

559 session = requests.Session() 

560 session.mount("https://", adapter) 

561 util.wait_for_rate_limit(url_or_file) 

562 req = session.get(url_or_file, params=params, timeout=timeout) 

563 else: 

564 req = util.get_with_rate_limit( 

565 url_or_file, params=params, timeout=timeout 

566 ) 

567 if not req.ok: 

568 if req.reason == "Bad Request": 

569 root = ElementTree.fromstring(req.text) 

570 FileOrUrl._check_for_rejection(root) 

571 raise Exception(f"Retrieving data from {url_or_file} failed") 

572 if to_file is not None: 

573 with open(to_file, "w") as f: 

574 f.write(req.text) 

575 root = ElementTree.fromstring(req.text) 

576 FileOrUrl._check_for_rejection(root) 

577 else: 

578 tree = ElementTree.parse(to_file) 

579 root = tree.getroot() 

580 else: 

581 tree = ElementTree.parse(url_or_file) 

582 root = tree.getroot() 

583 

584 self._read_contents(root, **kwargs) 

585 

def __repr__(self):
    """
    Return the representation of the object, made by ``util._format_repr``.

    The representation is built from the broId (when the object has one) and from
    the x- and y-coordinate of deliveredLocation (when this is a Point).
    """
    props = {}
    if hasattr(self, "broId"):
        props["broId"] = getattr(self, "broId")
    location = getattr(self, "deliveredLocation", None)
    if isinstance(location, Point):
        props["x"] = location.x
        props["y"] = location.y
    return util._format_repr(self, props)

599 

600 @abstractmethod 

601 def _read_contents(self, tree, **kwargs): 

602 """Each subclass must overload _read_contents to parse XML result.""" 

603 

604 def _read_csv(self, *args, **kwargs): 

605 raise NotImplementedError( 

606 f"Class {self.__class__.__name__} does not support reading from CSV files." 

607 ) 

608 

609 @classmethod 

610 def from_bro_id(cls, bro_id, **kwargs): 

611 if not hasattr(cls, "_rest_url"): 

612 raise (NotImplementedError(f"No rest-service defined for {cls.__name__}")) 

613 

614 return cls(f"{cls._rest_url}/objects/{bro_id}", **kwargs) 

615 

616 def to_dict(self): 

617 d = {} 

618 for attrib in dir(self): 

619 if attrib.startswith("_"): 

620 continue 

621 value = getattr(self, attrib) 

622 if type(value) is types.MethodType: 

623 continue 

624 d[attrib] = value 

625 return d 

626 

627 @staticmethod 

628 def _check_for_rejection(tree): 

629 ns = {"brocom": "http://www.broservices.nl/xsd/brocommon/3.0"} 

630 response_type = tree.find("brocom:responseType", ns) 

631 if response_type.text == "rejection": 

632 criterionError = tree.find("brocom:criterionError", ns) 

633 if criterionError is None: 

634 msg = tree.find("brocom:rejectionReason", ns).text 

635 else: 

636 msg = criterionError.find("brocom:specification", ns).text 

637 raise (ValueError(msg)) 

638 

@staticmethod
def _get_tag(node):
    """Return the tag of node, as determined by ``util._get_tag``."""
    tag = util._get_tag(node)
    return tag

642 

643 def _get_main_object(self, tree, object_name=None, ns=None): 

644 if object_name is None: 

645 object_name = self._object_name 

646 if ns is None: 

647 ns = {"xmlns": self._xmlns} 

648 if isinstance(object_name, list): 

649 for name in object_name: 

650 objects = tree.findall(f".//xmlns:{name}", ns) 

651 if objects: 

652 break 

653 else: 

654 objects = tree.findall(f".//xmlns:{object_name}", ns) 

655 if len(objects) > 1: 

656 raise (Exception(f"Only one {object_name} supported")) 

657 elif len(objects) == 0: 

658 raise (Exception(f"No {object_name} found")) 

659 return objects[0] 

660 

def _warn_unknown_tag(self, tag, parent=None):
    """
    Report a tag that is not supported, by means of ``util._warn_unknown_tag``.

    The name of the class and the broId of the object (an empty string when there is
    none) are added to the call.
    """
    util._warn_unknown_tag(
        tag, parent, self.__class__.__name__, getattr(self, "broId", "")
    )

665 

666 def _raise_assumed_single(self, key): 

667 raise ValueError( 

668 f"Assumed there is only one {key} in {self.__class__.__name__} {getattr(self, 'broId', '')}" 

669 ) 

670 

671 def _check_single_child_with_tag(self, node, tag): 

672 return len(node) == 1 and self._get_tag(node[0]) == tag 

673 

674 def _read_children_of_children(self, node, d=None, to_float=None, to_int=None): 

675 if to_float is not None and isinstance(to_float, str): 

676 to_float = [to_float] 

677 if to_int is not None and isinstance(to_int, str): 

678 to_int = [to_int] 

679 if len(node) == 0: 

680 key = node.tag.split("}", 1)[1] 

681 if d is None: 

682 setattr(self, key, FileOrUrl._parse_text(node, key, to_float, to_int)) 

683 else: 

684 d[key] = FileOrUrl._parse_text(node, key, to_float, to_int) 

685 else: 

686 for child in node: 

687 self._read_children_of_children( 

688 child, d=d, to_float=to_float, to_int=to_int 

689 ) 

690 

691 @staticmethod 

692 def _parse_text(node, key, to_float=None, to_int=None): 

693 if to_float is not None and key in to_float: 

694 return FileOrUrl._parse_float(node) 

695 if to_int is not None and key in to_int: 

696 return int(node.text) 

697 return node.text 

698 

699 @staticmethod 

700 def _parse_float(node): 

701 if node.text is None: 

702 return np.nan 

703 return float(node.text) 

704 

def _read_delivered_location(self, node):
    """
    Read the delivered location, and set the values as attributes of the object.

    The geometry of the location is stored as deliveredLocation. The other supported
    children are stored under the name of their tag. A warning is given for every
    other tag.
    """
    for child in node:
        key = self._get_tag(child)
        if key == "location":
            self.deliveredLocation = self._read_geometry(child)
        elif key == "horizontalPositioningDate":
            self.horizontalPositioningDate = self._read_date(child)
        elif key == "horizontalPositioningMethod":
            self.horizontalPositioningMethod = child.text
        elif key == "horizontalPositioningOperator":
            self.horizontalPositioningOperator = self._read_operator(child)
        else:
            self._warn_unknown_tag(key)

718 

def _read_operator(self, node):
    """
    Read the registration numbers of an operator.

    Returns
    -------
    dict
        The text of the children chamberOfCommerceNumber and
        europeanCompanyRegistrationNumber, by tag. A warning is given for every
        other tag.
    """
    number_tags = ("chamberOfCommerceNumber", "europeanCompanyRegistrationNumber")
    operator = {}
    for child in node:
        key = self._get_tag(child)
        if key not in number_tags:
            self._warn_unknown_tag(key)
            continue
        operator[key] = child.text
    return operator

731 

def _read_standardized_location(self, node):
    """
    Read the standardized location, and set the values as attributes of the object.

    The geometry of the location is stored as standardizedLocation, and the text of
    coordinateTransformation under its own name. A warning is given for every other
    tag.
    """
    for child in node:
        key = self._get_tag(child)
        if key == "location":
            self.standardizedLocation = self._read_geometry(child)
        elif key == "coordinateTransformation":
            self.coordinateTransformation = child.text
        else:
            self._warn_unknown_tag(key)

741 

def _read_delivered_vertical_position(self, node, d=None):
    """
    Read the delivered vertical position.

    The verticalPositioningDate is read as a date, the offset as a float (NaN when
    empty) and the verticalPositioningOperator as a dictionary. Of every other child
    the text is stored.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        The element to read the children of.
    d : dict, optional
        The dictionary to add the values to, by tag. The default is None, which
        sets the values as attributes of the object.
    """
    for child in node:
        key = self._get_tag(child)
        if key == "verticalPositioningDate":
            value = self._read_date(child)
        elif key == "offset":
            value = self._parse_float(child)
        elif key == "verticalPositioningOperator":
            value = self._read_operator(child)
        else:
            value = child.text

        if d is not None:
            d[key] = value
        else:
            setattr(self, key, value)

761 

def _read_lifespan(self, node, d=None):
    """
    Read the dates of a lifespan: startDate, startTime and endTime.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        The element to read the children of. A warning is given for every child
        with another tag.
    d : dict, optional
        The dictionary to add the dates to, by tag. The default is None, which sets
        the dates as attributes of the object.
    """
    date_tags = ("startDate", "startTime", "endTime")
    for child in node:
        key = self._get_tag(child)
        if key not in date_tags:
            self._warn_unknown_tag(key)
            continue
        value = self._read_date(child)
        if d is not None:
            d[key] = value
        else:
            setattr(self, key, value)

772 

def _read_validity_period(self, node, d=None):
    """
    Read the dates of a validity period: startValidity and endValidity.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        The element to read the children of. A warning is given for every child
        with another tag.
    d : dict, optional
        The dictionary to add the dates to, by tag. The default is None, which sets
        the dates as attributes of the object.
    """
    date_tags = ("startValidity", "endValidity")
    for child in node:
        key = self._get_tag(child)
        if key not in date_tags:
            self._warn_unknown_tag(key)
            continue
        value = self._read_date(child)
        if d is not None:
            d[key] = value
        else:
            setattr(self, key, value)

788 

@staticmethod
def _read_geometry(node):
    """
    Read the geometry in the single child of node.

    A "pos" child is read as a Point. Every other child is parsed by
    ``gml.parse_geometry``. The two coordinates are swapped when the srsName (of node
    for a "pos", of the child otherwise) is EPSG:4258.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element
        The element with exactly one child.

    Returns
    -------
    shapely geometry
        The geometry that was read.
    """
    assert len(node) == 1
    element = node[0]
    if element.tag.split("}")[-1] == "pos":
        first, second = (float(part) for part in element.text.strip().split())
        if FileOrUrl._is_epsg_4258(node):
            return Point(second, first)
        return Point(first, second)

    geometry = gml.parse_geometry(element)
    if not FileOrUrl._is_epsg_4258(element):
        return geometry
    return shapely.ops.transform(lambda x, y: (y, x), geometry)

802 

803 @staticmethod 

804 def _is_epsg_4258(node): 

805 srsName = "urn:ogc:def:crs:EPSG::4258" 

806 return "srsName" in node.attrib and node.attrib["srsName"] == srsName 

807 

808 @staticmethod 

809 def _read_date(node): 

810 ns = {"brocom": "http://www.broservices.nl/xsd/brocommon/3.0"} 

811 date = node.find("brocom:date", ns) 

812 if date is None: 

813 date = node.find("brocom:yearMonth", ns) 

814 if date is None: 

815 date = node.find("brocom:year", ns) 

816 if date is None: 

817 return pd.NaT 

818 return pd.to_datetime(date.text) 

819 

820 @staticmethod 

821 def _read_time_instant(node): 

822 ns = {"gml": "http://www.opengis.net/gml/3.2"} 

823 time_instant = node.find("gml:TimeInstant", ns) 

824 time_position = time_instant.find("gml:timePosition", ns) 

825 return pd.to_datetime(time_position.text) 

826 

def _read_data_array(self, node):
    """
    Read the values of a data array, using the separators in its text encoding.

    The attributes of the swe TextEncoding in the encoding child supply the decimal,
    token and block separator, with which the text of the values child is read by
    pd.read_csv. The children elementCount and elementType are skipped, and a warning
    is given for every other tag.

    Returns
    -------
    pd.DataFrame or None
        The values without a header, in which -999999 is read as NaN. None when
        node does not have a values child.
    """
    values = None
    for child in node:
        key = self._get_tag(child)
        if key in ("elementCount", "elementType"):
            continue
        if key == "encoding":
            ns = {"swe": "http://www.opengis.net/swe/2.0"}
            encoding = child.find("swe:TextEncoding", ns).attrib.copy()
        elif key == "values":
            values = pd.read_csv(
                StringIO(child.text),
                header=None,
                decimal=encoding["decimalSeparator"],
                sep=encoding["tokenSeparator"],
                lineterminator=encoding["blockSeparator"],
                na_values=-999999,
            )
        else:
            self._warn_unknown_tag(key)
    return values

851 

def _read_descriptive_borehole_log(self, node):
    """
    Read a descriptive borehole log, including its layers.

    Returns
    -------
    dict
        The text of the supported children of node, by tag. When node contains
        layers, the entry "layer" is a pd.DataFrame with one row for every layer. A
        warning is given for every tag that is not supported.
    """
    log_text_tags = (
        "descriptionQuality",
        "describedSamplesQuality",
        "continuouslySampled",
        "descriptionLocation",
        "describedMaterial",
        "sampleMoistness",
        "boreholeLogChecked",
    )
    layer_float_tags = ("upperBoundary", "lowerBoundary")
    layer_text_tags = (
        "upperBoundaryDetermination",
        "lowerBoundaryDetermination",
        "anthropogenic",
        "activityType",
        "specialMaterial",
        "slant",
        "internalStructureIntact",
        "bedded",
        "compositeLayer",
        "bedding",
        "rooted",
        "identification",
    )
    # containers of a layer, with the tags of the children of which the text is read
    layer_container_tags = {
        "soilType": ("soilName", "sandMedianClass"),
        "particularConstituent": ("ConstituentType",),
    }

    d = {}
    for child in node:
        key = self._get_tag(child)
        if key in log_text_tags:
            d[key] = child.text
        elif key == "layer":
            d.setdefault("layer", [])
            # the properties can be wrapped in a single Layer-element
            if self._check_single_child_with_tag(child, "Layer"):
                layer_node = child[0]
            else:
                layer_node = child
            layer = {}
            for grandchild in layer_node:
                key2 = self._get_tag(grandchild)
                if key2 in layer_float_tags:
                    layer[key2] = self._parse_float(grandchild)
                elif key2 in layer_text_tags:
                    layer[key2] = grandchild.text
                elif key2 == "soil":
                    self._read_soil(grandchild, layer)
                elif key2 == "rock":
                    self._read_rock(grandchild, layer)
                elif key2 in layer_container_tags:
                    for greatgrandchild in grandchild:
                        key3 = self._get_tag(greatgrandchild)
                        if key3 in layer_container_tags[key2]:
                            layer[key3] = greatgrandchild.text
                else:
                    self._warn_unknown_tag(key2)
            d["layer"].append(layer)
        else:
            self._warn_unknown_tag(key)

    if "layer" in d:
        d["layer"] = pd.DataFrame(d["layer"])
    return d

913 

def _read_soil(self, node, d):
    """
    Copy the soil description found under ``node`` into the mapping ``d``.

    Each direct child of ``node`` is identified with ``self._get_tag`` and then
    handled in one of these ways:

    - plain text elements are stored as ``d[tag] = element.text``;
    - numeric elements are converted with ``float`` before being stored;
    - elements that may occur more than once are collected in a list;
    - container elements are flattened: the values of their (grand)children are
      stored directly in ``d`` under the tag of the (grand)child.

    Every tag that is not recognised, at any level, is reported through
    ``self._warn_unknown_tag``.

    Parameters
    ----------
    node : iterable of XML elements
        The element whose children describe the soil.
    d : dict
        Mapping that is updated in place with the values that were read.

    Returns
    -------
    None
    """
    # elements whose text is stored unchanged
    text_tags = (
        "geotechnicalSoilName",
        "soilNameNEN5104",
        "gravelContentClassNEN5104",
        "organicMatterContentClassNEN5104",
        "colour",
        "mottled",
        "interbedding",
        "carbonateContentClass",
        "organicMatterContentClass",
        "crossBedding",
        "gradedBedding",
        "mixed",
        "mixingType",
        "gravelMedianClass",
        "fineGravelContentClass",
        "mediumCoarseGravelContentClass",
        "veryCoarseGravelContentClass",
        "sandMedianClass",
        "sandSortingNEN5104",
        "peatType",
        "organicSoilTexture",
        "fineSoilConsistency",
        "organicSoilConsistency",
        "peatTensileStrength",
        "geotechnicalDepositionalCharacteristic",
        "depositionalAge",
        "classificationLoamBased",
        "pedologicalSoilName",
        "structureType",
        "estimatedDensity",
        "ripeningClass",
        "vertic",
        "containsShellMatter",
        "containsGravel",
        "gravelContentClass",
        "chunk",
        "moistness",
    )
    # elements whose text is stored as a float
    float_tags = ("estimatedOrganicMatterContent", "estimatedClayContent")
    # elements that can be repeated; their texts are gathered in a list
    list_tags = ("tertiaryConstituent", "dispersedInhomogeneity")
    # one-level containers: tag -> (accepted member tags, convert to float?)
    containers = {
        "grainshape": (("sizeFraction", "angularity", "sphericity"), False),
        "incompleteFractionSpecification": (
            ("estimatedOrganicMatterContent", "estimatedClayContent"),
            True,
        ),
        "stain": (("stainColour", "mottlingDensity", "evenlyMottled"), False),
        "soilAggregate": (
            (
                "aggregateShape",
                "angularity",
                "roughness",
                "aggregateLengthClass",
                "poreAbundanceClass",
                "horizontallyAligned",
                "disintegrating",
            ),
            False,
        ),
        "munsellColour": (("munsellHue", "munsellValue", "munsellChroma"), False),
        "sandFraction": (("sandMedianClass", "sandSorting"), False),
    }
    # members of fractionDistribution and of its nested fineFractionDistribution
    fraction_tags = (
        "estimatedGravelContent",
        "estimatedShellMatterContent",
        "estimatedOrganicMatterContent",
        "estimatedFineFractionContent",
    )
    fine_fraction_tags = (
        "estimatedClayContent",
        "estimatedSiltContent",
        "estimatedSandContent",
    )

    for element in node:
        tag = self._get_tag(element)
        if tag in text_tags:
            d[tag] = element.text
        elif tag in float_tags:
            d[tag] = float(element.text)
        elif tag in list_tags:
            if tag not in d:
                d[tag] = []
            d[tag].append(element.text)
        elif tag in containers:
            accepted, numeric = containers[tag]
            for member in element:
                member_tag = self._get_tag(member)
                if member_tag not in accepted:
                    self._warn_unknown_tag(member_tag)
                elif numeric:
                    d[member_tag] = float(member.text)
                else:
                    d[member_tag] = member.text
        elif tag == "fractionDistribution":
            for member in element:
                member_tag = self._get_tag(member)
                if member_tag in fraction_tags:
                    d[member_tag] = float(member.text)
                elif member_tag == "fineFractionDistribution":
                    # one level deeper: clay / silt / sand percentages
                    for leaf in member:
                        leaf_tag = self._get_tag(leaf)
                        if leaf_tag in fine_fraction_tags:
                            d[leaf_tag] = float(leaf.text)
                        else:
                            self._warn_unknown_tag(leaf_tag)
                else:
                    self._warn_unknown_tag(member_tag)
        else:
            self._warn_unknown_tag(tag)

1038 

def _read_rock(self, node, d):
    """
    Copy the rock description found under ``node`` into the mapping ``d``.

    Plain elements are stored as ``d[tag] = element.text``. Elements that may
    occur more than once are collected in a list. The children of
    ``weatheringDegree`` are flattened into ``d`` under their own tag. Tags
    that are not recognised are reported through ``self._warn_unknown_tag``.

    Parameters
    ----------
    node : iterable of XML elements
        The element whose children describe the rock.
    d : dict
        Mapping that is updated in place with the values that were read.

    Returns
    -------
    None
    """
    text_tags = (
        "rockType",
        "cementType",
        "colour",
        "carbonateContentClass",
        "crossBedding",
        "gradedBedding",
        "voidsPresent",
        "voidDistribution",
        "stability",
        "strengthClass",
        "weathered",
    )
    list_tags = ("tertiaryRockConstituent", "dispersedInhomogeneity")
    weathering_tags = ("discolouration", "disintegration", "decomposition")

    for element in node:
        tag = self._get_tag(element)

        if tag in text_tags:
            d[tag] = element.text
            continue

        if tag in list_tags:
            # repeated element: gather all occurrences
            if tag not in d:
                d[tag] = []
            d[tag].append(element.text)
            continue

        if tag != "weatheringDegree":
            self._warn_unknown_tag(tag)
            continue

        # weatheringDegree: store each known member directly in d
        for member in element:
            member_tag = self._get_tag(member)
            if member_tag in weathering_tags:
                d[member_tag] = member.text
            else:
                self._warn_unknown_tag(member_tag)

1069 

1070 

def get_bronhouders(index="kvk", **kwargs):
    """
    Download the list of bronhouders (data owners) from bromonitor.nl.

    Parameters
    ----------
    index : string, optional
        Name of the column that becomes the index of the returned DataFrame. Use
        None to keep the default index. The default is "kvk".
    **kwargs : dict
        Extra keyword arguments, forwarded to pandas.read_json().

    Returns
    -------
    df : pd.DataFrame
        A Pandas DataFrame, with one row per bronhouder.

    """
    bronhouders = pd.read_json(
        "https://bromonitor.nl/api/rapporten/bronhouders", **kwargs
    )
    if index is None:
        return bronhouders
    return bronhouders.set_index(index)

1093 

1094 

def get_brondocumenten_per_bronhouder(index=("kvk", "type"), timeout=5, **kwargs):
    """
    Download the number of source documents per bronhouder (data owner).

    Parameters
    ----------
    index : str, tuple or list, optional
        The column(s) used as index of the returned DataFrame. A tuple is converted
        to a list before it is passed to ``set_index``. Use None to keep the default
        index. The default is "kvk" and "type".
    timeout : int or float, optional
        Timeout in seconds, forwarded to the request. The default is 5.
    **kwargs : dict
        Extra keyword arguments, forwarded to pandas.DataFrame().

    Returns
    -------
    df : pd.DataFrame
        A Pandas DataFrame, with one row per combination of bronhouder and data-type.

    Raises
    ------
    Exception
        When the response of the server is not ok.

    """
    response = util.get_with_rate_limit(
        "https://bromonitor.nl/api/rapporten/brondocumenten-per-bronhouder",
        timeout=timeout,
    )
    if not response.ok:
        raise Exception("Download of brondocumenten per bronhouder failed")

    df = pd.DataFrame(response.json()["data"], **kwargs)

    if "key" in df.columns:
        # every entry of "key" is expanded into its own columns, which are placed
        # in front of the remaining columns
        key_columns = pd.DataFrame(list(df["key"]))
        other_columns = df.drop(columns="key")
        df = pd.concat((key_columns, other_columns), axis=1)

    if index is None:
        return df
    index_columns = list(index) if isinstance(index, tuple) else index
    return df.set_index(index_columns)

1128 

1129 

1130def get_kvk_df(fn_bronhouder_kvk=None): 

1131 """ 

1132 Read manually saved table of KVK and Organisatienaam to DataFrame. 

1133 

1134 from https://basisregistratieondergrond.nl/service-contact/formulieren/aangemeld-bro/ 

1135 :param fn_bronhouder_kvk: str, filename of the file with bronhouder and kvk numbers 

1136 :return: pandas DataFrame with kvk as index and column 'Organisatienaam' and 'Bronhouder' 

1137 """ 

1138 if fn_bronhouder_kvk is None: 

1139 fn_bronhouder_kvk = os.path.join( 

1140 os.path.dirname(__file__), "data", "bronhouder_kvk.txt" 

1141 ) 

1142 

1143 df_bron_kvk = pd.read_csv( 

1144 fn_bronhouder_kvk, 

1145 sep=";", # is a dummy value, data will be split later on the last | sign 

1146 dtype=str, 

1147 header=None, 

1148 names=["all_data"], 

1149 skipinitialspace=True, 

1150 comment="#", 

1151 ) 

1152 

1153 # split column all_data into bronhouder and kvk, using last | sign; both as string type 

1154 # mind that index has string type, as that is format provided in brodata downloads 

1155 df_bron_kvk[["Organisatienaam", "KVK-nummer"]] = ( 

1156 df_bron_kvk["all_data"].str.rsplit("|", n=1, expand=True).astype(str) 

1157 ) 

1158 df_bron_kvk = df_bron_kvk.drop(columns=["all_data"]) 

1159 

1160 # add column Bronhouder, value is True when (​B) in kvk 

1161 df_bron_kvk["Bronhouder"] = False 

1162 

1163 bronhouder_pattern = r"[(​B)|(B)]" 

1164 df_bron_kvk.loc[ 

1165 df_bron_kvk["KVK-nummer"].str.contains(bronhouder_pattern, regex=True), 

1166 "Bronhouder", 

1167 ] = True 

1168 # clean up kvk 

1169 df_bron_kvk["KVK-nummer"] = ( 

1170 df_bron_kvk["KVK-nummer"] 

1171 .str.replace(bronhouder_pattern, "", regex=True) 

1172 .str.strip() 

1173 ) 

1174 

1175 # remove leading and trailing whitespace from all columns 

1176 df_bron_kvk = df_bron_kvk.map(lambda x: x.strip() if isinstance(x, str) else x) 

1177 

1178 # make kvk index 

1179 df_bron_kvk.set_index("KVK-nummer", inplace=True) 

1180 

1181 return df_bron_kvk