Coverage for brodata / bro.py: 79%

595 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-20 14:37 +0000

1import logging 

2import os 

3import types 

4from xml.etree import ElementTree 

5from abc import ABC, abstractmethod 

6from io import StringIO 

7from zipfile import ZipFile 

8 

9from shapely.geometry import MultiPolygon, Point, Polygon 

10import shapely 

11import numpy as np 

12import geopandas as gpd 

13import pandas as pd 

14import requests 

15from pyproj import Transformer 

16 

17from . import util, gml 

18 

19logger = logging.getLogger(__name__) 

20 

21 

22# %% 

def _get_bro_ids_of_bronhouder(cl, bronhouder, timeout=5):
    """
    Retrieve list of BRO (Basisregistratie Ondergrond) IDs for a given bronhouder.

    This function sends a GET request to the REST API to fetch the BRO IDs associated
    with the specified bronhouder. If the request is unsuccessful, it logs an error
    message.

    Parameters
    ----------
    cl : class
        The brodata class that defines the _rest_url attribute.
    bronhouder : str
        The identifier for the bronhouder to retrieve the associated BRO IDs.
    timeout : int or float, optional
        The number of seconds to wait for a response of the server. The default
        is 5.

    Returns
    -------
    list or None
        A list of BRO IDs if the request is successful. Returns `None` if the request
        fails.
    """
    url = f"{cl._rest_url}/bro-ids?"
    params = dict(bronhouder=bronhouder)
    # supply a timeout so the call cannot hang indefinitely on a stalled server
    req = requests.get(url, params=params, timeout=timeout)
    if req.status_code > 200:
        logger.error(req.json()["errors"][0]["message"])
        return None
    bro_ids = req.json()["broIds"]
    return bro_ids

50 

51 

def _get_characteristics(
    cl,
    extent=None,
    tmin=None,
    tmax=None,
    x=None,
    y=None,
    radius=1000.0,
    epsg=28992,
    to_file=None,
    redownload=False,
    use_all_corners_of_extent=True,
    timeout=5,
    zipfile=None,
):
    """
    Get characteristics of a set of registered objects for a given object class.

    The maximum number of objects that can be retrieved is 2000 for a single request.

    Parameters
    ----------
    cl : class
        The brodata class that defines the _rest_url, _xmlns and _char attributes.
    extent : list, tuple, shapely.geometry.Polygon or shapely.geometry.MultiPolygon, optional
        Download the characteristics within extent ([xmin, xmax, ymin, ymax]) or
        within the bounds of a polygon. The default is None.
    tmin : str or pd.Timestamp, optional
        The minimum registrationPeriod of the requested characteristics. The default is
        None.
    tmax : str or pd.Timestamp, optional
        The maximum registrationPeriod of the requested characteristics. The default is
        None.
    x : float, optional
        The x-coordinate of the center of the circle in which the characteristics are
        requested. The default is None.
    y : float, optional
        The y-coordinate of the center of the circle in which the characteristics are
        requested. The default is None.
    radius : float, optional
        The radius in meters of the center of the circle in which the characteristics
        are requested. The default is 1000.0.
    epsg : str, optional
        The coordinate reference system of the specified extent, x or y, and of the
        resulting GeoDataFrame. The default is 28992, which is the Dutch RD-system.
    to_file : str, optional
        When not None, save the characteristics to a file with a name as specified in
        to_file. The default is None.
    redownload : bool, optional
        When the downloaded file exists in to_file, read from this file when redownload
        is False. If redownload is True, download the data again from the BRO-servers.
        The default is False.
    use_all_corners_of_extent : bool, optional
        Because the extent by default is given in epsg 28992, some locations along the
        border of a requested extent will not be returned in the result. To solve this
        issue, when use_all_corners_of_extent is True, all four corners of the extent
        are used to calculate the minimum and maximum lat and lon values. The default is
        True.
    timeout : int or float, optional
        A number indicating how many seconds to wait for the client to make a connection
        and/or send a response. The default is 5.
    zipfile : zipfile.ZipFile, optional
        A zipfile-object. When not None, zipfile is used to read previously downloaded
        data from. The default is None.

    Returns
    -------
    gpd.GeoDataFrame
        A GeoDataFrame containing the characteristics.

    Notes
    -----
    Haalt de karakteristieken op van een set van registratie objecten, gegeven een
    kenmerkenverzameling (kenset).

    De karakteristieken geven een samenvatting van een object zodat een verdere selectie
    gemaakt kan worden. Het past in een tweetrapsbenadering, waarbij de eerste stap
    bestaat uit het ophalen van de karakteristieken en de 2e stap uit het ophalen van de
    gewenste registratie objecten. Het resultaat van deze operatie is gemaximaliseerd op
    2000.
    """
    # download the data unless it can be read from a zipfile or an existing to_file
    if zipfile is None and (
        redownload or to_file is None or not os.path.isfile(to_file)
    ):
        url = f"{cl._rest_url}/characteristics/searches?"

        # build the JSON payload for the search request
        data = {}
        if tmin is not None or tmax is not None:
            data["registrationPeriod"] = {}
            if tmin is not None:
                beginDate = pd.to_datetime(tmin).strftime("%Y-%m-%d")
                data["registrationPeriod"]["beginDate"] = beginDate
            if tmax is not None:
                endDate = pd.to_datetime(tmax).strftime("%Y-%m-%d")
                data["registrationPeriod"]["endDate"] = endDate
        if (x is None or y is None) and extent is None:
            raise (Exception("Please specify either extent or x, y and radius"))

        # the REST api expects coordinates in lat/lon (EPSG:4326)
        transformer = Transformer.from_crs(epsg, 4326)
        data["area"] = {}
        if x is not None and y is not None:
            lat, lon = transformer.transform(x, y)
            data["area"]["enclosingCircle"] = {
                "center": {"lat": lat, "lon": lon},
                # the api expects the radius in kilometers
                "radius": radius / 1000,
            }
        if extent is not None:
            if isinstance(extent, (Polygon, MultiPolygon)):
                # note bounds order: (xmin, ymin, xmax, ymax)
                xmin, ymin, xmax, ymax = extent.bounds
            else:
                # extent order: (xmin, xmax, ymin, ymax)
                xmin, xmax, ymin, ymax = extent

            lat_ll, lon_ll = transformer.transform(xmin, ymin)
            lat_ur, lon_ur = transformer.transform(xmax, ymax)
            if use_all_corners_of_extent:
                # widen the lat/lon bounding box with the other two corners, so no
                # locations along the border of the extent are missed
                lat_ul, lon_ul = transformer.transform(xmin, ymax)
                lat_lr, lon_lr = transformer.transform(xmax, ymin)
                lat_ll = min(lat_ll, lat_lr)
                lon_ll = min(lon_ll, lon_ul)
                lat_ur = max(lat_ul, lat_ur)
                lon_ur = max(lon_lr, lon_ur)

            data["area"]["boundingBox"] = {
                "lowerCorner": {"lat": lat_ll, "lon": lon_ll},
                "upperCorner": {"lat": lat_ur, "lon": lon_ur},
            }
        req = requests.post(url, json=data, timeout=timeout)
        if req.status_code > 200:
            # try to extract the rejection message from the response first
            root = ElementTree.fromstring(req.text)
            FileOrUrl._check_for_rejection(root)
            # if reading of the rejection message failed, raise a more general error
            # NOTE(review): "Retieving" is a typo in this runtime message
            raise (Exception((f"Retieving data from {url} failed")))

        if to_file is not None:
            with open(to_file, "w") as f:
                f.write(req.text)

        # read results
        tree = ElementTree.fromstring(req.text)
    else:
        # read previously downloaded data, either from a zipfile or from to_file
        if zipfile is not None:
            with zipfile.open(to_file) as f:
                tree = ElementTree.parse(f).getroot()
        else:
            tree = ElementTree.parse(to_file).getroot()

    # parse every characteristics-element (tag cl._char) into a dictionary
    ns = {"xmlns": cl._xmlns}
    data = []
    for gmw in tree.findall(f".//xmlns:{cl._char}", ns):
        d = {}
        for key in gmw.attrib:
            # strip the namespace from the attribute name
            d[key.split("}", 1)[1]] = gmw.attrib[key]
        for child in gmw:
            key = util._get_tag(child)
            if len(child) == 0:
                # leaf element: store its text
                d[key] = child.text
            elif key == "standardizedLocation":
                d[key] = FileOrUrl._read_geometry(child)
            elif key == "deliveredLocation":
                d[key] = FileOrUrl._read_geometry(child)
            elif (
                key.endswith("Date")
                or key.endswith("Overview")
                or key in ["startTime", "endTime"]
            ):
                # single wrapped value: take the text of the first child
                d[key] = child[0].text
            elif key in ["diameterRange", "screenPositionRange"]:
                # flatten the range into its min/max children
                for grandchild in child:
                    key = util._get_tag(grandchild)
                    d[key] = grandchild.text
            elif key == "licence":
                for grandchild in child:
                    key2 = grandchild.tag.split("}", 1)[1]
                    for greatgrandchild in grandchild:
                        key3 = greatgrandchild.tag.split("}", 1)[1]
                        if key3 == "identificationLicence":
                            d[key] = greatgrandchild.text
                        else:
                            logger.warning(f"Unknown key: {key2}")
            elif key == "realisedInstallation":
                for grandchild in child:
                    key2 = grandchild.tag.split("}", 1)[1]
                    for greatgrandchild in grandchild:
                        key3 = greatgrandchild.tag.split("}", 1)[1]
                        if key3 == "installationFunction":
                            d[key] = greatgrandchild.text
                        else:
                            logger.warning(f"Unknown key: {key2}")

            else:
                logger.warning(f"Unknown key: {key}")
        data.append(d)

    gdf = objects_to_gdf(data)
    # data read from a zipfile may cover a larger area; clip it to the extent
    if zipfile is not None and extent is not None and isinstance(gdf, gpd.GeoDataFrame):
        if isinstance(extent, (Polygon, MultiPolygon)):
            gdf = gdf[gdf.intersects(extent)]
        else:
            gdf = gdf.cx[extent[0] : extent[1], extent[2] : extent[3]]
    return gdf

250 

251 

def _get_data_in_extent(
    bro_cl,
    extent=None,
    timeout=5,
    silent=False,
    to_path=None,
    to_zip=None,
    redownload=False,
    geometry=None,
    to_gdf=True,
    index="broId",
    continue_on_error=False,
    progress_callback=None,
):
    """
    Retrieve data within a specified extent for a certain bro-class.

    Parameters
    ----------
    bro_cl : class
        brodata class.
    extent : str or object, optional
        Spatial extent to query. If a string, interpreted as a zip file path.
    timeout : int, default=5
        Timeout in seconds for data retrieval requests.
    silent : bool, default=False
        If True, disables progress bars and reduces logging output.
    to_path : str, optional
        Directory path to save downloaded files.
    to_zip : str, optional
        Path to a zip file to read from or save data to.
    redownload : bool, default=False
        If True, forces redownload of data even if files exist.
    geometry : str or object, optional
        Geometry specification for the output GeoDataFrame.
    to_gdf : bool, default=True
        If True, converts the output to a GeoDataFrame.
    index : str, default="broId"
        Column name to use as index in the output GeoDataFrame.
    continue_on_error : bool, default=False
        If True, continues processing other items if an error occurs.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.

    Returns
    -------
    gdf : GeoDataFrame
        GeoDataFrame containing the retrieved data objects, indexed by the specified
        column.

    Raises
    ------
    Exception
        If invalid arguments are provided or data retrieval fails (unless
        continue_on_error is True).

    Notes
    -----
    - If `extent` is a string, it is treated as a zip file path and `to_zip` must not
      be provided.
    - Data can be read from or saved to zip archives or directories, depending on the
      provided arguments.
    - Progress is displayed unless `silent` is True.
    """
    # a string extent means: read everything from this zip-file
    if isinstance(extent, str):
        if to_zip is not None:
            raise (Exception("When extent is a string, do not supply to_zip"))
        to_zip = extent
        extent = None
        redownload = False
    zipfile = None
    # _files collects the names of downloaded files, to be zipped afterwards
    _files = None
    if to_zip is not None:
        if not redownload and os.path.isfile(to_zip):
            logger.info(f"Reading data from {to_zip}")
            zipfile = ZipFile(to_zip)
        else:
            if to_path is None:
                to_path = os.path.splitext(to_zip)[0]
            # remember whether to_path existed, so it can be removed after zipping
            remove_path_again = not os.path.isdir(to_path)
            _files = []

    # get gwm characteristics
    logger.info(f"Getting characteristics in extent: {extent}")
    to_file = None
    if zipfile is not None or to_path is not None:
        to_file = "characteristics.xml"
        if zipfile is None:
            to_file = os.path.join(to_path, to_file)
        if _files is not None:
            _files.append(to_file)
        if to_path is not None and not os.path.isdir(to_path):
            os.makedirs(to_path)

    char = _get_characteristics(
        bro_cl, extent=extent, to_file=to_file, redownload=redownload, zipfile=zipfile
    )

    # download (or read) the full objects for all bro-ids in the characteristics
    data = _get_data_for_bro_ids(
        bro_cl,
        char.index,
        timeout=timeout,
        silent=silent,
        to_path=to_path,
        zipfile=zipfile,
        redownload=redownload,
        continue_on_error=continue_on_error,
        progress_callback=progress_callback,
        _files=_files,
    )
    if zipfile is not None:
        zipfile.close()
    if zipfile is None and to_zip is not None:
        # bundle the downloaded files into to_zip (and optionally remove to_path)
        util._save_data_to_zip(to_zip, _files, remove_path_again, to_path)

    gdf = objects_to_gdf(data, geometry, to_gdf, index)

    return gdf

371 

372 

def _get_data_for_bro_ids(
    bro_cl,
    bro_ids,
    timeout=5,
    silent=False,
    to_path=None,
    zipfile=None,
    redownload=False,
    continue_on_error=False,
    desc=None,
    progress_callback=None,
    _files=None,
):
    """
    Retrieve data for a list of specified bro_ids for a certain bro-class.

    Parameters
    ----------
    bro_cl : class
        brodata class.
    bro_ids : str or list of str
        The bro-id(s) to download data for. A single string is treated as a
        list with one id.
    timeout : int, default=5
        Timeout in seconds for data retrieval requests.
    silent : bool, default=False
        If True, disables progress bars and reduces logging output.
    to_path : str, optional
        Directory path to save downloaded files.
    zipfile : zipfile.ZipFile, optional
        A zipfile-object. When not None, zipfile is used to read previously downloaded
        data from. The default is None.
    redownload : bool, default=False
        If True, forces redownload of data even if files exist.
    continue_on_error : bool, default=False
        If True, continues processing other items if an error occurs.
    desc : str, optional
        Description for the progress bar. The default is None.
    progress_callback : function, optional
        A callback function that takes two arguments (current, total) to report
        progress. If None, no progress reporting is done. Defaults to None.
    _files : list, optional
        When not None, the names of files written to to_path are appended to
        this list. The default is None.

    Returns
    -------
    data : dictionary
        A dictionary with the bro-ids as keys, and the data as values.

    Raises
    ------
    Exception
        If invalid arguments are provided or data retrieval fails (unless
        continue_on_error is True).
    """
    if isinstance(bro_ids, str):
        bro_ids = [bro_ids]
    n_ids = len(bro_ids)
    result = {}
    iterator = util.tqdm(enumerate(bro_ids), total=n_ids, disable=silent, desc=desc)
    for count, bro_id in iterator:
        if progress_callback is not None:
            progress_callback(count, n_ids)

        # previously downloaded data inside a zip-file: read it from there
        if zipfile is not None:
            result[bro_id] = bro_cl(f"{bro_id}.xml", zipfile=zipfile)
            continue

        fname = None
        if to_path is not None:
            fname = os.path.join(to_path, f"{bro_id}.xml")
            if _files is not None:
                _files.append(fname)
            if not redownload and os.path.isfile(fname):
                # reuse the previously downloaded file
                result[bro_id] = bro_cl(fname)
                continue

        # download from the BRO REST-service
        if continue_on_error:
            try:
                result[bro_id] = bro_cl.from_bro_id(
                    bro_id, to_file=fname, timeout=timeout
                )
            except Exception as e:
                logger.error("Error retrieving %s: %s", bro_id, e)
        else:
            result[bro_id] = bro_cl.from_bro_id(bro_id, to_file=fname, timeout=timeout)

    return result

460 

461 

def objects_to_gdf(
    data,
    geometry=None,
    to_gdf=True,
    index="broId",
    from_crs=None,
    to_crs=28992,
):
    """
    Convert a list or a dictionary of objects to a (Geo)DataFrame.

    Parameters
    ----------
    data : list of dict or dict
        Either a list of dictionaries, or a dictionary of objects that support
        a to_dict method.
    geometry : str, optional
        The column to use as the geometry. When None, "deliveredLocation" or
        "standardizedLocation" is used when present. The default is None.
    to_gdf : bool, optional
        When False, data is returned unaltered. The default is True.
    index : str or iterable of str, optional
        The column (or columns) to use as the index. The default is "broId".
    from_crs : int, optional
        The coordinate reference system of the geometry column. When None, it
        is inferred from the chosen geometry column. The default is None.
    to_crs : int, optional
        The coordinate reference system of the returned GeoDataFrame. The
        default is 28992.

    Returns
    -------
    pd.DataFrame or gpd.GeoDataFrame
        A GeoDataFrame when a geometry column was found or specified, a plain
        DataFrame otherwise (or the unaltered input when to_gdf is False).
    """
    if not to_gdf:
        return data

    if isinstance(data, list):
        records = data
    else:
        records = [data[key].to_dict() for key in data]
    df = pd.DataFrame(records)

    if index is not None and not df.empty:
        if isinstance(index, str):
            if index in df.columns:
                df = df.set_index(index)
        elif all(col in df.columns for col in index):
            # index is an iterable (list) of columns, forming a MultiIndex
            df = df.set_index(index)

    if geometry is None:
        # infer the geometry column and its default crs
        for column, default_crs in (
            ("deliveredLocation", 28992),
            ("standardizedLocation", 4258),
        ):
            if column in df:
                geometry = column
                if from_crs is None:
                    from_crs = default_crs
                break
        else:
            # no geometry column present: return a plain DataFrame
            return df

    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs=from_crs)
    if to_crs is not None and from_crs is not None and to_crs != from_crs:
        gdf = gdf.to_crs(to_crs)
    return gdf

499 

500 

501class FileOrUrl(ABC): 

502 """ 

503 A class for parsing and handling XML data from files, URLs, or zipped files. 

504 

505 Supports fetching XML data from local files or remote URLs. It also handles 

506 rejection checks and extracts data into object attributes. Data is parsed 

507 recursively and can be converted to a dictionary. 

508 

509 Attributes: 

510 Instance variables are dynamically set based on the XML content. 

511 

512 Methods: 

513 __init__(url_or_file, zipfile=None, timeout=5, to_file=None, **kwargs): 

514 Parses XML from a URL, file, or zipped file, and initializes the object. 

515 

516 from_bro_id(bro_id, **kwargs): 

517 Fetches XML data from a REST service based on a given 'bro_id'. 

518 

519 to_dict(): 

520 Converts instance attributes to a dictionary, excluding methods and 

521 private attributes. 

522 """ 

523 

524 def __init__( 

525 self, 

526 url_or_file, 

527 zipfile=None, 

528 timeout=5, 

529 to_file=None, 

530 redownload=True, 

531 max_retries=2, 

532 **kwargs, 

533 ): 

534 # CSV 

535 if url_or_file.endswith(".csv"): 

536 if zipfile is not None: 

537 self._read_csv(StringIO(zipfile.read(url_or_file)), **kwargs) 

538 else: 

539 self._read_csv(url_or_file, **kwargs) 

540 # XML or URL 

541 else: 

542 if zipfile is not None: 

543 root = ElementTree.fromstring(zipfile.read(url_or_file)) 

544 elif url_or_file.startswith("http"): 

545 if redownload or to_file is None or not os.path.isfile(to_file): 

546 params = {} 

547 if "tmin" in kwargs and kwargs["tmin"] is not None: 

548 tmin = kwargs.pop("tmin") 

549 tmin = pd.to_datetime(tmin).strftime("%Y-%m-%d") 

550 params["observationPeriodBeginDate"] = tmin 

551 if "tmax" in kwargs and kwargs["tmax"] is not None: 

552 tmax = kwargs.pop("tmax") 

553 tmax = pd.to_datetime(tmax).strftime("%Y-%m-%d") 

554 params["observationPeriodEndDate"] = tmax 

555 if max_retries > 1: 

556 adapter = requests.adapters.HTTPAdapter(max_retries=max_retries) 

557 session = requests.Session() 

558 session.mount("https://", adapter) 

559 req = session.get(url_or_file, params=params, timeout=timeout) 

560 else: 

561 req = requests.get(url_or_file, params=params, timeout=timeout) 

562 if not req.ok: 

563 if req.reason == "Bad Request": 

564 root = ElementTree.fromstring(req.text) 

565 FileOrUrl._check_for_rejection(root) 

566 raise Exception(f"Retrieving data from {url_or_file} failed") 

567 if to_file is not None: 

568 with open(to_file, "w") as f: 

569 f.write(req.text) 

570 root = ElementTree.fromstring(req.text) 

571 FileOrUrl._check_for_rejection(root) 

572 else: 

573 tree = ElementTree.parse(to_file) 

574 root = tree.getroot() 

575 else: 

576 tree = ElementTree.parse(url_or_file) 

577 root = tree.getroot() 

578 

579 self._read_contents(root, **kwargs) 

580 

581 def __repr__(self): 

582 # retrieve properties if they exist 

583 propdict = {"broId": "broId"} 

584 props = {} 

585 for key in propdict: 

586 if hasattr(self, key): 

587 props[propdict[key]] = getattr(self, key) 

588 if hasattr(self, "deliveredLocation"): 

589 if isinstance(self.deliveredLocation, Point): 

590 props["x"] = self.deliveredLocation.x 

591 props["y"] = self.deliveredLocation.y 

592 name = util._format_repr(self, props) 

593 return name 

594 

    @abstractmethod
    def _read_contents(self, tree, **kwargs):
        """Each subclass must overload _read_contents to parse XML result."""

598 

599 def _read_csv(self, *args, **kwargs): 

600 raise NotImplementedError( 

601 f"Class {self.__class__.__name__} does not support reading from CSV files." 

602 ) 

603 

604 @classmethod 

605 def from_bro_id(cls, bro_id, **kwargs): 

606 if not hasattr(cls, "_rest_url"): 

607 raise (NotImplementedError(f"No rest-service defined for {cls.__name__}")) 

608 

609 return cls(f"{cls._rest_url}/objects/{bro_id}", **kwargs) 

610 

611 def to_dict(self): 

612 d = {} 

613 for attrib in dir(self): 

614 if attrib.startswith("_"): 

615 continue 

616 value = getattr(self, attrib) 

617 if type(value) is types.MethodType: 

618 continue 

619 d[attrib] = value 

620 return d 

621 

622 @staticmethod 

623 def _check_for_rejection(tree): 

624 ns = {"brocom": "http://www.broservices.nl/xsd/brocommon/3.0"} 

625 response_type = tree.find("brocom:responseType", ns) 

626 if response_type.text == "rejection": 

627 criterionError = tree.find("brocom:criterionError", ns) 

628 if criterionError is None: 

629 msg = tree.find("brocom:rejectionReason", ns).text 

630 else: 

631 msg = criterionError.find("brocom:specification", ns).text 

632 raise (ValueError(msg)) 

633 

    @staticmethod
    def _get_tag(node):
        """Return the tag of node with its namespace stripped (delegates to util)."""
        return util._get_tag(node)

637 

638 def _get_main_object(self, tree, object_name=None, ns=None): 

639 if object_name is None: 

640 object_name = self._object_name 

641 if ns is None: 

642 ns = {"xmlns": self._xmlns} 

643 if isinstance(object_name, list): 

644 for name in object_name: 

645 objects = tree.findall(f".//xmlns:{name}", ns) 

646 if objects: 

647 break 

648 else: 

649 objects = tree.findall(f".//xmlns:{object_name}", ns) 

650 if len(objects) > 1: 

651 raise (Exception(f"Only one {object_name} supported")) 

652 elif len(objects) == 0: 

653 raise (Exception(f"No {object_name} found")) 

654 return objects[0] 

655 

656 def _warn_unknown_tag(self, tag, parent=None): 

657 class_name = self.__class__.__name__ 

658 bro_id = getattr(self, "broId", "") 

659 util._warn_unknown_tag(tag, parent, class_name, bro_id) 

660 

661 def _raise_assumed_single(self, key): 

662 raise ValueError( 

663 f"Assumed there is only one {key} in {self.__class__.__name__} {getattr(self, 'broId', '')}" 

664 ) 

665 

666 def _check_single_child_with_tag(self, node, tag): 

667 return len(node) == 1 and self._get_tag(node[0]) == tag 

668 

669 def _read_children_of_children(self, node, d=None, to_float=None, to_int=None): 

670 if to_float is not None and isinstance(to_float, str): 

671 to_float = [to_float] 

672 if to_int is not None and isinstance(to_int, str): 

673 to_int = [to_int] 

674 if len(node) == 0: 

675 key = node.tag.split("}", 1)[1] 

676 if d is None: 

677 setattr(self, key, FileOrUrl._parse_text(node, key, to_float, to_int)) 

678 else: 

679 d[key] = FileOrUrl._parse_text(node, key, to_float, to_int) 

680 else: 

681 for child in node: 

682 self._read_children_of_children( 

683 child, d=d, to_float=to_float, to_int=to_int 

684 ) 

685 

686 @staticmethod 

687 def _parse_text(node, key, to_float=None, to_int=None): 

688 if to_float is not None and key in to_float: 

689 return FileOrUrl._parse_float(node) 

690 if to_int is not None and key in to_int: 

691 return int(node.text) 

692 return node.text 

693 

694 @staticmethod 

695 def _parse_float(node): 

696 if node.text is None: 

697 return np.nan 

698 return float(node.text) 

699 

700 def _read_delivered_location(self, node): 

701 for child in node: 

702 key = self._get_tag(child) 

703 if key == "location": 

704 setattr(self, "deliveredLocation", self._read_geometry(child)) 

705 elif key == "horizontalPositioningDate": 

706 setattr(self, key, self._read_date(child)) 

707 elif key == "horizontalPositioningMethod": 

708 setattr(self, key, child.text) 

709 elif key == "horizontalPositioningOperator": 

710 setattr(self, key, self._read_operator(child)) 

711 else: 

712 self._warn_unknown_tag(key) 

713 

714 def _read_operator(self, node): 

715 d = {} 

716 for child in node: 

717 key = self._get_tag(child) 

718 if key in [ 

719 "chamberOfCommerceNumber", 

720 "europeanCompanyRegistrationNumber", 

721 ]: 

722 d[key] = child.text 

723 else: 

724 self._warn_unknown_tag(key) 

725 return d 

726 

727 def _read_standardized_location(self, node): 

728 for child in node: 

729 key = self._get_tag(child) 

730 if key == "location": 

731 setattr(self, "standardizedLocation", self._read_geometry(child)) 

732 elif key == "coordinateTransformation": 

733 setattr(self, key, child.text) 

734 else: 

735 self._warn_unknown_tag(key) 

736 

737 def _read_delivered_vertical_position(self, node, d=None): 

738 for child in node: 

739 key = self._get_tag(child) 

740 if key == "verticalPositioningDate": 

741 value = self._read_date(child) 

742 elif key == "offset": 

743 if child.text is None: 

744 value = np.nan 

745 else: 

746 value = float(child.text) 

747 elif key == "verticalPositioningOperator": 

748 value = self._read_operator(child) 

749 else: 

750 value = child.text 

751 

752 if d is None: 

753 setattr(self, key, value) 

754 else: 

755 d[key] = value 

756 

757 def _read_lifespan(self, node, d=None): 

758 for child in node: 

759 key = self._get_tag(child) 

760 if key in ["startDate", "startTime", "endTime"]: 

761 if d is None: 

762 setattr(self, key, self._read_date(child)) 

763 else: 

764 d[key] = self._read_date(child) 

765 else: 

766 self._warn_unknown_tag(key) 

767 

768 def _read_validity_period(self, node, d=None): 

769 for child in node: 

770 key = self._get_tag(child) 

771 if key == "startValidity": 

772 if d is None: 

773 setattr(self, key, self._read_date(child)) 

774 else: 

775 d[key] = self._read_date(child) 

776 elif key == "endValidity": 

777 if d is None: 

778 setattr(self, key, self._read_date(child)) 

779 else: 

780 d[key] = self._read_date(child) 

781 else: 

782 self._warn_unknown_tag(key) 

783 

    @staticmethod
    def _read_geometry(node):
        """Parse a GML geometry element into a shapely geometry.

        A single gml:pos child becomes a Point; any other child is parsed by
        gml.parse_geometry. When the srsName denotes EPSG:4258 the coordinate
        order is swapped, since that crs stores latitude first.
        """
        assert len(node) == 1
        tag = node[0].tag.split("}")[-1]
        if tag == "pos":
            x, y = tuple(map(float, node[0].text.strip().split()))
            # NOTE(review): this branch checks srsName on node, the branch
            # below checks it on node[0] — confirm both are intended
            if FileOrUrl._is_epsg_4258(node):
                x, y = y, x
            return Point(x, y)
        geometry = gml.parse_geometry(node[0])
        if FileOrUrl._is_epsg_4258(node[0]):
            # swap (lat, lon) to (x, y) = (lon, lat)
            geometry = shapely.ops.transform(lambda x, y: (y, x), geometry)
        return geometry

797 

798 @staticmethod 

799 def _is_epsg_4258(node): 

800 srsName = "urn:ogc:def:crs:EPSG::4258" 

801 return "srsName" in node.attrib and node.attrib["srsName"] == srsName 

802 

803 @staticmethod 

804 def _read_date(node): 

805 ns = {"brocom": "http://www.broservices.nl/xsd/brocommon/3.0"} 

806 date = node.find("brocom:date", ns) 

807 if date is None: 

808 date = node.find("brocom:yearMonth", ns) 

809 if date is None: 

810 date = node.find("brocom:year", ns) 

811 if date is None: 

812 return pd.NaT 

813 return pd.to_datetime(date.text) 

814 

815 @staticmethod 

816 def _read_time_instant(node): 

817 ns = {"gml": "http://www.opengis.net/gml/3.2"} 

818 time_instant = node.find("gml:TimeInstant", ns) 

819 time_position = time_instant.find("gml:timePosition", ns) 

820 return pd.to_datetime(time_position.text) 

821 

    def _read_data_array(self, node):
        """Parse a swe DataArray element into a pandas DataFrame.

        Returns
        -------
        pd.DataFrame or None
            The parsed values, or None when no values element is present.
        """
        values = None
        for child in node:
            key = self._get_tag(child)
            if key == "encoding":
                # remember the separators used by the values element
                ns = {"swe": "http://www.opengis.net/swe/2.0"}
                text_encoding = child.find("swe:TextEncoding", ns)
                encoding = text_encoding.attrib.copy()
            elif key == "elementCount":
                pass
            elif key == "elementType":
                pass
            elif key == "values":
                # NOTE(review): assumes an encoding element always precedes the
                # values element; otherwise `encoding` is unbound here — confirm
                # the schema guarantees this order
                values = pd.read_csv(
                    StringIO(child.text),
                    header=None,
                    decimal=encoding["decimalSeparator"],
                    sep=encoding["tokenSeparator"],
                    lineterminator=encoding["blockSeparator"],
                    na_values=-999999,
                )
            else:
                self._warn_unknown_tag(key)
        return values

846 

    def _read_descriptive_borehole_log(self, node):
        """Parse a descriptive borehole log element into a dictionary.

        Returns
        -------
        dict
            Scalar properties of the log; the "layer" key (when present) holds
            a pd.DataFrame with one row per described layer.
        """
        d = {}
        for child in node:
            key = self._get_tag(child)
            if key in [
                "descriptionQuality",
                "describedSamplesQuality",
                "continuouslySampled",
                "descriptionLocation",
                "describedMaterial",
                "sampleMoistness",
                "boreholeLogChecked",
            ]:
                d[key] = child.text
            elif key == "layer":
                if key not in d:
                    d[key] = []
                # unwrap a single nested Layer element
                if self._check_single_child_with_tag(child, "Layer"):
                    child = child[0]
                layer = {}
                for grandchild in child:
                    key2 = self._get_tag(grandchild)
                    if key2 in ["upperBoundary", "lowerBoundary"]:
                        layer[key2] = self._parse_float(grandchild)
                    elif key2 in [
                        "upperBoundaryDetermination",
                        "lowerBoundaryDetermination",
                        "anthropogenic",
                        "activityType",
                        "specialMaterial",
                        "slant",
                        "internalStructureIntact",
                        "bedded",
                        "compositeLayer",
                        "bedding",
                        "rooted",
                        "identification",
                    ]:
                        layer[key2] = grandchild.text
                    elif key2 == "soil":
                        self._read_soil(grandchild, layer)
                    elif key2 == "rock":
                        self._read_rock(grandchild, layer)
                    elif key2 == "soilType":
                        # NOTE(review): key2 is reused as the loop variable here,
                        # and the particularConstituent branch iterates grandchild
                        # again (not greatgrandchild) — looks suspicious; confirm
                        # against the soilType schema
                        for greatgrandchild in grandchild:
                            key2 = self._get_tag(greatgrandchild)
                            if key2 in ["soilName", "sandMedianClass"]:
                                layer[key2] = greatgrandchild.text
                            elif key2 == "particularConstituent":
                                for greatgrandchild in grandchild:
                                    key2 = self._get_tag(greatgrandchild)
                                    if key2 in ["ConstituentType"]:
                                        layer[key2] = greatgrandchild.text
                            else:
                                self._warn_unknown_tag(key2)
                d[key].append(layer)
            else:
                self._warn_unknown_tag(key)
        if "layer" in d:
            # convert the list of layer-dictionaries to a DataFrame
            d["layer"] = pd.DataFrame(d["layer"])
        return d

908 

909 def _read_soil(self, node, d): 

910 for child in node: 

911 key = self._get_tag(child) 

912 if key in [ 

913 "geotechnicalSoilName", 

914 "soilNameNEN5104", 

915 "gravelContentClassNEN5104", 

916 "organicMatterContentClassNEN5104", 

917 "colour", 

918 "mottled", 

919 "interbedding", 

920 "carbonateContentClass", 

921 "organicMatterContentClass", 

922 "crossBedding", 

923 "gradedBedding", 

924 "mixed", 

925 "mixingType", 

926 "gravelMedianClass", 

927 "fineGravelContentClass", 

928 "mediumCoarseGravelContentClass", 

929 "veryCoarseGravelContentClass", 

930 "sandMedianClass", 

931 "sandSortingNEN5104", 

932 "peatType", 

933 "organicSoilTexture", 

934 "fineSoilConsistency", 

935 "organicSoilConsistency", 

936 "peatTensileStrength", 

937 "geotechnicalDepositionalCharacteristic", 

938 "depositionalAge", 

939 "classificationLoamBased", 

940 "pedologicalSoilName", 

941 "structureType", 

942 "estimatedDensity", 

943 "ripeningClass", 

944 "vertic", 

945 "containsShellMatter", 

946 "containsGravel", 

947 "gravelContentClass", 

948 "chunk", 

949 "moistness", 

950 ]: 

951 d[key] = child.text 

952 elif key in ["estimatedOrganicMatterContent", "estimatedClayContent"]: 

953 d[key] = float(child.text) 

954 elif key in ["tertiaryConstituent", "dispersedInhomogeneity"]: 

955 if key not in d: 

956 d[key] = [] 

957 d[key].append(child.text) 

958 elif key == "grainshape": 

959 for grandchild in child: 

960 key = self._get_tag(grandchild) 

961 if key in ["sizeFraction", "angularity", "sphericity"]: 

962 d[key] = grandchild.text 

963 else: 

964 self._warn_unknown_tag(key) 

965 elif key == "incompleteFractionSpecification": 

966 for grandchild in child: 

967 key = self._get_tag(grandchild) 

968 if key in ["estimatedOrganicMatterContent", "estimatedClayContent"]: 

969 d[key] = float(grandchild.text) 

970 else: 

971 self._warn_unknown_tag(key) 

972 elif key == "stain": 

973 for grandchild in child: 

974 key = self._get_tag(grandchild) 

975 if key in ["stainColour", "mottlingDensity", "evenlyMottled"]: 

976 d[key] = grandchild.text 

977 else: 

978 self._warn_unknown_tag(key) 

979 elif key == "soilAggregate": 

980 for grandchild in child: 

981 key = self._get_tag(grandchild) 

982 if key in [ 

983 "aggregateShape", 

984 "angularity", 

985 "roughness", 

986 "aggregateLengthClass", 

987 "poreAbundanceClass", 

988 "horizontallyAligned", 

989 "disintegrating", 

990 ]: 

991 d[key] = grandchild.text 

992 else: 

993 self._warn_unknown_tag(key) 

994 elif key == "fractionDistribution": 

995 for grandchild in child: 

996 key = self._get_tag(grandchild) 

997 if key in [ 

998 "estimatedGravelContent", 

999 "estimatedShellMatterContent", 

1000 "estimatedOrganicMatterContent", 

1001 "estimatedFineFractionContent", 

1002 ]: 

1003 d[key] = float(grandchild.text) 

1004 elif key == "fineFractionDistribution": 

1005 for greatgrandchild in grandchild: 

1006 key = self._get_tag(greatgrandchild) 

1007 if key in [ 

1008 "estimatedClayContent", 

1009 "estimatedSiltContent", 

1010 "estimatedSandContent", 

1011 ]: 

1012 d[key] = float(greatgrandchild.text) 

1013 else: 

1014 self._warn_unknown_tag(key) 

1015 else: 

1016 self._warn_unknown_tag(key) 

1017 elif key == "munsellColour": 

1018 for grandchild in child: 

1019 key = self._get_tag(grandchild) 

1020 if key in ["munsellHue", "munsellValue", "munsellChroma"]: 

1021 d[key] = grandchild.text 

1022 else: 

1023 self._warn_unknown_tag(key) 

1024 elif key == "sandFraction": 

1025 for grandchild in child: 

1026 key = self._get_tag(grandchild) 

1027 if key in ["sandMedianClass", "sandSorting"]: 

1028 d[key] = grandchild.text 

1029 else: 

1030 self._warn_unknown_tag(key) 

1031 else: 

1032 self._warn_unknown_tag(key) 

1033 

1034 def _read_rock(self, node, d): 

1035 for child in node: 

1036 key = self._get_tag(child) 

1037 if key in [ 

1038 "rockType", 

1039 "cementType", 

1040 "colour", 

1041 "carbonateContentClass", 

1042 "crossBedding", 

1043 "gradedBedding", 

1044 "voidsPresent", 

1045 "voidDistribution", 

1046 "stability", 

1047 "strengthClass", 

1048 "weathered", 

1049 ]: 

1050 d[key] = child.text 

1051 elif key in ["tertiaryRockConstituent", "dispersedInhomogeneity"]: 

1052 if key not in d: 

1053 d[key] = [] 

1054 d[key].append(child.text) 

1055 elif key == "weatheringDegree": 

1056 for grandchild in child: 

1057 key = self._get_tag(grandchild) 

1058 if key in ["discolouration", "disintegration", "decomposition"]: 

1059 d[key] = grandchild.text 

1060 else: 

1061 self._warn_unknown_tag(key) 

1062 else: 

1063 self._warn_unknown_tag(key) 

1064 

1065 

def get_bronhouders(index="kvk", timeout=5, **kwargs):
    """
    Get the name, kvk-number and the identifier of bronhouders (data owners).

    Parameters
    ----------
    index : string, optional
        The column to set as the index of the resulting DataFrame. The default is "kvk".
    timeout : int or float, optional
        A number indicating how many seconds to wait for the client to make a connection
        and/or send a response. The default is 5.
    **kwargs : dict
        Kwargs are passed onto pandas.read_json().

    Returns
    -------
    df : pd.DataFrame
        A Pandas DataFrame, with one row per bronhouder.

    """
    url = "https://bromonitor.nl/api/rapporten/bronhouders"
    # Download explicitly via requests so a timeout applies and failures raise a
    # clear error, consistent with get_brondocumenten_per_bronhouder.
    r = requests.get(url, timeout=timeout)
    if not r.ok:
        raise (Exception("Download of bronhouders failed"))
    df = pd.read_json(StringIO(r.text), **kwargs)
    if index is not None:
        df = df.set_index(index)
    return df

1088 

1089 

def get_brondocumenten_per_bronhouder(index=("kvk", "type"), timeout=5, **kwargs):
    """
    Get the number of documents per bronhouder (data owner).

    Parameters
    ----------
    index : str, tuple or list, optional
        The column(s) to set as the index of the resulting DataFrame. The default is
        "kvk" and "type".
    timeout : int or float, optional
        A number indicating how many seconds to wait for the client to make a connection
        and/or send a response. The default is 5.
    **kwargs : dict
        Kwargs are passed onto pandas.DataFrame().

    Returns
    -------
    df : pd.DataFrame
        A Pandas DataFrame, with one row per combination of bronhouder and data-type.

    """
    url = "https://bromonitor.nl/api/rapporten/brondocumenten-per-bronhouder"
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        raise (Exception("Download of brondocumenten per bronhouder failed"))
    df = pd.DataFrame(response.json()["data"], **kwargs)
    # The "key" column holds dicts (e.g. kvk and type); expand it into separate
    # columns in front of the remaining data.
    if "key" in df.columns:
        expanded = pd.DataFrame(list(df["key"]))
        df = pd.concat((expanded, df.drop(columns="key")), axis=1)
    if index is None:
        return df
    # set_index does not accept a tuple of column names, so convert to a list
    return df.set_index(list(index) if isinstance(index, tuple) else index)

1123 

1124 

1125def get_kvk_df(fn_bronhouder_kvk=None): 

1126 """ 

1127 Read manually saved table of KVK and Organisatienaam to DataFrame. 

1128 

1129 from https://basisregistratieondergrond.nl/service-contact/formulieren/aangemeld-bro/ 

1130 :param fn_bronhouder_kvk: str, filename of the file with bronhouder and kvk numbers 

1131 :return: pandas DataFrame with kvk as index and column 'Organisatienaam' and 'Bronhouder' 

1132 """ 

1133 if fn_bronhouder_kvk is None: 

1134 fn_bronhouder_kvk = os.path.join( 

1135 os.path.dirname(__file__), "data", "bronhouder_kvk.txt" 

1136 ) 

1137 

1138 df_bron_kvk = pd.read_csv( 

1139 fn_bronhouder_kvk, 

1140 sep=";", # is a dummy value, data will be split later on the last | sign 

1141 dtype=str, 

1142 header=None, 

1143 names=["all_data"], 

1144 skipinitialspace=True, 

1145 comment="#", 

1146 ) 

1147 

1148 # split column all_data into bronhouder and kvk, using last | sign; both as string type 

1149 # mind that index has string type, as that is format provided in brodata downloads 

1150 df_bron_kvk[["Organisatienaam", "KVK-nummer"]] = ( 

1151 df_bron_kvk["all_data"].str.rsplit("|", n=1, expand=True).astype(str) 

1152 ) 

1153 df_bron_kvk = df_bron_kvk.drop(columns=["all_data"]) 

1154 

1155 # add column Bronhouder, value is True when (​B) in kvk 

1156 df_bron_kvk["Bronhouder"] = False 

1157 

1158 bronhouder_pattern = r"[(​B)|(B)]" 

1159 df_bron_kvk.loc[ 

1160 df_bron_kvk["KVK-nummer"].str.contains(bronhouder_pattern, regex=True), 

1161 "Bronhouder", 

1162 ] = True 

1163 # clean up kvk 

1164 df_bron_kvk["KVK-nummer"] = ( 

1165 df_bron_kvk["KVK-nummer"] 

1166 .str.replace(bronhouder_pattern, "", regex=True) 

1167 .str.strip() 

1168 ) 

1169 

1170 # remove leading and trailing whitespace from all columns 

1171 df_bron_kvk = df_bron_kvk.map(lambda x: x.strip() if isinstance(x, str) else x) 

1172 

1173 # make kvk index 

1174 df_bron_kvk.set_index("KVK-nummer", inplace=True) 

1175 

1176 return df_bron_kvk