Coverage for brodata / gld.py: 72%

265 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-20 14:37 +0000

1import csv 

2import logging 

3import time 

4from functools import partial 

5from io import StringIO 

6 

7import numpy as np 

8import pandas as pd 

9import requests 

10 

11from . import bro 

12 

13logger = logging.getLogger(__name__) 

14 

15 

def get_objects_as_csv(
    bro_id,
    rapportagetype="volledig",
    observatietype=None,
    to_file=None,
    return_contents=True,
    **kwargs,
):
    """
    Download a complete Groundwater Level Dossier (GLD) in CSV (RFC 4180) format.

    Parameters
    ----------
    bro_id : str
        BRO-ID of the Groundwater Level Dossier, or a full url (as used by the
        gm-services). When a full url is given, `rapportagetype` must match the
        choice encoded in the url and `observatietype` is ignored.
    rapportagetype : str, optional
        Type of report. Valid values are:
        - "volledig" : full report
        - "compact" : compact report with readable timestamps
        - "compact_met_timestamps" : compact report with Unix epoch timestamps
        Default is "volledig".
    observatietype : str, optional
        Type of observations. Valid values are:
        - "regulier_beoordeeld" : regular measurement, fully evaluated
        - "regulier_voorlopig" : regular measurement, preliminary evaluation
        - "controle" : control measurement
        - "onbekend" : regular measurement, unknown evaluation
        When None, all observation types are returned. Default is None.
    to_file : str, optional
        When provided, the raw CSV text is written to this path. Default is None.
    return_contents : bool, optional
        When False, return None after the optional write to `to_file` instead of
        parsing the CSV. Default is True.
    **kwargs : additional keyword arguments
        Passed on to `read_gld_csv`.

    Returns
    -------
    pd.DataFrame or None
        The parsed observations, or None when `return_contents` is False or the
        response body is empty.

    Notes
    -----
    Sends a GET request to the Groundwater Level Dossier REST API; the
    `rapportagetype` and `observatietype` parameters filter the result.
    """
    if bro_id.startswith("http"):
        # bro_id already is a full url (as used by the gm-services)
        response = requests.get(bro_id)
    else:
        query = {"rapportagetype": rapportagetype}
        if observatietype is not None:
            query["observatietype"] = observatietype
        response = requests.get(
            f"{GroundwaterLevelDossier._rest_url}/objectsAsCsv/{bro_id}",
            params=query,
        )
    response = _check_request_status(response)

    if to_file is not None:
        with open(to_file, "w") as f:
            f.write(response.text)

    if not return_contents:
        return
    if response.text == "":
        return None
    return read_gld_csv(
        StringIO(response.text),
        bro_id,
        rapportagetype=rapportagetype,
        observatietype=observatietype,
        **kwargs,
    )

103 

104 

def _check_request_status(req):
    """
    Validate a BRO API response, retrying when rate-limited.

    On HTTP 429 the request is retried up to three times with increasing wait
    times (1, 2 and 4 seconds). Any remaining non-success status raises an
    Exception built from the JSON error payload.

    Parameters
    ----------
    req : requests.Response
        The response to validate.

    Returns
    -------
    requests.Response
        The original response, or the response of a successful retry.

    Raises
    ------
    Exception
        When the API keeps rate-limiting, or returns an error status.
    """
    if req.status_code == 429:
        msg = "Too many requests. The BRO API has rate limits in place."
        logger.warning(msg)
        # retry up to three times, doubling the wait between attempts
        for wait_time in (1, 2, 4):
            logger.warning(f"Waiting for {wait_time} seconds before retrying...")
            time.sleep(wait_time)
            req = requests.get(req.url)
            if req.status_code <= 200:
                break
        if req.status_code == 429:
            raise Exception(msg + " Please try again later.")
    if req.status_code > 200:
        # extract the error message from the JSON payload
        payload = req.json()
        if "errors" in payload:
            raise Exception(payload["errors"][0]["message"])
        raise Exception("{}: {}".format(payload["title"], payload["description"]))
    return req

127 

128 

def get_series_as_csv(
    bro_id, filter_on_status_quality_control=None, asISO8601=False, to_file=None
):
    """
    Download a groundwater level series as CSV: timestamps plus measurements.

    Returns a table with one column per observation type (regulier_voorlopig,
    regulier_beoordeeld, controle en onbekend), intended for applications such
    as graphical visualization of groundwater levels.

    Parameters
    ----------
    bro_id : str
        The BRO-ID of the Groundwater Level Dossier.
    filter_on_status_quality_control : str or list of str, optional
        Quality control status(es) to keep: 'onbeslist', 'goedgekeurd' and/or
        'afgekeurd'. The default is None.
    asISO8601 : bool, optional
        When True timestamps are requested in ISO8601 format, otherwise in Unix
        epoch milliseconds. The default is False.
    to_file : str, optional
        When provided, the raw CSV text is written to this path. The default is
        None.

    Returns
    -------
    pd.DataFrame or None
        Time series of measurements indexed by timestamp, or None when the
        response body is empty.
    """
    params = {}
    if filter_on_status_quality_control is not None:
        statuses = filter_on_status_quality_control
        if not isinstance(statuses, str):
            # the API expects a single comma-separated string
            statuses = ",".join(statuses)
        params["filterOnStatusQualityControl"] = statuses
    if asISO8601:
        params["asISO8601"] = ""
    req = requests.get(
        f"{GroundwaterLevelDossier._rest_url}/seriesAsCsv/{bro_id}", params=params
    )
    req = _check_request_status(req)

    if to_file is not None:
        with open(to_file, "w") as f:
            f.write(req.text)

    if req.text == "":
        return None
    df = pd.read_csv(StringIO(req.text))
    if "Tijdstip" in df.columns:
        # timestamps arrive either as ISO8601 strings or epoch milliseconds
        if asISO8601:
            df["Tijdstip"] = pd.to_datetime(df["Tijdstip"])
        else:
            df["Tijdstip"] = pd.to_datetime(df["Tijdstip"], unit="ms")
        df = df.set_index("Tijdstip")
    return df

186 

187 

def read_gld_csv(fname, bro_id, rapportagetype, observatietype, **kwargs):
    """
    Read and process a Groundwater Level Dossier (GLD) CSV file.

    This function reads a CSV file containing groundwater level observations,
    processes the data according to the specified report type (`rapportagetype`),
    and returns a DataFrame of the observations. The file is assumed to contain
    at least three columns: time, value, and qualifier. The 'time' column is parsed
    as datetime, and additional processing is applied to the data.

    Parameters
    ----------
    fname : str or StringIO
        The path to (or in-memory buffer of) the CSV file containing the
        groundwater level observations.
    bro_id : str
        The BRO-ID of the Groundwater Level Dossier being processed.
    rapportagetype : str
        The report type. Can be one of:
        - 'volledig': as complete as possible (not supported yet)
        - 'compact': simple format with time and value.
        - 'compact_met_timestamps': format with timestamps for each observation.
    observatietype : str or None
        The observation type the file was requested with. When None, or when
        `rapportagetype` is "volledig", the file is assumed to contain multiple
        observation types, each with its own metadata header, and the "status"
        and "observation_type" columns are read from those headers. Otherwise
        the file is a single table and both columns are filled with fixed
        values derived from `observatietype`.
    **kwargs : additional keyword arguments
        Additional arguments passed to the `process_observations` function.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the processed observations with the following columns:
        - time: The observation time (used as index).
        - value: The observed groundwater level.
        - qualifier: The quality code of the observation.
        - status: The evaluation status of the observation.
        - observation_type: The type of the observation.

    Notes
    -----
    The time column is parsed as a datetime index. If the report type is
    'compact_met_timestamps', the time values are converted from Unix epoch time
    (milliseconds) to a datetime format.
    """
    # column names of the data sections; only the first three are read below
    names = [
        "time",
        "value",
        "qualifier",
        "censored_reason",
        "censoring_limitvalue",
        "interpolation_type",
    ]
    if rapportagetype == "compact":
        parse_dates = ["time"]
    else:
        # epoch timestamps ("compact_met_timestamps") are converted later;
        # "volledig" times are parsed in process_observations
        parse_dates = None
    if observatietype is None or rapportagetype == "volledig":
        # the csv contains multiple observation types, separated by a header with
        # observation-type and status.
        if isinstance(fname, StringIO):
            lines = fname.readlines()
        else:
            with open(fname, "r") as f:
                lines = f.readlines()

        # look for header lines
        headers = []
        if rapportagetype == "volledig":
            # the line with metadata is preceded by a line starting with "observatie ID"
            for i, line in enumerate(lines):
                if line.startswith('"observatie ID",'):
                    headers.append(i + 1)
            header_length = 3
        else:
            # the line with metadata is preceded by an empty line
            # but directly after the header, there can also be empty lines, that we skip
            data_lines = False
            for i, line in enumerate(lines):
                # an "empty" line consists only of commas (a csv row of empty fields)
                only_commas = all(c == "," for c in line.rstrip("\r\n"))
                last_line_was_header = len(headers) > 0 and headers[-1] == i - 1

                if only_commas:
                    if last_line_was_header:
                        data_lines = True
                    else:
                        data_lines = False
                else:
                    if not data_lines:
                        headers.append(i)
            header_length = 2

        dfs = []
        for i, header in enumerate(headers):
            line = lines[header]
            # split string by comma, but ignore commas between quotes
            reader = csv.reader(StringIO(line))
            parts = next(reader)
            observation_type = parts[3]
            status = parts[4]

            # the data of this section runs until the next header (or end of file)
            if i < len(headers) - 1:
                current_lines = lines[header + header_length : headers[i + 1] - 1]
            else:
                current_lines = lines[header + header_length :]
            df = pd.read_csv(
                StringIO("".join(current_lines)),
                names=names,
                index_col="time",
                parse_dates=parse_dates,
                usecols=[0, 1, 2],
            )
            # remove empty indices
            mask = df.index.isna() & df.isna().all(axis=1)
            if mask.any():
                df = df[~mask]
            df["status"] = status
            df["observation_type"] = observation_type
            dfs.append(df)
        if len(dfs) > 0:
            df = pd.concat(dfs)
        else:
            df = _get_empty_observation_df()
    else:
        # single observation type: one plain table, no section headers
        df = pd.read_csv(
            fname,
            names=names,
            index_col="time",
            parse_dates=parse_dates,
            usecols=[0, 1, 2],
        )
        # fill status and observation_type from the requested observatietype
        if observatietype == "regulier_beoordeeld":
            df["status"] = "volledigBeoordeeld"
            df["observation_type"] = "reguliereMeting"
        elif observatietype == "regulier_voorlopig":
            df["status"] = "voorlopig"
            df["observation_type"] = "reguliereMeting"
        elif observatietype == "controle":
            # control measurements carry no evaluation status
            df["status"] = np.nan
            df["observation_type"] = "controleMeting"
        elif observatietype == "onbekend":
            df["status"] = "onbekend"
            df["observation_type"] = "reguliereMeting"
    if rapportagetype == "compact_met_timestamps":
        # times are Unix epoch in milliseconds
        df.index = pd.to_datetime(df.index, unit="ms")
    # remove empty indices
    mask = df.index.isna() & df.isna().all(axis=1)
    if mask.any():
        df = df[~mask]
    df = process_observations(df, bro_id, **kwargs)
    return df

335 

336 

def get_observations_summary(bro_id):
    """
    Fetch a JSON summary of a Groundwater Level Dossier (GLD).

    The summary describes the groundwater level observations in the dossier,
    such as observation ID and start and end dates.

    Parameters
    ----------
    bro_id : str
        The BRO-ID of the Groundwater Level Dossier.

    Raises
    ------
    Exception
        When the API request fails (status code above 200), with the error
        message returned by the API.

    Returns
    -------
    pd.DataFrame
        Summary of the observations, indexed by `observationId` (when present),
        with `startDate` and `endDate` converted to datetime.
    """
    url = "{}/objects/{}/observationsSummary".format(
        GroundwaterLevelDossier._rest_url, bro_id
    )
    req = _check_request_status(requests.get(url))
    df = pd.DataFrame(req.json())
    if "observationId" in df.columns:
        df = df.set_index("observationId")
    # dates are formatted day-first by the API
    for column in ("startDate", "endDate"):
        if column in df.columns:
            df[column] = pd.to_datetime(df[column], dayfirst=True)
    return df

380 

381 

class GroundwaterLevelDossier(bro.FileOrUrl):
    """
    Class to represent a Groundwater Level Dossier (GLD) from the BRO.

    Attributes
    ----------
    observation : pd.DataFrame
        DataFrame containing groundwater level observations with time and value
        columns. The data is processed and filtered based on the provided arguments.

    tubeNumber : int
        The tube number associated with the observation.

    groundwaterMonitoringWell : str
        The BRO-ID of the groundwater monitoring well.
    """

    # base url of the public GLD REST service
    _rest_url = "https://publiek.broservices.nl/gm/gld/v1"

    def _read_contents(self, tree, status=None, observation_type=None, **kwargs):
        """
        Parse data to populate the Groundwater Level Dossier attributes.

        This method reads and processes the XML contents, extracting relevant
        groundwater monitoring information such as the groundwater monitoring well,
        tube number, and observations. It also processes the observations into a
        DataFrame, which is filtered and transformed based on the provided arguments.

        Parameters
        ----------
        tree : xml.etree.ElementTree
            The XML tree to parse and extract data from.

        status : str, optional
            When provided, only observations whose waterml status matches this
            value are kept. Default is None (keep all).

        observation_type : str, optional
            When provided, only observations of this observation type are kept.
            Default is None (keep all).

        **kwargs : keyword arguments
            Additional parameters passed to the `process_observations` function to
            filter and transform the observations.

        Raises
        ------
        Exception
            If more than one or no GLD element is found in the XML tree.

        Notes
        -----
        The method expects the XML structure to adhere to the specified namespaces
        and element tags. It processes observation values, timestamps, and qualifiers
        into a pandas DataFrame.

        The observation data is stored in the `observation` attribute and can be
        accessed as a DataFrame.
        """
        # XML namespaces used throughout the GLD document
        ns = {
            "xmlns": "http://www.broservices.nl/xsd/dsgld/1.0",
            "gldcommon": "http://www.broservices.nl/xsd/gldcommon/1.0",
            "waterml": "http://www.opengis.net/waterml/2.0",
            "swe": "http://www.opengis.net/swe/2.0",
            "om": "http://www.opengis.net/om/2.0",
            "xlink": "http://www.w3.org/1999/xlink",
        }
        gld = self._get_main_object(tree, "GLD_O", ns)
        # copy the XML attributes of the GLD element onto this object,
        # stripping the namespace prefix from each key
        for key in gld.attrib:
            setattr(self, key.split("}", 1)[1], gld.attrib[key])
        for child in gld:
            key = self._get_tag(child)
            if len(child) == 0:
                # leaf element: store its text directly
                setattr(self, key, child.text)
            elif key == "monitoringPoint":
                # extract the well id and tube number of the monitoring tube
                well = child.find("gldcommon:GroundwaterMonitoringTube", ns)
                gmw_id = well.find("gldcommon:broId", ns).text
                setattr(self, "groundwaterMonitoringWell", gmw_id)
                tube_nr = int(well.find("gldcommon:tubeNumber", ns).text)
                setattr(self, "tubeNumber", tube_nr)
            elif key in ["registrationHistory"]:
                self._read_children_of_children(child)
            elif key == "groundwaterMonitoringNet":
                for grandchild in child:
                    key2 = grandchild.tag.split("}", 1)[1]
                    if key2 == "GroundwaterMonitoringNet":
                        setattr(self, key, grandchild[0].text)
                    else:
                        logger.warning(f"Unknown key: {key2}")
            elif key == "observation":
                # get observation_metadata
                om_observation = child.find("om:OM_Observation", ns)
                if om_observation is None:
                    continue
                metadata = om_observation.find("om:metadata", ns)
                observation_metadata = metadata.find("waterml:ObservationMetadata", ns)

                # get status (last part of the xlink href, after the final colon)
                water_ml_status = observation_metadata.find("waterml:status", ns)
                if water_ml_status is None:
                    status_value = None
                else:
                    status_value = water_ml_status.attrib[
                        f"{{{ns['xlink']}}}href"
                    ].rsplit(":", 1)[-1]
                # skip observations that do not match the requested status
                if status is not None and status != status_value:
                    continue

                # get observation_type
                parameter = observation_metadata.find("waterml:parameter", ns)
                named_value = parameter.find("om:NamedValue", ns)
                name = named_value.find("om:name", ns)
                assert (
                    name.attrib[f"{{{ns['xlink']}}}href"]
                    == "urn:bro:gld:ObservationMetadata:observationType"
                )
                value = named_value.find("om:value", ns)
                observation_type_value = value.text
                # skip observations that do not match the requested type
                if (
                    observation_type is not None
                    and observation_type != observation_type_value
                ):
                    continue

                # collect the time/value/qualifier triples of this observation
                times = []
                values = []
                qualifiers = []
                for measurement in child.findall(".//waterml:MeasurementTVP", ns):
                    times.append(measurement.find("waterml:time", ns).text)
                    value = measurement.find("waterml:value", ns).text
                    if value is None:
                        # missing value: store as NaN
                        values.append(np.nan)
                    else:
                        values.append(float(value))
                    metadata = measurement.find("waterml:metadata", ns)
                    TVPMM = metadata.find("waterml:TVPMeasurementMetadata", ns)
                    qualifier = TVPMM.find("waterml:qualifier", ns)
                    value = qualifier.find("swe:Category", ns).find("swe:value", ns)
                    qualifiers.append(value.text)
                observation = pd.DataFrame(
                    {
                        "time": times,
                        "value": values,
                        "qualifier": qualifiers,
                        "status": status_value,
                        "observation_type": observation_type_value,
                    }
                ).set_index("time")

                # accumulate per-observation frames; concatenated below
                if not hasattr(self, key):
                    self.observation = []
                self.observation.append(observation)
            else:
                self._warn_unknown_tag(key)
        if hasattr(self, "observation"):
            self.observation = pd.concat(self.observation)
            self.observation = process_observations(
                self.observation, self.broId, **kwargs
            )
        else:
            # no observations found (or all filtered out): empty frame
            self.observation = _get_empty_observation_df()

535 

536 

def process_observations(
    df,
    bro_id="gld",
    to_wintertime=True,
    qualifier=None,
    tmin=None,
    tmax=None,
    sort=True,
    drop_duplicates=True,
):
    """
    Post-process a DataFrame of groundwater level observations.

    Depending on the parameters this applies, in order: conversion of the time
    index to Dutch winter time (or CET), filtering on qualifier, clipping to a
    time window, sorting (see `sort_observations`) and dropping duplicate
    timestamps (see `drop_duplicate_observations`).

    Parameters
    ----------
    df : pd.DataFrame
        Observations with a time index and columns such as "value" and
        "qualifier".
    bro_id : str
        BRO-ID of the dossier, used for logging only. The default is "gld".
    to_wintertime : bool, optional
        When True, drop the timezone and shift to Dutch winter time (UTC+1).
        When False, keep timezone-aware times in CET/CEST. Default is True.
    qualifier : str or list of str, optional
        Keep only rows whose "qualifier" column matches. Default is None.
    tmin : str or datetime, optional
        Lower bound of the time window. Default is None.
    tmax : str or datetime, optional
        Upper bound of the time window. Default is None.
    sort : bool, optional
        When True, sort via `sort_observations`. Default is True.
    drop_duplicates : bool, optional
        When True, drop duplicate timestamps, keeping the first occurrence.
        Default is True.

    Returns
    -------
    pd.DataFrame
        The processed observations.
    """
    index = pd.to_datetime(df.index, utc=True)
    if to_wintertime:
        # drop the timezone, shifting UTC to Dutch winter time (UTC+1)
        df.index = index.tz_localize(None) + pd.Timedelta(1, unit="h")
    else:
        df.index = index.tz_convert("CET")

    if qualifier is not None:
        wanted = [qualifier] if isinstance(qualifier, str) else qualifier
        df = df[df["qualifier"].isin(wanted)]

    if tmin is not None:
        df = df.loc[pd.Timestamp(tmin) :]
    if tmax is not None:
        df = df.loc[: pd.Timestamp(tmax)]

    if sort:
        df = sort_observations(df)
    if drop_duplicates:
        df = drop_duplicate_observations(df, bro_id=bro_id, sort=sort)

    return df

617 

618 

def sort_observations(df):
    """
    Sort observations in a DataFrame by multiple criteria. Applies a multi-level sort
    to the input DataFrame, prioritizing the following criteria in order:
    1. By the DataFrame's DatetimeIndex in ascending order
    2. By status (if present): volledigBeoordeeld before voorlopig before onbekend
    3. By observation_type (if present): reguliereMeting before controleMeting

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with optional 'observation_type' and 'status' columns,
        and a DatetimeIndex.

    Returns
    -------
    pandas.DataFrame
        Sorted DataFrame with the same structure as input.

    Notes
    -----
    The multi-level priority is achieved by chained sorts, which only works
    when each sort is stable; pandas defaults to quicksort (not stable), so
    every sort below explicitly requests a stable algorithm. Values not in the
    mapping dicts map to NaN and are placed last by sort_values.
    """
    if "observation_type" in df.columns:
        # make sure measurements with observation_type set to reguliereMeting are first
        sort_dict = {"reguliereMeting": 0, "controleMeting": 1}
        df = df.sort_values(
            "observation_type", key=lambda x: x.map(sort_dict), kind="stable"
        )

    if "status" in df.columns:
        # make sure measurements with status set to volledigBeoordeeld are first
        sort_dict = {"volledigBeoordeeld": 0, "voorlopig": 1, "onbekend": 2}
        df = df.sort_values("status", key=lambda x: x.map(sort_dict), kind="stable")

    # sort based on DatetimeIndex; a stable sort preserves the importance
    # ordering established above for rows with equal timestamps
    df = df.sort_index(kind="stable")

    return df

652 

653 

def drop_duplicate_observations(df, bro_id="gld", keep="first", sort=True):
    """
    Remove observations with duplicate timestamps from a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to process, indexed by time.
    bro_id : str, optional
        Identifier for the dataset, used in the warning message only. Default
        is "gld".
    keep : {'first', 'last', False}, optional
        Which duplicates to mark for removal:
        - 'first' : mark duplicates except the first occurrence.
        - 'last' : mark duplicates except the last occurrence.
        - False : mark all duplicates.
        Default is 'first'.
    sort : bool, optional
        Only affects the warning message: when True, it notes that the rows
        were sorted for importance beforehand. Default is True.

    Returns
    -------
    pd.DataFrame
        DataFrame without the duplicate index values marked by `keep`.

    Warnings
    --------
    Logs a warning when duplicates are found, stating how many of the total
    number of rows were duplicates.
    """
    if not df.index.has_duplicates:
        return df
    duplicates = df.index.duplicated(keep=keep)
    message = "{} contains {} duplicates (of {}). Keeping only first values".format(
        bro_id, duplicates.sum(), len(df.index)
    )
    if sort:
        message = f"{message} (sorted for importance)"
    logger.warning(f"{message}.")
    return df[~duplicates]

692 

693 

694def _get_empty_observation_df(): 

695 columns = ["time", "value", "qualifier", "status", "observation_type"] 

696 return pd.DataFrame(columns=columns).set_index("time") 

697 

698 

# short alias for the dossier class, bound into the generic bro helpers below
cl = GroundwaterLevelDossier

# module-level convenience wrappers around the shared bro helpers, specialized
# for GroundwaterLevelDossier; docstrings are copied from the generic helpers
get_bro_ids_of_bronhouder = partial(bro._get_bro_ids_of_bronhouder, cl)
get_bro_ids_of_bronhouder.__doc__ = bro._get_bro_ids_of_bronhouder.__doc__

get_data_for_bro_ids = partial(bro._get_data_for_bro_ids, cl)
get_data_for_bro_ids.__doc__ = bro._get_data_for_bro_ids.__doc__