smashbox.read_inputdata.read_data
Created on Tue Jun 25 11:11:03 2024
@author: maxime
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 11:11:03 2024

@author: maxime
"""
import glob
import os
import re
import warnings
import numpy as np
import pandas as pd
import rasterio
from smash.fcore._mwd_mesh import MeshDT
import shutil
import tqdm
import smash
import pytz
from pathlib import Path
import time
from smashbox.tools import geo_toolbox
from smashbox.tools import tools

# Hourly weights used to disaggregate a daily PET value over the 24 hours of a
# day (weights sum to ~1, zero at night, peak around midday).
RATIO_PET_HOURLY = np.array(
    [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0.035,
        0.062,
        0.079,
        0.097,
        0.11,
        0.117,
        0.117,
        0.11,
        0.097,
        0.079,
        0.062,
        0.035,
        0,
        0,
        0,
        0,
        0,
    ],
    dtype=np.float32,
)

# Compute the time laps between 2 timezones


def tz_diff(date, tz1, tz2):
    """
    Return the signed difference in hours between timezone ``tz1`` and ``tz2``
    at the given ``date``.

    Parameters
    ----------
    date : str or datetime-like
        Date at which the offset is evaluated (DST-dependent).
    tz1, tz2 : pytz timezone objects
        Timezones providing a ``localize`` method.

    Returns
    -------
    float
        Signed offset in hours (e.g. -1.0 when ``tz2`` is one hour ahead of
        ``tz1``).
    """
    date = pd.to_datetime(date)
    # BUGFIX: use total_seconds() instead of .seconds — .seconds is always
    # non-negative (a -1h difference yielded 23.0 instead of -1.0). The only
    # in-file caller feeds this into np.roll on a 24-element array, where both
    # values are equivalent modulo 24, so this fix is behavior-compatible.
    return (tz1.localize(date) - tz2.localize(date).astimezone(tz1)).total_seconds() / 3600


# Must be done char by char to take account of various format. Must be documented.
# Can't support all date formatter.
# Avantage: can work even file are not sorted with the date.
70 71 72def _build_date_regex_pattern(date_pattern) -> str: 73 # Supported date formatters 74 datefmt = {"%Y": 4, "%m": 2, "%d": 2, "%H": 2, "%M": 2, "%S": 2} 75 76 regex = "" 77 i = 0 78 while i < len(date_pattern): 79 if date_pattern[i : i + 2] in datefmt: 80 regex += r"\d{" + str(datefmt[date_pattern[i : i + 2]]) + "}" 81 i = i + 2 82 else: 83 regex += r"\\" + date_pattern[i] 84 i = i + 1 85 86 return regex 87 88 89# fonction which split the date_pattern in 2 part: 90# 1)the date pattern 91# 2)the position of the occurence of the date pattern when searching it in a text 92# Exemple: date_pattern="%Y%m%d%H%1" return "%Y%m%d%H" and 1 93 94 95def _split_date_occurence(date_pattern) -> str: 96 re_match = re.search(r"\%[0-9]+$", date_pattern) 97 if re_match is not None: 98 pattern = date_pattern[0 : int(re_match.span()[0])] 99 occurence = int(re_match.group()[1]) 100 return pattern, occurence 101 else: 102 return date_pattern, 0 103 104 105# We suppose that the atmos file are sorted in ascendent order with the date 106# (this a drawback of the method) ! Must be documented ! 
107# Only used for daily internanuel pet 108 109 110def _check_files_containing_date( 111 files: str, date: pd.Timestamp, date_pattern: str 112) -> int: 113 date_string = date.strftime(date_pattern) 114 ind = -1 115 for i, f in enumerate(files): 116 # print(i,date_string,os.path.basename(f)) 117 if date_string in os.path.basename(f): 118 ind = i 119 120 return ind 121 122 123# Test if files are sorted according the date 124# Test if the model time step fit with the data 125# Create a n samples of niter contigous date 126# return True or false 127 128 129def _sample_delta_time_in_file(files, date_pattern, dt): 130 d_pattern, occurence = _split_date_occurence(date_pattern) 131 regex_date = _build_date_regex_pattern(d_pattern) 132 133 if len(files) <= 2: 134 return False 135 136 nsample = 5 137 niter = 100 138 list_delta_time = [] 139 list_date_block = [] 140 141 for i in range(0, len(files), max(int(len(files) / nsample), niter)): 142 143 for j in range(min(niter, len(files) - i - 1)): 144 145 re_match_0 = re.findall(regex_date, os.path.basename(files[i + j])) 146 re_match_1 = re.findall(regex_date, os.path.basename(files[i + j + 1])) 147 148 if len(re_match_0) > 0 and len(re_match_1) > 0: 149 date_match_0 = pd.to_datetime(re_match_0[occurence], format=d_pattern) 150 date_match_1 = pd.to_datetime(re_match_1[occurence], format=d_pattern) 151 delta_time = date_match_1 - date_match_0 152 list_delta_time.append(delta_time.total_seconds()) 153 154 if j == 0: 155 list_date_block.append(date_match_0) 156 157 if delta_time.total_seconds() < 0: 158 return False 159 160 # Test if all sample date block are in ascendent order 161 if len(list_date_block) > 1: 162 for i in range(len(list_date_block) - 1): 163 delta_time = list_date_block[i + 1] - list_date_block[i] 164 165 if delta_time.total_seconds() < 0.0: 166 return False 167 168 if len(list_delta_time) > 0: 169 if np.min(np.array(list_delta_time)) != dt: 170 # sorted but not good time-step 171 raise ValueError( 172 
"Precipitation files are sorted with the date pattern but not at the good time-step:" 173 f" prcp time-step={list_delta_time[0]}, model time-step={dt}" 174 ) 175 176 # Finally test if all date are in ascendent order 177 if np.all(np.array(list_delta_time) > 0): 178 return True 179 else: 180 return False 181 182 else: 183 raise ValueError( 184 "Date Pattern was not found in the list of the precipitation files" 185 f" while searching pattern={d_pattern} with corresponding regex={regex_date}" 186 f" In filename {files[0]}," 187 ) 188 189 190# Search algorithm 191# Search for a pattern in a sorted list of files 192# return the first position after or equal at date in files 193 194 195def _fast_index_search_for_date(files, date, date_pattern): 196 # print("search for",date) 197 d_pattern, occurence = _split_date_occurence(date_pattern) 198 regex_date = _build_date_regex_pattern(d_pattern) 199 200 re_match = re.findall(regex_date, os.path.basename(files[-1])) 201 if len(re_match) > 0: 202 date_match = pd.to_datetime(re_match[occurence], format=d_pattern) 203 204 # cas1 205 if date > date_match: 206 return len(files) 207 208 else: 209 raise ValueError( 210 f"Date formatter {d_pattern} corresponding to regex {regex_date}" 211 " not found in filename {os.path.basename(files[i])}" 212 ) 213 214 re_match = re.findall(regex_date, os.path.basename(files[0])) 215 if len(re_match) > 0: 216 date_match = pd.to_datetime(re_match[occurence], format=d_pattern) 217 218 # cas2 219 if date < date_match: 220 return 0 221 222 else: 223 raise ValueError( 224 f"Date formatter {d_pattern} corresponding to regex {regex_date}" 225 " not found in filename {os.path.basename(files[i])}" 226 ) 227 228 # cas3 229 pos = 0 230 final_pos = 0 231 move = 1 232 previous_move = 0 233 step = len(files) - 1 234 nb_iter = 0 235 while nb_iter < 100: 236 re_match = re.findall(regex_date, os.path.basename(files[pos])) 237 date_match = pd.to_datetime(re_match[occurence], format=d_pattern) 238 239 # print( 240 # 
"At iteration " 241 # + str(nb_iter) 242 # +f" searching {date}," 243 # + "and match " 244 # + date_match.strftime("%Y-%m-%d %H:%M:%S") 245 # + f" at pos {pos}" 246 # ) 247 248 if date_match < date: 249 pos = min(pos + step, len(files) - 1) 250 move = 1 251 elif date_match > date: 252 pos = max(pos - step, 0) 253 move = -1 254 elif date_match == date: 255 final_pos = pos 256 break 257 258 if step == 0: 259 if date_match < date: 260 final_pos = pos + 1 261 break 262 elif date_match > date: 263 final_pos = pos 264 break 265 266 # print(f"-> Move to pos {pos} with step {step}") 267 268 # reduce step only if we change search direction 269 if previous_move != move: 270 step = int(step / 2) 271 272 nb_iter = nb_iter + 1 273 previous_move = move 274 275 # print( 276 # "At last iteration " 277 # + str(nb_iter) 278 # +f" searching {date}," 279 # + "and match " 280 # + date_match.strftime("%Y-%m-%d %H:%M:%S") 281 # + f" at pos {pos}" 282 # ) 283 284 # iter>maxlimit: 285 if date_match < date: 286 final_pos = pos 287 elif date_match > date: 288 # back to previous pos 289 final_pos = pos - 2 * step 290 291 return final_pos 292 293 294# Getting only files for the specified daterange 295# work even files are not sorted according the date pattern 296def _get_files_list_for_date_range(files, date_pattern, date_range): 297 d_pattern, occurence = _split_date_occurence(date_pattern) 298 regex_date = _build_date_regex_pattern(d_pattern) 299 300 vec_date = [] 301 if len(files) > 0: 302 for i, f in enumerate(files): 303 re_match = re.findall(regex_date, os.path.basename(f)) 304 305 if len(re_match) > 0: 306 vec_date.append(re_match[occurence]) 307 308 vec_date = pd.to_datetime(vec_date, format=d_pattern) 309 vec_date = vec_date.strftime("%Y%m%d%H%M%S") 310 311 # convert to numpy array 312 np_lst_date = np.array(vec_date) 313 # sort but keep indexes 314 sorted_indices = np_lst_date.argsort() 315 # sort according previous indexes 316 np_lst_date_sorted = np_lst_date[sorted_indices] 317 318 
np_date_range = np.array(date_range.strftime("%Y%m%d%H%M%S").to_list()) 319 320 # build the list of index only for the daterange 321 index_list_for_daterange = [] 322 for i in range(len(np_date_range)): 323 pos = np.where(np_lst_date_sorted == np_date_range[i]) 324 if len(pos[0]) > 0: 325 index_list_for_daterange.append(pos[0][0]) 326 else: 327 index_list_for_daterange.append(-1) 328 329 # find the final list of files for daterange (sorted) 330 final_list_files = [] 331 for index in index_list_for_daterange: 332 if index >= 0: 333 final_list_files.append(files[sorted_indices[index]]) 334 else: 335 final_list_files.append("miss") 336 337 return final_list_files 338 339 340# Getting only files for the specified daterange 341# work even files are not sorted according the date pattern 342# Duplicate of _get_files_list_for_date_range but with hack for leap year... 343def _get_files_list_for_date_range_interannuel(files, date_pattern, date_range): 344 d_pattern, occurence = _split_date_occurence(date_pattern) 345 regex_date = _build_date_regex_pattern(d_pattern) 346 347 vec_date = [] 348 if len(files) > 0: 349 for i, f in enumerate(files): 350 re_match = re.findall(regex_date, os.path.basename(f)) 351 if len(re_match) > 0: 352 vec_date.append("1904" + re_match[occurence]) # annee bixetile 353 vec_date = pd.to_datetime(vec_date, format="%Y%m%d") # match 19040402 354 vec_date = vec_date.strftime("%Y%m%d%H%M%S") 355 356 # convert to numpy array 357 np_lst_date = np.array(vec_date) 358 # sort but keep indexes 359 sorted_indices = np_lst_date.argsort() 360 # sort according previous indexes 361 np_lst_date_sorted = np_lst_date[sorted_indices] 362 363 np_date_range = np.array(date_range.strftime("%Y%m%d%H%M%S").to_list()) 364 365 # build the list of index only for the daterange 366 index_list_for_daterange = [] 367 for i in range(len(np_date_range)): 368 pos = np.where(np_lst_date_sorted == np_date_range[i]) 369 370 if len(pos[0]) > 0: 371 
index_list_for_daterange.append(pos[0][0]) 372 else: 373 index_list_for_daterange.append(-1) 374 375 # find the final list of files for daterange (sorted) 376 final_list_files = [] 377 for index in index_list_for_daterange: 378 if index >= 0: 379 final_list_files.append(files[sorted_indices[index]]) 380 else: 381 final_list_files.append("miss") 382 383 return final_list_files 384 385 386# Get the list of files to read for a specific date_range 387 388 389def _get_atmos_files( 390 dir: str, 391 fmt: str, 392 access: str, 393 dt: float, 394 date_range: pd.DatetimeIndex, 395 date_pattern: str, 396 daily_interannual: bool = False, 397) -> list[str]: 398 # Set to drop duplicates after strftime 399 400 date_range_strftime_access = set(date_range.strftime(access)) if access else {""} 401 files = [] 402 403 acces_try = 0 404 res = False 405 while res == False: 406 407 res = os.access(f"{dir}", os.R_OK) 408 if res: 409 for date_strftime_access in date_range_strftime_access: 410 prcp_path = Path(f"{dir}/{date_strftime_access}/") 411 files.extend(sorted(prcp_path.glob(f"**/*{fmt}"))) 412 else: 413 time.sleep(0.1) 414 acces_try = acces_try + 1 415 print( 416 f"</> WARNING: {dir} is unvailable ... waiting 2s max and trying again {acces_try}/20..." 417 ) 418 if acces_try > 20: 419 print( 420 f"</> WARNING !! atmos files directory {dir} is unaccesible after 20 try (2 seconds) => atmos files are missing !" 
421 ) 422 break 423 424 files = sorted(files) 425 426 # Return a list of files in date_range with potential missing files (366 files max) 427 if daily_interannual: 428 429 # list of daily date 430 date_range_interannual = pd.date_range( 431 start=date_range[0].strftime("%Y%m%d"), 432 end=date_range[-1].strftime("%Y%m%d"), 433 freq="1d", 434 ) 435 436 # conversion au format inte-annuel + use year 1904 to manage leap year 437 date_range_interannual = pd.to_datetime( 438 "1904" + date_range_interannual.strftime("%m%d"), format="%Y%m%d" 439 ) 440 441 # we build the file list according the date_range (sorted): we have 1 date <=> 1file 442 final_list_files = _get_files_list_for_date_range_interannuel( 443 files, "%m%d", date_range_interannual 444 ) 445 446 return final_list_files 447 448 # Return a list of files in date_range with potential missing files 449 else: 450 451 # Only here with model time-step 452 final_list_files = [] 453 if len(files) == 0: 454 for i in range(len(date_range)): 455 final_list_files.append("miss") 456 return final_list_files 457 458 # Check if file are sorted with the date pattern (for long list of files) 459 if len(files) >= 100: 460 # ensure files are alphabetically sorted (glob does not sort list depend on the file system) 461 # this operation can be very long ? 462 # files.sort() 463 test = _sample_delta_time_in_file(files, date_pattern, dt) 464 465 # if sorted we quickly search index of date_start and date_end 466 # if not sorted we parse all files et keep the dates 467 if test: 468 # file seems sorted 469 pos_0 = _fast_index_search_for_date(files, date_range[0], date_pattern) 470 pos_1 = _fast_index_search_for_date(files, date_range[-1], date_pattern) 471 472 files = files[pos_0 : pos_1 + 1] 473 else: 474 print( 475 "Warnings, atmospheric filename are not sorted with date." 476 " Reading atmospheric data may take more time." 
477 ) 478 479 # we build the file list according the date_range (sorted): we have 1 date <=> 1file 480 final_list_files = _get_files_list_for_date_range(files, date_pattern, date_range) 481 482 return final_list_files 483 484 485# This function seems to work better but require more tests 486def _read_windowed_raster_new( 487 path, 488 mesh: dict | MeshDT | None = None, 489 transform: dict = None, 490 order: int = 0, 491 cval=-99.0, 492 grid_mode=True, 493) -> tuple[np.ndarray, dict[str, int]]: 494 """ 495 496 Description 497 ----------- 498 499 Read a part of a raster and resample it if required. 500 501 Paramerters 502 ----------- 503 504 path: str() 505 Input geotif path 506 507 mesh: dict() or MeshDT 508 A smash mesh 509 510 transform: dict() 511 transform dictionnary from rasterio.get_transform() method. (xmin,ymax,xres,yres). May be to replace the mesh 512 513 514 resampling_method: str() 515 scipy-zoom or numpy use scipy zoom function or numpy array functions 516 517 order: int() 518 order of the resampling cubic interpolation 519 520 cval: float() | np.nan 521 fill value for the extended boundaries 522 523 grid_mode: bool() 524 True | False. if True coordinate start from the edge of the cell. If False coordinate starts from the center of the cell. 
525 526 Return 527 ------ 528 529 numpy.array() 530 Cropped and resampled array according bbox_out and res_out 531 532 """ 533 534 if transform is not None: 535 mesh_dict = transform 536 else: 537 if not isinstance(mesh, dict): 538 mesh_dict = tools.read_object_as_dict(mesh) 539 else: 540 mesh_dict = mesh 541 542 ds_in = rasterio.open(path) 543 matrix = ds_in.read() 544 545 bbox_in = ds_in.bounds._asdict() 546 res_in = {"dx": ds_in.res[0], "dy": ds_in.res[1]} 547 548 bbox_out = geo_toolbox.get_bbox_from_smash_mesh(mesh_dict) 549 res_out = {"dx": mesh_dict["xres"], "dy": mesh_dict["yres"]} 550 551 cropped_array = geo_toolbox.crop_array( 552 matrix[0, :, :], bbox_in, res_in, bbox_out, res_out, order=order 553 ) 554 555 warning = {"res": 0, "overlap": 0, "outofbound": 0} 556 557 if res_in["dx"] != res_out["dx"] or res_in["dy"] != res_out["dy"]: 558 warning["res"] = 1 559 560 return cropped_array, warning 561 562 563def _rasterio_read_windowed_raster(path="", mesh: dict = None): 564 """ 565 Description 566 ----------- 567 568 Read a part of a raster with rasterio and resample it if required. 569 570 Paramerters 571 ----------- 572 573 path: str() 574 Input geotif path 575 576 mesh: dict() or MeshDT 577 A smash mesh 578 579 Return 580 ------ 581 582 numpy.array() 583 Cropped and resampled array according bbox_out and res_out 584 585 """ 586 587 ds_in = rasterio.open(path) 588 589 col_off = max(mesh["xmin"] - ds_in.bounds.left, 0) / ds_in.res[0] 590 row_off = max(ds_in.bounds.top - mesh["ymax"], 0) / ds_in.res[1] 591 ncols = (mesh["ncol"] * mesh["xres"]) / ds_in.res[0] 592 nrows = (mesh["nrow"] * mesh["yres"]) / ds_in.res[1] 593 594 window = rasterio.windows.Window( 595 round(col_off), round(row_off), int(ncols), int(nrows) 596 ) 597 598 matrix = ds_in.read( 599 window=window, 600 out_shape=( 601 ds_in.count, 602 mesh["nrow"], 603 mesh["ncol"], 604 ), 605 resampling=rasterio.enums.Resampling.nearest, 606 ) 607 608 return matrix 609 610 611# this function has a bug. 
It shift the grid and fill with nodata value the borders if shifted. 612def _read_windowed_raster( 613 path, mesh: MeshDT | None = None, transform: dict = None 614) -> tuple[np.ndarray, dict[str, int]]: 615 warning = {"res": 0, "overlap": 0, "outofbound": 0} 616 617 if transform is not None: 618 mesh = transform 619 620 # % Get raster information 621 with rasterio.open(path) as ds: 622 transform = ds.get_transform() 623 xmin = transform[0] 624 ymax = transform[3] 625 xres = transform[1] 626 yres = -transform[5] 627 628 # % Manage absolute tolerance wrt to xres or yres value 629 atol = 1e-2 630 xatol = atol * 10 ** min(0, np.floor(np.log10(np.abs(xres)))) 631 yatol = atol * 10 ** min(0, np.floor(np.log10(np.abs(yres)))) 632 633 # % Resolution missmatch 634 if not np.isclose(mesh.xres, xres, atol=xatol) or not np.isclose( 635 mesh.yres, yres, atol=yatol 636 ): 637 warning["res"] = 1 638 639 # % Overlap missmatch 640 dxmin = mesh.xmin - xmin 641 dymax = ymax - mesh.ymax 642 xol_match = np.abs(dxmin / xres - np.round(dxmin / xres)) 643 yol_match = np.abs(dymax / yres - np.round(dymax / yres)) 644 if not np.isclose(xol_match, 0, atol=xatol) or not np.isclose( 645 yol_match, 0, atol=yatol 646 ): 647 warning["overlap"] = 1 648 649 # # % Allocate buffer 650 arr = np.zeros(shape=(mesh.nrow, mesh.ncol), dtype=np.float32) 651 arr.fill(np.float32(-99)) 652 653 # % Pad offset to the nearest integer 654 xoff = np.rint((mesh.xmin - xmin) / xres) 655 yoff = np.rint((ymax - mesh.ymax) / yres) 656 657 # % Resolution ratio 658 xres_ratio = mesh.xres / xres 659 yres_ratio = mesh.yres / yres 660 661 # Reading window 662 win_xsize = np.rint(mesh.ncol * xres_ratio) 663 win_ysize = np.rint(mesh.nrow * yres_ratio) 664 665 # % Totally out of bound 666 # % Return the buffer with no data 667 if ( 668 xoff >= ds.width 669 or yoff >= ds.height 670 or xoff + win_xsize <= 0 671 or yoff + win_ysize <= 0 672 ): 673 warning["outofbound"] = 2 674 return arr, warning 675 676 # % Partially out of bound 
677 if ( 678 xoff < 0 679 or yoff < 0 680 or xoff + win_xsize > ds.width 681 or yoff + win_ysize > ds.height 682 ): 683 warning["outofbound"] = 1 684 685 # % Readjust offset 686 pxoff = max(0, xoff) 687 pyoff = max(0, yoff) 688 689 # % Readjust reading window 690 pwin_xsize = min(win_xsize + min(0, xoff), ds.width - pxoff) 691 pwin_ysize = min(win_ysize + min(0, yoff), ds.height - pyoff) 692 693 # % Buffer slices 694 xs = pxoff - xoff 695 ys = pyoff - yoff 696 xe = xs + pwin_xsize 697 ye = ys + pwin_ysize 698 arr_xslice = slice(int(xs * 1 / xres_ratio), int(xe * 1 / xres_ratio)) 699 arr_yslice = slice(int(ys * 1 / yres_ratio), int(ye * 1 / yres_ratio)) 700 701 # % Reading and writting into buffer 702 ds.read( 703 1, 704 window=rasterio.windows.Window(pxoff, pyoff, pwin_xsize, pwin_ysize), 705 out=arr[arr_yslice, arr_xslice], 706 ) 707 708 # % Manage NoData 709 nodata = ds.nodata 710 if nodata is not None: 711 arr = np.where(arr == nodata, -99.0, arr) 712 713 return arr, warning 714 715 716def _get_reading_warning_message(reading_warning: dict[str, int | list[str]]) -> str: 717 msg = [] 718 719 if reading_warning["got"]: 720 if len(reading_warning["got"]) > 1: 721 msg.append( 722 f"Reading sucess: read {len(reading_warning['got'])} file(s): " 723 f"{reading_warning['got'][0]}, ..., {reading_warning['got'][0-1]}" 724 ) 725 else: 726 msg.append( 727 f"Reading sucess: read {len(reading_warning['got'])} file(s): " 728 f"{reading_warning['got'][0]}" 729 ) 730 731 if reading_warning["miss"]: 732 msg.append( 733 f"Missing warning: missing {len(reading_warning['miss'])} file(s): " 734 f"{reading_warning['miss']}" 735 ) 736 737 if reading_warning["res"]: 738 msg.append( 739 "Resolution warning: resolution missmatch between mesh and at least one raster file. 
Nearest " 740 "neighbour resampling algorithm has been used to match mesh resolution" 741 ) 742 743 if reading_warning["overlap"]: 744 msg.append( 745 "Overlap warning: overlap missmatch between mesh and at least one raster file. Cropping domain " 746 "has been updated to the nearest overlapping cell" 747 ) 748 749 if reading_warning["outofbound"]: 750 kind = "partially" if reading_warning["outofbound"] == 1 else "totally" 751 msg.append( 752 f"Out of bound warning: mesh is {kind} out of bound for at least one raster file. Out of bound " 753 f"domain has been filled in with no data" 754 ) 755 756 return "\n".join(msg) 757 758 759# TODO: Unique fun to read prcp, snow and temp 760def _read_atmos_data(model, atmos_data, directories): 761 762 date_range = pd.date_range( 763 start=model.setup.start_time, 764 end=model.setup.end_time, 765 freq=f"{int(model.setup.dt)}s", 766 )[1:] 767 768 reading_warning = {"got": [], "miss": [], "res": 0, "overlap": 0, "outofbound": 0} 769 770 # atmos_data_directory = getattr(model.setup, f"{atmos_data}_directory") 771 atmos_data_format = getattr(model.setup, f"{atmos_data}_format") 772 atmos_date_pattern = getattr(model.setup, f"{atmos_data}_date_pattern") 773 prcp_access = getattr(model.setup, "prcp_access") 774 prcp_conversion_factor = getattr(model.setup, "prcp_conversion_factor") 775 sparse_storage = getattr(model.setup, "sparse_storage") 776 dt = getattr(model.setup, "dt") 777 778 sparse_matrix = None 779 std_storage = None 780 if sparse_storage: 781 sparse_matrix = getattr(model._input_data.atmos_data, f"sparse_{atmos_data}") 782 else: 783 std_storage = getattr(model._input_data.atmos_data, f"{atmos_data}") 784 785 if atmos_data_format == "tif": 786 787 file_list = [] 788 keys = sorted(directories.keys()) 789 for key in keys: 790 791 # atmos_data_directory = directories[i] 792 atmos_data_directory = directories[key] 793 files = _get_atmos_files( 794 atmos_data_directory, 795 atmos_data_format, 796 prcp_access, 797 dt, 798 
date_range, 799 atmos_date_pattern, 800 ) 801 802 if len(file_list) == 0: 803 file_list = files 804 805 lacuna = [] 806 for k, f in enumerate(files): 807 if f == "miss": 808 lacuna.append(k) 809 else: 810 811 for k in lacuna: 812 file_list[k] = files[k] 813 814 lacuna = [] 815 for k, f in enumerate(files): 816 if f == "miss": 817 lacuna.append(k) 818 819 if len(lacuna) == 0: 820 break 821 822 for i, date in enumerate(tqdm.tqdm(date_range, desc=f"</> Reading {atmos_data}")): 823 atmos_file = file_list[i] 824 if atmos_file == "miss": 825 reading_warning["miss"].append( 826 date.strftime("%Y-%m-%d %H:%M") 827 + f", (matching pattern {date.strftime(atmos_date_pattern)})" 828 ) 829 if sparse_storage: 830 matrix = np.zeros( 831 shape=(model.mesh.nrow, model.mesh.ncol), 832 dtype=np.float32, 833 order="F", 834 ) 835 matrix.fill(np.float32(-99)) 836 smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix( 837 model.mesh, 838 matrix, 839 np.float32(-99), 840 sparse_matrix[i], 841 ) 842 843 else: 844 std_storage[..., i] = np.float32(-99) 845 846 else: 847 reading_warning["got"].append( 848 date.strftime("%Y-%m-%d %H:%M") + f" ({os.path.basename(atmos_file)})" 849 ) 850 851 matrix, warning = _read_windowed_raster_new(atmos_file, model.mesh) 852 matrix *= prcp_conversion_factor 853 reading_warning.update( 854 {k: v for k, v in warning.items() if not reading_warning[k]} 855 ) 856 857 if sparse_storage: 858 smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix( 859 model.mesh, 860 matrix, 861 np.float32(0), 862 sparse_matrix[i], 863 ) 864 865 else: 866 std_storage[..., i] = matrix 867 868 # files = files[1:] 869 870 if sparse_matrix: 871 setattr( 872 model._input_data.atmos_data, f"sparse_{atmos_data}", sparse_matrix 873 ) 874 else: 875 setattr(model._input_data.atmos_data, f"{atmos_data}", std_storage) 876 877 # % WIP 878 elif atmos_data_format == "nc": 879 raise NotImplementedError("NetCDF format not implemented yet") 880 881 msg = 
_get_reading_warning_message(reading_warning) 882 883 if msg: 884 warnings.warn(f"Warning(s) linked to {atmos_data} reading.\n{msg}", stacklevel=2) 885 886 887def _read_interannual_pet(model, directories, tz="UTC"): 888 date_range = pd.date_range( 889 start=model.setup.start_time, 890 end=model.setup.end_time, 891 freq=f"{int(model.setup.dt)}s", 892 )[1:] 893 894 reading_warning = {"got": [], "miss": [], "res": 0, "overlap": 0, "outofbound": 0} 895 896 if model.setup.pet_format == "tif": 897 898 file_list = [] 899 keys = sorted(directories.keys()) 900 for key in keys: 901 902 atmos_data_directory = directories[key] 903 904 files = _get_atmos_files( 905 atmos_data_directory, 906 model.setup.pet_format, 907 model.setup.pet_access, 908 model.setup.dt, 909 date_range, 910 model.setup.pet_date_pattern, 911 model.setup.daily_interannual_pet, 912 ) 913 914 if len(file_list) == 0: 915 file_list = files 916 917 lacuna = [] 918 for k, f in enumerate(files): 919 if f == "miss": 920 lacuna.append(k) 921 else: 922 923 for k in lacuna: 924 file_list[k] = files[k] 925 926 lacuna = [] 927 for k, f in enumerate(files): 928 if f == "miss": 929 lacuna.append(k) 930 931 if len(lacuna) == 0: 932 break 933 934 if model.setup.daily_interannual_pet: 935 leap_year_days = pd.date_range( 936 start="202001010000", end="202012310000", freq="1D" 937 ) 938 step_offset = int( 939 (date_range[0] - date_range[0].floor("D")).total_seconds() 940 / model.setup.dt 941 ) 942 nstep_per_day = int(86_400 / model.setup.dt) 943 hourly_ratio = 3_600 / model.setup.dt 944 945 ratio_shift = tz_diff( 946 "2020/01/01 12:00", pytz.timezone("UTC"), pytz.timezone(tz) 947 ) 948 RATIO_PET_HOURLY_SHIFTED = np.roll(RATIO_PET_HOURLY, int(-1 * ratio_shift)) 949 950 if hourly_ratio >= 1: 951 ratio = np.repeat(RATIO_PET_HOURLY_SHIFTED, hourly_ratio) / hourly_ratio 952 953 else: 954 ratio = np.sum( 955 RATIO_PET_HOURLY_SHIFTED.reshape(-1, int(1 / hourly_ratio)), axis=1 956 ) 957 958 matrix_dip = np.zeros( 959 
shape=(model.mesh.nrow, model.mesh.ncol, len(leap_year_days)), 960 dtype=np.float32, 961 ) 962 missing_day = np.empty(shape=0, dtype=np.int32) 963 964 list_date_in_daterange = date_range.strftime("%m%d") 965 list_date_in_leap_year_days = leap_year_days.strftime("%m%d") 966 967 for i, date in enumerate( 968 tqdm.tqdm(leap_year_days, desc="</> Reading daily interannual pet") 969 ): 970 if list_date_in_leap_year_days[i] in list_date_in_daterange: 971 ind = _check_files_containing_date(file_list, date, "%m%d") 972 973 ind_date = list(list_date_in_daterange).index( 974 list_date_in_leap_year_days[i] 975 ) 976 true_date = date_range[ind_date] 977 978 if ind == -1: 979 reading_warning["miss"].append( 980 date_range[ind_date].strftime("%Y-%m-%d") 981 + f", (matching pattern {true_date.strftime('%m%d')})" 982 ) 983 missing_day = np.append(date, true_date) 984 else: 985 reading_warning["got"].append( 986 true_date.strftime("%Y-%m-%d") 987 + f" ({os.path.basename(file_list[ind])})" 988 ) 989 matrix_dip[..., i], warning = _read_windowed_raster_new( 990 file_list[ind], model.mesh 991 ) 992 matrix_dip[..., i] *= model.setup.pet_conversion_factor 993 reading_warning.update( 994 {k: v for k, v in warning.items() if not reading_warning[k]} 995 ) 996 997 for i, date in enumerate( 998 tqdm.tqdm(date_range, desc="</> Disaggregating daily interannual pet") 999 ): 1000 ratio_ind = (i + step_offset) % nstep_per_day 1001 1002 if date in missing_day: 1003 if model.setup.sparse_storage: 1004 matrix = np.zeros( 1005 shape=(model.mesh.nrow, model.mesh.ncol), 1006 dtype=np.float32, 1007 order="F", 1008 ) 1009 matrix.fill(np.float32(-99)) 1010 smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix( 1011 model.mesh, 1012 matrix, 1013 np.float32(-99), 1014 model._input_data.atmos_data.sparse_pet[i], 1015 ) 1016 else: 1017 model._input_data.atmos_data.pet[..., i] = np.float32(-99) 1018 1019 else: 1020 index = list_date_in_leap_year_days.get_loc(list_date_in_daterange[i]) 1021 matrix = 
matrix_dip[..., index] * ratio[ratio_ind] 1022 if model.setup.sparse_storage: 1023 smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix( 1024 model.mesh, 1025 matrix, 1026 np.float32(0), 1027 model._input_data.atmos_data.sparse_pet[i], 1028 ) 1029 else: 1030 model._input_data.atmos_data.pet[..., i] = matrix 1031 # % WIP 1032 elif model.setup.pet_format == "nc": 1033 raise NotImplementedError("NetCDF format not implemented yet") 1034 1035 msg = _get_reading_warning_message(reading_warning) 1036 1037 if msg: 1038 warnings.warn( 1039 f"Warning(s) linked to potential evapotranspiration reading.\n{msg}", 1040 stacklevel=2, 1041 ) 1042 1043 1044def _read_qobs(model): 1045 start_time = pd.Timestamp(model.setup.start_time) 1046 end_time = pd.Timestamp(model.setup.end_time) 1047 miss = [] 1048 1049 for i, c in enumerate(model.mesh.code): 1050 f = glob.glob(f"{model.setup.qobs_directory}/**/*{c}*.csv", recursive=True) 1051 1052 if f: 1053 dat = pd.read_csv(f[0]) 1054 try: 1055 file_start_time = pd.Timestamp(dat.columns[0]) 1056 except Exception: 1057 raise ValueError( 1058 f"Column header '{dat.columns[0]}' in the observed discharge file for catchment '{c}' " 1059 f"is not a valid date" 1060 ) from None 1061 1062 file_end_time = file_start_time + pd.Timedelta( 1063 seconds=model.setup.dt * (len(dat) - 1) 1064 ) 1065 start_diff = ( 1066 int((start_time - file_start_time).total_seconds() / model.setup.dt) + 1 1067 ) 1068 end_diff = ( 1069 int((end_time - file_start_time).total_seconds() / model.setup.dt) + 1 1070 ) 1071 1072 # % Check if observed discharge file contains data for corresponding simulation period 1073 if start_diff > dat.index.max() or end_diff < 0: 1074 warnings.warn( 1075 f"The provided observed discharge file for catchment '{c}' does not contain data for the " 1076 f"selected simulation period ['{start_time}', '{end_time}']. 
The file covers the period " 1077 f"['{file_start_time}', '{file_end_time}']", 1078 stacklevel=2, 1079 ) 1080 else: 1081 ind_start_dat = max(0, start_diff) 1082 ind_end_dat = min(dat.index.max(), end_diff) 1083 ind_start_arr = max(0, -start_diff) 1084 ind_end_arr = ind_start_arr + ind_end_dat - ind_start_dat 1085 1086 model._input_data.response_data.q[i, ind_start_arr:ind_end_arr] = ( 1087 dat.iloc[ind_start_dat:ind_end_dat, 0] 1088 ) 1089 else: 1090 miss.append(c) 1091 1092 if miss: 1093 warnings.warn( 1094 f"Missing {len(miss)} observed discharge file(s): {miss}", stacklevel=2 1095 ) 1096 1097 1098def serialize_setup_directories(directories): 1099 1100 if isinstance(directories, list): 1101 1102 if isinstance(directories[0], dict): 1103 dir_data = directories[0] 1104 else: 1105 raise Exception( 1106 f"Bad syntaxe for {directories} in setup. It should be a dictionnary" 1107 ) 1108 1109 elif isinstance(directories, dict): 1110 dir_data = directories 1111 1112 return dir_data 1113 1114 1115def read_atmos_data(model, setup_options={}): 1116 1117 # update attributes 1118 for key, value in setup_options.items(): 1119 if hasattr(model.setup, key): 1120 setattr(model.setup, key, value) 1121 1122 if model.setup.read_qobs: 1123 1124 _read_qobs(model) 1125 1126 # with rasterio.Env(): 1127 if model.setup.read_prcp: 1128 1129 if len(model.setup.prcp_directories) > 0: 1130 dir_data = serialize_setup_directories(model.setup.prcp_directories) 1131 else: 1132 dir_data = {1: model.setup.prcp_directory} 1133 1134 _read_atmos_data(model, atmos_data="prcp", directories=dir_data) 1135 1136 if model.setup.read_pet: 1137 1138 if len(model.setup.pet_directories) > 0: 1139 dir_data = serialize_setup_directories(model.setup.pet_directories) 1140 else: 1141 dir_data = {1: model.setup.pet_directory} 1142 1143 if model.setup.daily_interannual_pet: 1144 _read_interannual_pet(model, directories=dir_data, tz=model.setup.timezone) 1145 else: 1146 _read_atmos_data(model, atmos_data="pet", 
def read_atmos_data(model, setup_options=None):
    """
    Read input data (observed discharge and atmospheric forcings) into *model*.

    Parameters
    ----------
    model : smash.Model
        Model whose ``setup``, ``mesh`` and ``_input_data`` are read/updated
        in place.
    setup_options : dict, optional
        Setup attribute overrides applied before reading; keys that are not
        attributes of ``model.setup`` are silently ignored. Defaults to no
        overrides. (Was a mutable ``{}`` default — replaced by ``None`` to
        avoid the shared-default pitfall; behavior is unchanged.)
    """
    # Override setup attributes with user-provided options.
    if setup_options:
        for key, value in setup_options.items():
            if hasattr(model.setup, key):
                setattr(model.setup, key, value)

    def _resolve_directories(kind):
        # Prefer the multi-directory option when present; otherwise fall back
        # to the single-directory one, keyed by 1 as elsewhere in this module.
        multi = getattr(model.setup, f"{kind}_directories")
        if len(multi) > 0:
            return serialize_setup_directories(multi)
        return {1: getattr(model.setup, f"{kind}_directory")}

    if model.setup.read_qobs:
        _read_qobs(model)

    if model.setup.read_prcp:
        _read_atmos_data(model, atmos_data="prcp", directories=_resolve_directories("prcp"))

    if model.setup.read_pet:
        dir_data = _resolve_directories("pet")
        if model.setup.daily_interannual_pet:
            # Interannual PET needs the timezone to shift the daily ratios.
            _read_interannual_pet(model, directories=dir_data, tz=model.setup.timezone)
        else:
            _read_atmos_data(model, atmos_data="pet", directories=dir_data)

    if model.setup.read_snow:
        _read_atmos_data(model, atmos_data="snow", directories=_resolve_directories("snow"))

    if model.setup.read_temp:
        _read_atmos_data(model, atmos_data="temp", directories=_resolve_directories("temp"))

    if model.setup.prcp_partitioning:
        smash.fcore._mw_atmos_statistic.compute_prcp_partitioning(
            model.setup, model.mesh, model._input_data
        )

    if model.setup.compute_mean_atmos:
        print("</> Computing mean atmospheric data")
        smash.fcore._mw_atmos_statistic.compute_mean_atmos(
            model.setup, model.mesh, model._input_data
        )
def write_atmos_data_aliases(path_to_alias, setup, input_data):
    """
    Create symlink aliases of atmospheric data files under *path_to_alias*.

    For each atmospheric variable, candidate files are searched in the given
    directories in order; a later directory only fills the dates still missing
    ("lacunae", marked by the sentinel ``"-1"``) after the earlier ones. The
    selected files are then symlinked into ``<path_to_alias>/tmp_<atmos_data>``,
    which is recreated from scratch on every call.

    Parameters
    ----------
    path_to_alias : str
        Directory in which the ``tmp_<atmos_data>`` alias folders are created.
    setup : SetupDT
        Provides ``<atmos_data>_format``, ``<atmos_data>_date_pattern``,
        ``<atmos_data>_access``, ``dt``, ``start_time`` and ``end_time``.
    input_data : dict
        Mapping of atmospheric variable name to an iterable of directories.
    """
    for atmos_data, directories in input_data.items():
        atmos_data_format = getattr(setup, f"{atmos_data}_format")
        atmos_date_pattern = getattr(setup, f"{atmos_data}_date_pattern")
        # Generic access pattern for this variable (was misleadingly named
        # ``prcp_access`` even for pet/snow/temp).
        atmos_access = getattr(setup, f"{atmos_data}_access")
        dt = setup.dt

        # One expected file per time step; the first date is dropped as the
        # warm-up step, consistently with the rest of this module.
        date_range = pd.date_range(
            start=setup.start_time,
            end=setup.end_time,
            freq=f"{int(setup.dt)}s",
        )[1:]

        array_file = None
        for atmos_data_directory in directories:
            files = _get_atmos_files(
                atmos_data_directory,
                atmos_data_format,
                atmos_access,
                dt,
                date_range,
                atmos_date_pattern,
            )

            if array_file is None:
                array_file = np.array(files, dtype=str)
            else:
                # Only overwrite dates still missing from earlier directories.
                array_file[lacuna] = np.array(files)[lacuna]

            # Indices of dates with no file found yet ("-1" sentinel).
            lacuna = np.where(array_file == "-1")

            # np.where returns a TUPLE of index arrays, so test the array's
            # size, not len(lacuna) (always 1) — the original early exit
            # could never trigger.
            if lacuna[0].size == 0:
                break

        if array_file is None:
            # No directory provided for this variable: nothing to alias
            # (previously crashed with ``len(None)``).
            continue

        alias_dir = os.path.join(path_to_alias, f"tmp_{atmos_data}")

        # Recreate the alias directory from scratch. The second existence
        # check of the original was redundant right after rmtree.
        if os.path.exists(alias_dir):
            shutil.rmtree(alias_dir)
        os.mkdir(alias_dir)

        for target in array_file:
            if target != "-1":
                os.symlink(target, os.path.join(alias_dir, os.path.basename(target)))
RATIO_PET_HOURLY =
array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.035, 0.062,
0.079, 0.097, 0.11 , 0.117, 0.117, 0.11 , 0.097, 0.079, 0.062,
0.035, 0. , 0. , 0. , 0. , 0. ], dtype=float32)
def
tz_diff(date, tz1, tz2):
59def tz_diff(date, tz1, tz2): 60 """ 61 Returns the difference in hours between timezone1 and timezone2 62 for a given date. 63 """ 64 date = pd.to_datetime(date) 65 return (tz1.localize(date) - tz2.localize(date).astimezone(tz1)).seconds / 3600
Returns the difference in hours between timezone1 and timezone2 for a given date.
def
serialize_setup_directories(directories):
1099def serialize_setup_directories(directories): 1100 1101 if isinstance(directories, list): 1102 1103 if isinstance(directories[0], dict): 1104 dir_data = directories[0] 1105 else: 1106 raise Exception( 1107 f"Bad syntaxe for {directories} in setup. It should be a dictionnary" 1108 ) 1109 1110 elif isinstance(directories, dict): 1111 dir_data = directories 1112 1113 return dir_data
def
read_atmos_data(model, setup_options={}):
1116def read_atmos_data(model, setup_options={}): 1117 1118 # update attributes 1119 for key, value in setup_options.items(): 1120 if hasattr(model.setup, key): 1121 setattr(model.setup, key, value) 1122 1123 if model.setup.read_qobs: 1124 1125 _read_qobs(model) 1126 1127 # with rasterio.Env(): 1128 if model.setup.read_prcp: 1129 1130 if len(model.setup.prcp_directories) > 0: 1131 dir_data = serialize_setup_directories(model.setup.prcp_directories) 1132 else: 1133 dir_data = {1: model.setup.prcp_directory} 1134 1135 _read_atmos_data(model, atmos_data="prcp", directories=dir_data) 1136 1137 if model.setup.read_pet: 1138 1139 if len(model.setup.pet_directories) > 0: 1140 dir_data = serialize_setup_directories(model.setup.pet_directories) 1141 else: 1142 dir_data = {1: model.setup.pet_directory} 1143 1144 if model.setup.daily_interannual_pet: 1145 _read_interannual_pet(model, directories=dir_data, tz=model.setup.timezone) 1146 else: 1147 _read_atmos_data(model, atmos_data="pet", directories=dir_data) 1148 1149 if model.setup.read_snow: 1150 1151 if len(model.setup.snow_directories) > 0: 1152 dir_data = serialize_setup_directories(model.setup.snow_directories) 1153 else: 1154 dir_data = {1: model.setup.snow_directory} 1155 1156 _read_atmos_data(model, atmos_data="snow", directories=dir_data) 1157 1158 if model.setup.read_temp: 1159 1160 if len(model.setup.temp_directories) > 0: 1161 dir_data = serialize_setup_directories(model.setup.temp_directories) 1162 else: 1163 dir_data = {1: model.setup.temp_directory} 1164 1165 _read_atmos_data(model, atmos_data="temp", directories=dir_data) 1166 1167 if model.setup.prcp_partitioning: 1168 1169 smash.fcore._mw_atmos_statistic.compute_prcp_partitioning( 1170 model.setup, model.mesh, model._input_data 1171 ) 1172 1173 if model.setup.compute_mean_atmos: 1174 1175 print("</> Computing mean atmospheric data") 1176 smash.fcore._mw_atmos_statistic.compute_mean_atmos( 1177 model.setup, model.mesh, model._input_data 1178 )
def
write_atmos_data_aliases(path_to_alias, setup, input_data):
1181def write_atmos_data_aliases(path_to_alias, setup, input_data): 1182 1183 for atmos_data, directories in input_data.items(): 1184 1185 atmos_data_format = getattr(setup, f"{atmos_data}_format") 1186 atmos_date_pattern = getattr(setup, f"{atmos_data}_date_pattern") 1187 prcp_access = getattr(setup, f"{atmos_data}_access") 1188 dt = getattr(setup, "dt") 1189 1190 date_range = pd.date_range( 1191 start=setup.start_time, 1192 end=setup.end_time, 1193 freq=f"{int(setup.dt)}s", 1194 )[1:] 1195 1196 array_file = None 1197 for atmos_data_directory in directories: 1198 1199 files = _get_atmos_files( 1200 atmos_data_directory, 1201 atmos_data_format, 1202 prcp_access, 1203 dt, 1204 date_range, 1205 atmos_date_pattern, 1206 ) 1207 1208 if array_file is None: 1209 array_file = np.array(files, dtype=str) 1210 lacuna = np.where(array_file == "-1") 1211 else: 1212 array_file[lacuna] = np.array(files)[lacuna] 1213 lacuna = np.where(array_file == "-1") 1214 1215 if len(lacuna) == 0: 1216 break 1217 1218 if os.path.exists(os.path.join(path_to_alias, f"tmp_{atmos_data}")): 1219 shutil.rmtree(os.path.join(path_to_alias, f"tmp_{atmos_data}")) 1220 1221 if not os.path.exists(os.path.join(path_to_alias, f"tmp_{atmos_data}")): 1222 os.mkdir(os.path.join(path_to_alias, f"tmp_{atmos_data}")) 1223 1224 for i in range(len(array_file)): 1225 if array_file[i] != "-1": 1226 mytarget = array_file[i] 1227 mylink = os.path.join( 1228 path_to_alias, f"tmp_{atmos_data}", os.path.basename(array_file[i]) 1229 ) 1230 os.symlink(mytarget, mylink)