smashbox.read_inputdata.read_data

Created on Tue Jun 25 11:11:03 2024

@author: maxime

   1#!/usr/bin/env python3
   2# -*- coding: utf-8 -*-
   3"""
   4Created on Tue Jun 25 11:11:03 2024
   5
   6@author: maxime
   7"""
   8import glob
   9import os
  10import re
  11import warnings
  12import numpy as np
  13import pandas as pd
  14import rasterio
  15from smash.fcore._mwd_mesh import MeshDT
  16import shutil
  17import tqdm
  18import smash
  19import pytz
  20from pathlib import Path
  21import time
  22from smashbox.tools import geo_toolbox
  23from smashbox.tools import tools
  24
  25RATIO_PET_HOURLY = np.array(
  26    [
  27        0,
  28        0,
  29        0,
  30        0,
  31        0,
  32        0,
  33        0,
  34        0.035,
  35        0.062,
  36        0.079,
  37        0.097,
  38        0.11,
  39        0.117,
  40        0.117,
  41        0.11,
  42        0.097,
  43        0.079,
  44        0.062,
  45        0.035,
  46        0,
  47        0,
  48        0,
  49        0,
  50        0,
  51    ],
  52    dtype=np.float32,
  53)
  54
  55# Compute the time laps between 2 timezones
  56
  57
  58def tz_diff(date, tz1, tz2):
  59    """
  60    Returns the difference in hours between timezone1 and timezone2
  61    for a given date.
  62    """
  63    date = pd.to_datetime(date)
  64    return (tz1.localize(date) - tz2.localize(date).astimezone(tz1)).seconds / 3600
  65
  66
  67# Must be done char by char to take account of various format. Must be documented.
  68# Can't support all date formatter.
  69# Avantage: can work even file are not sorted with the date.
  70
  71
  72def _build_date_regex_pattern(date_pattern) -> str:
  73    # Supported date formatters
  74    datefmt = {"%Y": 4, "%m": 2, "%d": 2, "%H": 2, "%M": 2, "%S": 2}
  75
  76    regex = ""
  77    i = 0
  78    while i < len(date_pattern):
  79        if date_pattern[i : i + 2] in datefmt:
  80            regex += r"\d{" + str(datefmt[date_pattern[i : i + 2]]) + "}"
  81            i = i + 2
  82        else:
  83            regex += r"\\" + date_pattern[i]
  84            i = i + 1
  85
  86    return regex
  87
  88
  89# fonction which split the date_pattern in 2 part:
  90# 1)the date pattern
  91# 2)the position of the occurence of the date pattern when searching it in a text
  92# Exemple: date_pattern="%Y%m%d%H%1"  return "%Y%m%d%H" and 1
  93
  94
  95def _split_date_occurence(date_pattern) -> str:
  96    re_match = re.search(r"\%[0-9]+$", date_pattern)
  97    if re_match is not None:
  98        pattern = date_pattern[0 : int(re_match.span()[0])]
  99        occurence = int(re_match.group()[1])
 100        return pattern, occurence
 101    else:
 102        return date_pattern, 0
 103
 104
 105# We suppose that the atmos file are sorted in ascendent order with the date
 106# (this a drawback of the method) ! Must be documented !
 107# Only used for daily internanuel pet
 108
 109
 110def _check_files_containing_date(
 111    files: str, date: pd.Timestamp, date_pattern: str
 112) -> int:
 113    date_string = date.strftime(date_pattern)
 114    ind = -1
 115    for i, f in enumerate(files):
 116        # print(i,date_string,os.path.basename(f))
 117        if date_string in os.path.basename(f):
 118            ind = i
 119
 120    return ind
 121
 122
 123# Test if files are sorted according the date
 124# Test if the model time step fit with the data
 125# Create a n samples of niter contigous date
 126# return True or false
 127
 128
def _sample_delta_time_in_file(files, date_pattern, dt):
    """Sample the file list to check date ordering and time step.

    Visits up to ``nsample`` evenly spaced blocks of ``niter`` consecutive
    file pairs, extracts the date embedded in each basename and checks
    that (a) consecutive dates never decrease and (b) the smallest sampled
    time delta equals the model time step *dt* (seconds).

    Returns
    -------
    bool
        True when the files appear sorted at the right time step,
        False otherwise (including fewer than 3 files).

    Raises
    ------
    ValueError
        When files are sorted but not at time step *dt*, or when the date
        pattern is never found in the sampled filenames.
    """
    d_pattern, occurence = _split_date_occurence(date_pattern)
    regex_date = _build_date_regex_pattern(d_pattern)

    # Too few files to estimate a time step
    if len(files) <= 2:
        return False

    nsample = 5
    niter = 100
    list_delta_time = []  # sampled deltas between consecutive files (s)
    list_date_block = []  # first date of each sampled block

    for i in range(0, len(files), max(int(len(files) / nsample), niter)):

        for j in range(min(niter, len(files) - i - 1)):

            re_match_0 = re.findall(regex_date, os.path.basename(files[i + j]))
            re_match_1 = re.findall(regex_date, os.path.basename(files[i + j + 1]))

            if len(re_match_0) > 0 and len(re_match_1) > 0:
                date_match_0 = pd.to_datetime(re_match_0[occurence], format=d_pattern)
                date_match_1 = pd.to_datetime(re_match_1[occurence], format=d_pattern)
                delta_time = date_match_1 - date_match_0
                list_delta_time.append(delta_time.total_seconds())

                # Keep the first date of the block to compare blocks later
                if j == 0:
                    list_date_block.append(date_match_0)

                # Two consecutive files going back in time: not sorted
                if delta_time.total_seconds() < 0:
                    return False

    # Test if all sample date block are in ascendent order
    if len(list_date_block) > 1:
        for i in range(len(list_date_block) - 1):
            delta_time = list_date_block[i + 1] - list_date_block[i]

            if delta_time.total_seconds() < 0.0:
                return False

    if len(list_delta_time) > 0:
        if np.min(np.array(list_delta_time)) != dt:
            # sorted but not good time-step
            raise ValueError(
                "Precipitation files are sorted with the date pattern but not at the good time-step:"
                f" prcp time-step={list_delta_time[0]}, model time-step={dt}"
            )

        # Finally test if all date are in ascendent order
        if np.all(np.array(list_delta_time) > 0):
            return True
        else:
            return False

    else:
        raise ValueError(
            "Date Pattern was not found in the list of the precipitation files"
            f" while searching pattern={d_pattern} with corresponding regex={regex_date}"
            f" In filename {files[0]},"
        )
 188
 189
 190# Search algorithm
 191# Search for a pattern in a sorted list of files
 192# return the first position after or equal at date in files
 193
 194
 195def _fast_index_search_for_date(files, date, date_pattern):
 196    # print("search for",date)
 197    d_pattern, occurence = _split_date_occurence(date_pattern)
 198    regex_date = _build_date_regex_pattern(d_pattern)
 199
 200    re_match = re.findall(regex_date, os.path.basename(files[-1]))
 201    if len(re_match) > 0:
 202        date_match = pd.to_datetime(re_match[occurence], format=d_pattern)
 203
 204        # cas1
 205        if date > date_match:
 206            return len(files)
 207
 208    else:
 209        raise ValueError(
 210            f"Date formatter {d_pattern} corresponding to regex {regex_date}"
 211            " not found in filename {os.path.basename(files[i])}"
 212        )
 213
 214    re_match = re.findall(regex_date, os.path.basename(files[0]))
 215    if len(re_match) > 0:
 216        date_match = pd.to_datetime(re_match[occurence], format=d_pattern)
 217
 218        # cas2
 219        if date < date_match:
 220            return 0
 221
 222    else:
 223        raise ValueError(
 224            f"Date formatter {d_pattern} corresponding to regex {regex_date}"
 225            " not found in filename {os.path.basename(files[i])}"
 226        )
 227
 228    # cas3
 229    pos = 0
 230    final_pos = 0
 231    move = 1
 232    previous_move = 0
 233    step = len(files) - 1
 234    nb_iter = 0
 235    while nb_iter < 100:
 236        re_match = re.findall(regex_date, os.path.basename(files[pos]))
 237        date_match = pd.to_datetime(re_match[occurence], format=d_pattern)
 238
 239        # print(
 240        #             "At iteration "
 241        #             + str(nb_iter)
 242        #             +f" searching {date},"
 243        #             + "and match "
 244        #             + date_match.strftime("%Y-%m-%d %H:%M:%S")
 245        #             + f" at pos {pos}"
 246        #         )
 247
 248        if date_match < date:
 249            pos = min(pos + step, len(files) - 1)
 250            move = 1
 251        elif date_match > date:
 252            pos = max(pos - step, 0)
 253            move = -1
 254        elif date_match == date:
 255            final_pos = pos
 256            break
 257
 258        if step == 0:
 259            if date_match < date:
 260                final_pos = pos + 1
 261                break
 262            elif date_match > date:
 263                final_pos = pos
 264                break
 265
 266        # print(f"-> Move to pos {pos} with step {step}")
 267
 268        # reduce step only if we change search direction
 269        if previous_move != move:
 270            step = int(step / 2)
 271
 272        nb_iter = nb_iter + 1
 273        previous_move = move
 274
 275    # print(
 276    #             "At last iteration "
 277    #             + str(nb_iter)
 278    #             +f" searching {date},"
 279    #             + "and match "
 280    #             + date_match.strftime("%Y-%m-%d %H:%M:%S")
 281    #             + f" at pos {pos}"
 282    #         )
 283
 284    # iter>maxlimit:
 285    if date_match < date:
 286        final_pos = pos
 287    elif date_match > date:
 288        # back to previous pos
 289        final_pos = pos - 2 * step
 290
 291    return final_pos
 292
 293
 294# Getting  only files for the specified daterange
 295# work even files are not sorted according the date pattern
 296def _get_files_list_for_date_range(files, date_pattern, date_range):
 297    d_pattern, occurence = _split_date_occurence(date_pattern)
 298    regex_date = _build_date_regex_pattern(d_pattern)
 299
 300    vec_date = []
 301    if len(files) > 0:
 302        for i, f in enumerate(files):
 303            re_match = re.findall(regex_date, os.path.basename(f))
 304
 305            if len(re_match) > 0:
 306                vec_date.append(re_match[occurence])
 307
 308        vec_date = pd.to_datetime(vec_date, format=d_pattern)
 309        vec_date = vec_date.strftime("%Y%m%d%H%M%S")
 310
 311    # convert to numpy array
 312    np_lst_date = np.array(vec_date)
 313    # sort but keep indexes
 314    sorted_indices = np_lst_date.argsort()
 315    # sort according previous indexes
 316    np_lst_date_sorted = np_lst_date[sorted_indices]
 317
 318    np_date_range = np.array(date_range.strftime("%Y%m%d%H%M%S").to_list())
 319
 320    # build the list of index only for the daterange
 321    index_list_for_daterange = []
 322    for i in range(len(np_date_range)):
 323        pos = np.where(np_lst_date_sorted == np_date_range[i])
 324        if len(pos[0]) > 0:
 325            index_list_for_daterange.append(pos[0][0])
 326        else:
 327            index_list_for_daterange.append(-1)
 328
 329    # find the final list of files for daterange (sorted)
 330    final_list_files = []
 331    for index in index_list_for_daterange:
 332        if index >= 0:
 333            final_list_files.append(files[sorted_indices[index]])
 334        else:
 335            final_list_files.append("miss")
 336
 337    return final_list_files
 338
 339
 340# Getting  only files for the specified daterange
 341# work even files are not sorted according the date pattern
 342# Duplicate of _get_files_list_for_date_range but with hack for leap year...
 343def _get_files_list_for_date_range_interannuel(files, date_pattern, date_range):
 344    d_pattern, occurence = _split_date_occurence(date_pattern)
 345    regex_date = _build_date_regex_pattern(d_pattern)
 346
 347    vec_date = []
 348    if len(files) > 0:
 349        for i, f in enumerate(files):
 350            re_match = re.findall(regex_date, os.path.basename(f))
 351            if len(re_match) > 0:
 352                vec_date.append("1904" + re_match[occurence])  # annee bixetile
 353        vec_date = pd.to_datetime(vec_date, format="%Y%m%d")  # match 19040402
 354        vec_date = vec_date.strftime("%Y%m%d%H%M%S")
 355
 356    # convert to numpy array
 357    np_lst_date = np.array(vec_date)
 358    # sort but keep indexes
 359    sorted_indices = np_lst_date.argsort()
 360    # sort according previous indexes
 361    np_lst_date_sorted = np_lst_date[sorted_indices]
 362
 363    np_date_range = np.array(date_range.strftime("%Y%m%d%H%M%S").to_list())
 364
 365    # build the list of index only for the daterange
 366    index_list_for_daterange = []
 367    for i in range(len(np_date_range)):
 368        pos = np.where(np_lst_date_sorted == np_date_range[i])
 369
 370        if len(pos[0]) > 0:
 371            index_list_for_daterange.append(pos[0][0])
 372        else:
 373            index_list_for_daterange.append(-1)
 374
 375    # find the final list of files for daterange (sorted)
 376    final_list_files = []
 377    for index in index_list_for_daterange:
 378        if index >= 0:
 379            final_list_files.append(files[sorted_indices[index]])
 380        else:
 381            final_list_files.append("miss")
 382
 383    return final_list_files
 384
 385
 386# Get the list of files to read for a specific date_range
 387
 388
def _get_atmos_files(
    dir: str,
    fmt: str,
    access: str,
    dt: float,
    date_range: pd.DatetimeIndex,
    date_pattern: str,
    daily_interannual: bool = False,
) -> list[str]:
    """Build the list of atmospheric files covering *date_range*.

    Globs *dir* (optionally per-period subdirectories derived from the
    *access* strftime pattern) for files with extension *fmt*, then maps
    each requested date to exactly one file, inserting the string
    ``"miss"`` for dates without a file.

    Parameters
    ----------
    dir : str
        Root directory of the atmospheric files.
    fmt : str
        File extension to glob (e.g. "tif").
    access : str
        strftime pattern for per-date subdirectories ("" disables it).
    dt : float
        Model time step in seconds.
    date_range : pd.DatetimeIndex
        Simulation dates at the model time step.
    date_pattern : str
        Date pattern embedded in the filenames.
    daily_interannual : bool
        When True, match at most 366 daily interannual files keyed on
        "%m%d" (year 1904 is used internally to handle leap years).

    Returns
    -------
    list[str]
        One entry per requested date ("miss" when the file is absent).
    """
    # Set to drop duplicates after strftime
    date_range_strftime_access = set(date_range.strftime(access)) if access else {""}
    files = []

    # Retry loop: the directory may be temporarily unreachable (e.g. a slow
    # network mount); wait 0.1 s between tries, up to ~2 s in total.
    acces_try = 0
    res = False
    while res == False:

        res = os.access(f"{dir}", os.R_OK)
        if res:
            for date_strftime_access in date_range_strftime_access:
                prcp_path = Path(f"{dir}/{date_strftime_access}/")
                files.extend(sorted(prcp_path.glob(f"**/*{fmt}")))
        else:
            time.sleep(0.1)
            acces_try = acces_try + 1
            print(
                f"</> WARNING: {dir} is unvailable ... waiting 2s max and trying again {acces_try}/20..."
            )
            if acces_try > 20:
                print(
                    f"</> WARNING !! atmos files directory {dir} is unaccesible after 20 try (2 seconds) => atmos files are missing !"
                )
                break

    files = sorted(files)

    # Return a list of files in date_range with potential missing files (366 files max)
    if daily_interannual:

        # list of daily date
        date_range_interannual = pd.date_range(
            start=date_range[0].strftime("%Y%m%d"),
            end=date_range[-1].strftime("%Y%m%d"),
            freq="1d",
        )

        # Convert to interannual format; year 1904 (leap) handles Feb 29
        date_range_interannual = pd.to_datetime(
            "1904" + date_range_interannual.strftime("%m%d"), format="%Y%m%d"
        )

        # we build the file list according the date_range (sorted): we have 1 date <=>  1file
        final_list_files = _get_files_list_for_date_range_interannuel(
            files, "%m%d", date_range_interannual
        )

        return final_list_files

    # Return a list of files in date_range with potential missing files
    else:

        # Only here with model time-step
        final_list_files = []
        if len(files) == 0:
            for i in range(len(date_range)):
                final_list_files.append("miss")
            return final_list_files

        # Check if file are sorted with the date pattern (for long list of files)
        if len(files) >= 100:
            # ensure files are alphabetically sorted (glob does not sort list depend on the file system)
            # this operation can be very long ?
            # files.sort()
            test = _sample_delta_time_in_file(files, date_pattern, dt)

            # if sorted we quickly search index of date_start and date_end
            # if not sorted we parse all files et keep the dates
            if test:
                # file seems sorted
                pos_0 = _fast_index_search_for_date(files, date_range[0], date_pattern)
                pos_1 = _fast_index_search_for_date(files, date_range[-1], date_pattern)

                files = files[pos_0 : pos_1 + 1]
            else:
                print(
                    "Warnings, atmospheric filename are not sorted with date."
                    " Reading atmospheric data may take more time."
                )

        # we build the file list according the date_range (sorted): we have 1 date <=>  1file
        final_list_files = _get_files_list_for_date_range(files, date_pattern, date_range)

        return final_list_files
 483
 484
 485# This function seems to work better but require more tests
 486def _read_windowed_raster_new(
 487    path,
 488    mesh: dict | MeshDT | None = None,
 489    transform: dict = None,
 490    order: int = 0,
 491    cval=-99.0,
 492    grid_mode=True,
 493) -> tuple[np.ndarray, dict[str, int]]:
 494    """
 495
 496    Description
 497    -----------
 498
 499    Read a part of a raster and resample it if required.
 500
 501    Paramerters
 502    -----------
 503
 504    path: str()
 505        Input geotif path
 506
 507    mesh: dict() or MeshDT
 508        A smash mesh
 509
 510    transform: dict()
 511        transform dictionnary from rasterio.get_transform() method. (xmin,ymax,xres,yres). May be to replace the mesh
 512
 513
 514    resampling_method: str()
 515        scipy-zoom or numpy use scipy zoom function or numpy array functions
 516
 517    order: int()
 518        order of the resampling cubic interpolation
 519
 520    cval: float() | np.nan
 521        fill value for the extended boundaries
 522
 523    grid_mode: bool()
 524        True | False. if True coordinate start from the edge of the cell. If False coordinate starts from the center of the cell.
 525
 526    Return
 527    ------
 528
 529    numpy.array()
 530    Cropped and resampled array according bbox_out and res_out
 531
 532    """
 533
 534    if transform is not None:
 535        mesh_dict = transform
 536    else:
 537        if not isinstance(mesh, dict):
 538            mesh_dict = tools.read_object_as_dict(mesh)
 539        else:
 540            mesh_dict = mesh
 541
 542    ds_in = rasterio.open(path)
 543    matrix = ds_in.read()
 544
 545    bbox_in = ds_in.bounds._asdict()
 546    res_in = {"dx": ds_in.res[0], "dy": ds_in.res[1]}
 547
 548    bbox_out = geo_toolbox.get_bbox_from_smash_mesh(mesh_dict)
 549    res_out = {"dx": mesh_dict["xres"], "dy": mesh_dict["yres"]}
 550
 551    cropped_array = geo_toolbox.crop_array(
 552        matrix[0, :, :], bbox_in, res_in, bbox_out, res_out, order=order
 553    )
 554
 555    warning = {"res": 0, "overlap": 0, "outofbound": 0}
 556
 557    if res_in["dx"] != res_out["dx"] or res_in["dy"] != res_out["dy"]:
 558        warning["res"] = 1
 559
 560    return cropped_array, warning
 561
 562
 563def _rasterio_read_windowed_raster(path="", mesh: dict = None):
 564    """
 565    Description
 566    -----------
 567
 568    Read a part of a raster with rasterio and resample it if required.
 569
 570    Paramerters
 571    -----------
 572
 573    path: str()
 574        Input geotif path
 575
 576    mesh: dict() or MeshDT
 577        A smash mesh
 578
 579    Return
 580    ------
 581
 582    numpy.array()
 583    Cropped and resampled array according bbox_out and res_out
 584
 585    """
 586
 587    ds_in = rasterio.open(path)
 588
 589    col_off = max(mesh["xmin"] - ds_in.bounds.left, 0) / ds_in.res[0]
 590    row_off = max(ds_in.bounds.top - mesh["ymax"], 0) / ds_in.res[1]
 591    ncols = (mesh["ncol"] * mesh["xres"]) / ds_in.res[0]
 592    nrows = (mesh["nrow"] * mesh["yres"]) / ds_in.res[1]
 593
 594    window = rasterio.windows.Window(
 595        round(col_off), round(row_off), int(ncols), int(nrows)
 596    )
 597
 598    matrix = ds_in.read(
 599        window=window,
 600        out_shape=(
 601            ds_in.count,
 602            mesh["nrow"],
 603            mesh["ncol"],
 604        ),
 605        resampling=rasterio.enums.Resampling.nearest,
 606    )
 607
 608    return matrix
 609
 610
 611# this function has a bug. It shift the grid and fill with nodata value the borders if shifted.
def _read_windowed_raster(
    path, mesh: MeshDT | None = None, transform: dict = None
) -> tuple[np.ndarray, dict[str, int]]:
    """Read the mesh window of a raster into a (nrow, ncol) float32 buffer.

    Out-of-bound areas are filled with -99 and the dataset's nodata value
    is remapped to -99. Returns the buffer and a warning dict with flags
    "res" (resolution mismatch), "overlap" (grid overlap mismatch) and
    "outofbound" (1 = partially, 2 = totally out of bound).

    NOTE (module author): this function has a known bug — it can shift the
    grid and fill the borders with nodata when shifted.
    """
    warning = {"res": 0, "overlap": 0, "outofbound": 0}

    # NOTE(review): *transform* replaces *mesh* here but is then accessed
    # through attributes (mesh.xres, ...), so a plain dict would fail —
    # confirm the expected type of *transform*.
    if transform is not None:
        mesh = transform

    # % Get raster information
    with rasterio.open(path) as ds:
        transform = ds.get_transform()
        xmin = transform[0]
        ymax = transform[3]
        xres = transform[1]
        yres = -transform[5]

        # % Manage absolute tolerance wrt to xres or yres value
        atol = 1e-2
        xatol = atol * 10 ** min(0, np.floor(np.log10(np.abs(xres))))
        yatol = atol * 10 ** min(0, np.floor(np.log10(np.abs(yres))))

        # % Resolution missmatch
        if not np.isclose(mesh.xres, xres, atol=xatol) or not np.isclose(
            mesh.yres, yres, atol=yatol
        ):
            warning["res"] = 1

        # % Overlap missmatch: fractional part of the offset in cells
        dxmin = mesh.xmin - xmin
        dymax = ymax - mesh.ymax
        xol_match = np.abs(dxmin / xres - np.round(dxmin / xres))
        yol_match = np.abs(dymax / yres - np.round(dymax / yres))
        if not np.isclose(xol_match, 0, atol=xatol) or not np.isclose(
            yol_match, 0, atol=yatol
        ):
            warning["overlap"] = 1

        # # % Allocate buffer, pre-filled with the no-data value
        arr = np.zeros(shape=(mesh.nrow, mesh.ncol), dtype=np.float32)
        arr.fill(np.float32(-99))

        # % Pad offset to the nearest integer
        xoff = np.rint((mesh.xmin - xmin) / xres)
        yoff = np.rint((ymax - mesh.ymax) / yres)

        # % Resolution ratio
        xres_ratio = mesh.xres / xres
        yres_ratio = mesh.yres / yres

        # Reading window size in source pixels
        win_xsize = np.rint(mesh.ncol * xres_ratio)
        win_ysize = np.rint(mesh.nrow * yres_ratio)

        # % Totally out of bound
        # % Return the buffer with no data
        if (
            xoff >= ds.width
            or yoff >= ds.height
            or xoff + win_xsize <= 0
            or yoff + win_ysize <= 0
        ):
            warning["outofbound"] = 2
            return arr, warning

        # % Partially out of bound
        if (
            xoff < 0
            or yoff < 0
            or xoff + win_xsize > ds.width
            or yoff + win_ysize > ds.height
        ):
            warning["outofbound"] = 1

        # % Readjust offset (clip to the raster)
        pxoff = max(0, xoff)
        pyoff = max(0, yoff)

        # % Readjust reading window
        pwin_xsize = min(win_xsize + min(0, xoff), ds.width - pxoff)
        pwin_ysize = min(win_ysize + min(0, yoff), ds.height - pyoff)

        # % Buffer slices (clipped window mapped back to mesh cells)
        xs = pxoff - xoff
        ys = pyoff - yoff
        xe = xs + pwin_xsize
        ye = ys + pwin_ysize
        arr_xslice = slice(int(xs * 1 / xres_ratio), int(xe * 1 / xres_ratio))
        arr_yslice = slice(int(ys * 1 / yres_ratio), int(ye * 1 / yres_ratio))

        # % Reading and writting into buffer
        ds.read(
            1,
            window=rasterio.windows.Window(pxoff, pyoff, pwin_xsize, pwin_ysize),
            out=arr[arr_yslice, arr_xslice],
        )

        # % Manage NoData: remap the dataset's nodata value to -99
        nodata = ds.nodata
        if nodata is not None:
            arr = np.where(arr == nodata, -99.0, arr)

        return arr, warning
 714
 715
 716def _get_reading_warning_message(reading_warning: dict[str, int | list[str]]) -> str:
 717    msg = []
 718
 719    if reading_warning["got"]:
 720        if len(reading_warning["got"]) > 1:
 721            msg.append(
 722                f"Reading sucess: read {len(reading_warning['got'])} file(s): "
 723                f"{reading_warning['got'][0]}, ..., {reading_warning['got'][0-1]}"
 724            )
 725        else:
 726            msg.append(
 727                f"Reading sucess: read {len(reading_warning['got'])} file(s): "
 728                f"{reading_warning['got'][0]}"
 729            )
 730
 731    if reading_warning["miss"]:
 732        msg.append(
 733            f"Missing warning: missing {len(reading_warning['miss'])} file(s): "
 734            f"{reading_warning['miss']}"
 735        )
 736
 737    if reading_warning["res"]:
 738        msg.append(
 739            "Resolution warning: resolution missmatch between mesh and at least one raster file. Nearest "
 740            "neighbour resampling algorithm has been used to match mesh resolution"
 741        )
 742
 743    if reading_warning["overlap"]:
 744        msg.append(
 745            "Overlap warning: overlap missmatch between mesh and at least one raster file. Cropping domain "
 746            "has been updated to the nearest overlapping cell"
 747        )
 748
 749    if reading_warning["outofbound"]:
 750        kind = "partially" if reading_warning["outofbound"] == 1 else "totally"
 751        msg.append(
 752            f"Out of bound warning: mesh is {kind} out of bound for at least one raster file. Out of bound "
 753            f"domain has been filled in with no data"
 754        )
 755
 756    return "\n".join(msg)
 757
 758
 759# TODO: Unique fun to read prcp, snow and temp
def _read_atmos_data(model, atmos_data, directories):
    """Read one atmospheric variable into the model storage.

    Builds the simulation date range, resolves exactly one file per time
    step from the candidate *directories* (later directories only fill the
    gaps left by earlier ones), reads each raster into the sparse or dense
    storage of ``model._input_data.atmos_data``, and emits a warning
    summarizing missing files and grid mismatches.

    Parameters
    ----------
    model : smash model object
        Its setup, mesh and _input_data attributes are read and updated
        in place.
    atmos_data : str
        Variable name used to resolve setup attributes
        (``{atmos_data}_format``, ``{atmos_data}_date_pattern``) and the
        storage slot (``sparse_{atmos_data}`` or ``{atmos_data}``).
    directories : dict
        Candidate directories, visited in sorted-key order.
    """

    # Simulation dates, dropping start_time itself (first step is start+dt)
    date_range = pd.date_range(
        start=model.setup.start_time,
        end=model.setup.end_time,
        freq=f"{int(model.setup.dt)}s",
    )[1:]

    reading_warning = {"got": [], "miss": [], "res": 0, "overlap": 0, "outofbound": 0}

    # atmos_data_directory = getattr(model.setup, f"{atmos_data}_directory")
    atmos_data_format = getattr(model.setup, f"{atmos_data}_format")
    atmos_date_pattern = getattr(model.setup, f"{atmos_data}_date_pattern")
    # NOTE(review): prcp_access and prcp_conversion_factor are applied to
    # every atmos_data variable, not only "prcp" — confirm this is intended.
    prcp_access = getattr(model.setup, "prcp_access")
    prcp_conversion_factor = getattr(model.setup, "prcp_conversion_factor")
    sparse_storage = getattr(model.setup, "sparse_storage")
    dt = getattr(model.setup, "dt")

    # Select the destination storage (sparse or dense)
    sparse_matrix = None
    std_storage = None
    if sparse_storage:
        sparse_matrix = getattr(model._input_data.atmos_data, f"sparse_{atmos_data}")
    else:
        std_storage = getattr(model._input_data.atmos_data, f"{atmos_data}")

    if atmos_data_format == "tif":

        # Resolve one file per date; each subsequent directory only fills
        # the dates still missing ("lacuna") from the previous ones.
        file_list = []
        keys = sorted(directories.keys())
        for key in keys:

            # atmos_data_directory = directories[i]
            atmos_data_directory = directories[key]
            files = _get_atmos_files(
                atmos_data_directory,
                atmos_data_format,
                prcp_access,
                dt,
                date_range,
                atmos_date_pattern,
            )

            if len(file_list) == 0:
                file_list = files

                lacuna = []
                for k, f in enumerate(files):
                    if f == "miss":
                        lacuna.append(k)
            else:

                for k in lacuna:
                    file_list[k] = files[k]

                lacuna = []
                for k, f in enumerate(files):
                    if f == "miss":
                        lacuna.append(k)

            # All dates covered: no need to scan further directories
            if len(lacuna) == 0:
                break

        for i, date in enumerate(tqdm.tqdm(date_range, desc=f"</> Reading {atmos_data}")):
            atmos_file = file_list[i]
            if atmos_file == "miss":
                reading_warning["miss"].append(
                    date.strftime("%Y-%m-%d %H:%M")
                    + f", (matching pattern {date.strftime(atmos_date_pattern)})"
                )
                # Missing file: fill the step with the no-data value (-99)
                if sparse_storage:
                    matrix = np.zeros(
                        shape=(model.mesh.nrow, model.mesh.ncol),
                        dtype=np.float32,
                        order="F",
                    )
                    matrix.fill(np.float32(-99))
                    smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix(
                        model.mesh,
                        matrix,
                        np.float32(-99),
                        sparse_matrix[i],
                    )

                else:
                    std_storage[..., i] = np.float32(-99)

            else:
                reading_warning["got"].append(
                    date.strftime("%Y-%m-%d %H:%M") + f" ({os.path.basename(atmos_file)})"
                )

                matrix, warning = _read_windowed_raster_new(atmos_file, model.mesh)
                matrix *= prcp_conversion_factor
                # Keep only the first non-zero flag of each warning kind
                reading_warning.update(
                    {k: v for k, v in warning.items() if not reading_warning[k]}
                )

                if sparse_storage:
                    smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix(
                        model.mesh,
                        matrix,
                        np.float32(0),
                        sparse_matrix[i],
                    )

                else:
                    std_storage[..., i] = matrix

                # files = files[1:]

            # NOTE(review): this setattr runs on every loop iteration and
            # branches on the truthiness of sparse_matrix (not on
            # sparse_storage) — confirm this is intended.
            if sparse_matrix:
                setattr(
                    model._input_data.atmos_data, f"sparse_{atmos_data}", sparse_matrix
                )
            else:
                setattr(model._input_data.atmos_data, f"{atmos_data}", std_storage)

    # % WIP
    elif atmos_data_format == "nc":
        raise NotImplementedError("NetCDF format not implemented yet")

    msg = _get_reading_warning_message(reading_warning)

    if msg:
        warnings.warn(f"Warning(s) linked to {atmos_data} reading.\n{msg}", stacklevel=2)
 885
 886
 887def _read_interannual_pet(model, directories, tz="UTC"):
 888    date_range = pd.date_range(
 889        start=model.setup.start_time,
 890        end=model.setup.end_time,
 891        freq=f"{int(model.setup.dt)}s",
 892    )[1:]
 893
 894    reading_warning = {"got": [], "miss": [], "res": 0, "overlap": 0, "outofbound": 0}
 895
 896    if model.setup.pet_format == "tif":
 897
 898        file_list = []
 899        keys = sorted(directories.keys())
 900        for key in keys:
 901
 902            atmos_data_directory = directories[key]
 903
 904            files = _get_atmos_files(
 905                atmos_data_directory,
 906                model.setup.pet_format,
 907                model.setup.pet_access,
 908                model.setup.dt,
 909                date_range,
 910                model.setup.pet_date_pattern,
 911                model.setup.daily_interannual_pet,
 912            )
 913
 914            if len(file_list) == 0:
 915                file_list = files
 916
 917                lacuna = []
 918                for k, f in enumerate(files):
 919                    if f == "miss":
 920                        lacuna.append(k)
 921            else:
 922
 923                for k in lacuna:
 924                    file_list[k] = files[k]
 925
 926                lacuna = []
 927                for k, f in enumerate(files):
 928                    if f == "miss":
 929                        lacuna.append(k)
 930
 931            if len(lacuna) == 0:
 932                break
 933
 934        if model.setup.daily_interannual_pet:
 935            leap_year_days = pd.date_range(
 936                start="202001010000", end="202012310000", freq="1D"
 937            )
 938            step_offset = int(
 939                (date_range[0] - date_range[0].floor("D")).total_seconds()
 940                / model.setup.dt
 941            )
 942            nstep_per_day = int(86_400 / model.setup.dt)
 943            hourly_ratio = 3_600 / model.setup.dt
 944
 945            ratio_shift = tz_diff(
 946                "2020/01/01 12:00", pytz.timezone("UTC"), pytz.timezone(tz)
 947            )
 948            RATIO_PET_HOURLY_SHIFTED = np.roll(RATIO_PET_HOURLY, int(-1 * ratio_shift))
 949
 950            if hourly_ratio >= 1:
 951                ratio = np.repeat(RATIO_PET_HOURLY_SHIFTED, hourly_ratio) / hourly_ratio
 952
 953            else:
 954                ratio = np.sum(
 955                    RATIO_PET_HOURLY_SHIFTED.reshape(-1, int(1 / hourly_ratio)), axis=1
 956                )
 957
 958            matrix_dip = np.zeros(
 959                shape=(model.mesh.nrow, model.mesh.ncol, len(leap_year_days)),
 960                dtype=np.float32,
 961            )
 962            missing_day = np.empty(shape=0, dtype=np.int32)
 963
 964            list_date_in_daterange = date_range.strftime("%m%d")
 965            list_date_in_leap_year_days = leap_year_days.strftime("%m%d")
 966
 967            for i, date in enumerate(
 968                tqdm.tqdm(leap_year_days, desc="</> Reading daily interannual pet")
 969            ):
 970                if list_date_in_leap_year_days[i] in list_date_in_daterange:
 971                    ind = _check_files_containing_date(file_list, date, "%m%d")
 972
 973                    ind_date = list(list_date_in_daterange).index(
 974                        list_date_in_leap_year_days[i]
 975                    )
 976                    true_date = date_range[ind_date]
 977
 978                    if ind == -1:
 979                        reading_warning["miss"].append(
 980                            date_range[ind_date].strftime("%Y-%m-%d")
 981                            + f", (matching pattern {true_date.strftime('%m%d')})"
 982                        )
 983                        missing_day = np.append(date, true_date)
 984                    else:
 985                        reading_warning["got"].append(
 986                            true_date.strftime("%Y-%m-%d")
 987                            + f" ({os.path.basename(file_list[ind])})"
 988                        )
 989                        matrix_dip[..., i], warning = _read_windowed_raster_new(
 990                            file_list[ind], model.mesh
 991                        )
 992                        matrix_dip[..., i] *= model.setup.pet_conversion_factor
 993                        reading_warning.update(
 994                            {k: v for k, v in warning.items() if not reading_warning[k]}
 995                        )
 996
 997            for i, date in enumerate(
 998                tqdm.tqdm(date_range, desc="</> Disaggregating daily interannual pet")
 999            ):
1000                ratio_ind = (i + step_offset) % nstep_per_day
1001
1002                if date in missing_day:
1003                    if model.setup.sparse_storage:
1004                        matrix = np.zeros(
1005                            shape=(model.mesh.nrow, model.mesh.ncol),
1006                            dtype=np.float32,
1007                            order="F",
1008                        )
1009                        matrix.fill(np.float32(-99))
1010                        smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix(
1011                            model.mesh,
1012                            matrix,
1013                            np.float32(-99),
1014                            model._input_data.atmos_data.sparse_pet[i],
1015                        )
1016                    else:
1017                        model._input_data.atmos_data.pet[..., i] = np.float32(-99)
1018
1019                else:
1020                    index = list_date_in_leap_year_days.get_loc(list_date_in_daterange[i])
1021                    matrix = matrix_dip[..., index] * ratio[ratio_ind]
1022                    if model.setup.sparse_storage:
1023                        smash.fcore._mwd_sparse_matrix_manipulation.matrix_to_sparse_matrix(
1024                            model.mesh,
1025                            matrix,
1026                            np.float32(0),
1027                            model._input_data.atmos_data.sparse_pet[i],
1028                        )
1029                    else:
1030                        model._input_data.atmos_data.pet[..., i] = matrix
1031    # % WIP
1032    elif model.setup.pet_format == "nc":
1033        raise NotImplementedError("NetCDF format not implemented yet")
1034
1035    msg = _get_reading_warning_message(reading_warning)
1036
1037    if msg:
1038        warnings.warn(
1039            f"Warning(s) linked to potential evapotranspiration reading.\n{msg}",
1040            stacklevel=2,
1041        )
1042
1043
def _read_qobs(model):
    """
    Read observed discharge time series for every gauged catchment of the mesh.

    For each catchment code in ``model.mesh.code``, the first CSV found
    recursively under ``model.setup.qobs_directory`` whose filename contains
    the code is read.  The CSV is expected to hold a single column whose
    HEADER is the timestamp of the first record, with data rows spaced by
    ``model.setup.dt`` seconds.  Values overlapping the simulation period are
    copied into ``model._input_data.response_data.q[i, :]``; other entries
    are left untouched.  Warnings are emitted for files that do not cover the
    simulation period and for catchments with no file at all.

    Raises
    ------
    ValueError
        If the column header of a matched CSV is not a parseable date.
    """
    start_time = pd.Timestamp(model.setup.start_time)
    end_time = pd.Timestamp(model.setup.end_time)
    miss = []  # catchment codes with no matching discharge file

    for i, c in enumerate(model.mesh.code):
        f = glob.glob(f"{model.setup.qobs_directory}/**/*{c}*.csv", recursive=True)

        if f:
            dat = pd.read_csv(f[0])
            # The column header itself carries the series start date.
            try:
                file_start_time = pd.Timestamp(dat.columns[0])
            except Exception:
                raise ValueError(
                    f"Column header '{dat.columns[0]}' in the observed discharge file for catchment '{c}' "
                    f"is not a valid date"
                ) from None

            file_end_time = file_start_time + pd.Timedelta(
                seconds=model.setup.dt * (len(dat) - 1)
            )
            # Step offsets of the simulation bounds relative to the file start
            # (+1 presumably accounts for the header row holding the start
            # date — TODO confirm against the file layout).
            start_diff = (
                int((start_time - file_start_time).total_seconds() / model.setup.dt) + 1
            )
            end_diff = (
                int((end_time - file_start_time).total_seconds() / model.setup.dt) + 1
            )

            # % Check if observed discharge file contains data for corresponding simulation period
            if start_diff > dat.index.max() or end_diff < 0:
                warnings.warn(
                    f"The provided observed discharge file for catchment '{c}' does not contain data for the "
                    f"selected simulation period ['{start_time}', '{end_time}']. The file covers the period "
                    f"['{file_start_time}', '{file_end_time}']",
                    stacklevel=2,
                )
            else:
                # Clip the overlap window to both the file and the target array.
                ind_start_dat = max(0, start_diff)
                ind_end_dat = min(dat.index.max(), end_diff)
                ind_start_arr = max(0, -start_diff)
                ind_end_arr = ind_start_arr + ind_end_dat - ind_start_dat

                # NOTE(review): `iloc` end bounds are exclusive, so the row at
                # `dat.index.max()` (the last record) is never copied —
                # confirm whether this off-by-one is intended.
                model._input_data.response_data.q[i, ind_start_arr:ind_end_arr] = (
                    dat.iloc[ind_start_dat:ind_end_dat, 0]
                )
        else:
            miss.append(c)

    if miss:
        warnings.warn(
            f"Missing {len(miss)} observed discharge file(s): {miss}", stacklevel=2
        )
1096
1097
def serialize_setup_directories(directories):
    """
    Normalize a setup "directories" option into a single dict.

    Accepted forms are a dict (returned as-is) or a non-empty list whose
    first element is a dict (that element is returned).

    Parameters
    ----------
    directories : dict or list of dict
        Raw value read from the setup.

    Returns
    -------
    dict
        Mapping of priority key -> directory path.

    Raises
    ------
    TypeError
        If `directories` is neither a dict nor a list starting with a dict.
    """
    if isinstance(directories, dict):
        return directories

    # A list previously raised a bare `Exception` (and an IndexError when
    # empty); any other type fell through to an UnboundLocalError.  Raise an
    # explicit TypeError instead (still caught by `except Exception`).
    if isinstance(directories, list) and directories and isinstance(directories[0], dict):
        return directories[0]

    raise TypeError(
        f"Bad syntax for {directories} in setup. It should be a dictionary"
    )
1113
1114
def read_atmos_data(model, setup_options=None):
    """
    Read every enabled input dataset (qobs, prcp, pet, snow, temp) into
    ``model._input_data`` and run the optional post-processing statistics.

    Parameters
    ----------
    model : smash Model-like object
        Provides ``setup`` flags/paths, ``mesh`` and ``_input_data``
        (filled in place).
    setup_options : dict, optional
        Overrides applied to ``model.setup`` before reading; keys that are
        not existing setup attributes are silently ignored.
        (BUG FIX: was a mutable default argument ``{}``.)
    """
    if setup_options is None:
        setup_options = {}

    # Apply setup overrides before any reading happens.
    for key, value in setup_options.items():
        if hasattr(model.setup, key):
            setattr(model.setup, key, value)

    if model.setup.read_qobs:
        _read_qobs(model)

    # For each atmospheric variable: prefer the multi-directory option
    # (`*_directories`, priority-ordered) and fall back to the single
    # `*_directory` wrapped as a one-entry dict.
    if model.setup.read_prcp:
        if len(model.setup.prcp_directories) > 0:
            dir_data = serialize_setup_directories(model.setup.prcp_directories)
        else:
            dir_data = {1: model.setup.prcp_directory}

        _read_atmos_data(model, atmos_data="prcp", directories=dir_data)

    if model.setup.read_pet:
        if len(model.setup.pet_directories) > 0:
            dir_data = serialize_setup_directories(model.setup.pet_directories)
        else:
            dir_data = {1: model.setup.pet_directory}

        if model.setup.daily_interannual_pet:
            _read_interannual_pet(model, directories=dir_data, tz=model.setup.timezone)
        else:
            _read_atmos_data(model, atmos_data="pet", directories=dir_data)

    if model.setup.read_snow:
        if len(model.setup.snow_directories) > 0:
            dir_data = serialize_setup_directories(model.setup.snow_directories)
        else:
            dir_data = {1: model.setup.snow_directory}

        # BUG FIX: this call was indented inside the `else` branch, so snow
        # data was only read when `snow_directories` was empty (compare with
        # the prcp/pet branches above).
        _read_atmos_data(model, atmos_data="snow", directories=dir_data)

    if model.setup.read_temp:
        if len(model.setup.temp_directories) > 0:
            dir_data = serialize_setup_directories(model.setup.temp_directories)
        else:
            dir_data = {1: model.setup.temp_directory}

        # BUG FIX: same mis-indentation as the snow branch.
        _read_atmos_data(model, atmos_data="temp", directories=dir_data)

    if model.setup.prcp_partitioning:
        smash.fcore._mw_atmos_statistic.compute_prcp_partitioning(
            model.setup, model.mesh, model._input_data
        )

    if model.setup.compute_mean_atmos:
        print("</> Computing mean atmospheric data")
        smash.fcore._mw_atmos_statistic.compute_mean_atmos(
            model.setup, model.mesh, model._input_data
        )
1178
1179
def write_atmos_data_aliases(path_to_alias, setup, input_data):
    """
    Create symlink aliases gathering, for each atmospheric variable, the
    input files resolved across its fallback directories.

    For every ``atmos_data -> directories`` entry, the file lists of the
    directories are merged (later directories only fill the gaps, marked
    "-1", left by earlier ones), then each resolved file is symlinked into
    a freshly recreated ``<path_to_alias>/tmp_<atmos_data>`` folder.

    Parameters
    ----------
    path_to_alias : str
        Destination root for the ``tmp_*`` alias folders (recreated from
        scratch on each call).
    setup : setup object
        Provides ``start_time``, ``end_time``, ``dt`` and the per-variable
        ``*_format`` / ``*_date_pattern`` / ``*_access`` attributes.
    input_data : dict
        Mapping of atmospheric variable name to an iterable of directories.
    """
    for atmos_data, directories in input_data.items():
        atmos_data_format = getattr(setup, f"{atmos_data}_format")
        atmos_date_pattern = getattr(setup, f"{atmos_data}_date_pattern")
        # Renamed from `prcp_access`: this is the access pattern of whichever
        # variable is being processed, not necessarily precipitation.
        atmos_access = getattr(setup, f"{atmos_data}_access")
        dt = setup.dt

        # Simulation dates; the first step is dropped (forcings apply to the
        # end of each time step).
        date_range = pd.date_range(
            start=setup.start_time,
            end=setup.end_time,
            freq=f"{int(setup.dt)}s",
        )[1:]

        array_file = None
        lacuna = None  # tuple of index arrays where array_file == "-1"
        for atmos_data_directory in directories:
            files = _get_atmos_files(
                atmos_data_directory,
                atmos_data_format,
                atmos_access,
                dt,
                date_range,
                atmos_date_pattern,
            )

            if array_file is None:
                array_file = np.array(files, dtype=str)
            else:
                # Fill only the still-missing slots from this directory.
                array_file[lacuna] = np.array(files)[lacuna]

            lacuna = np.where(array_file == "-1")

            # BUG FIX: `np.where` returns a 1-tuple of index arrays, so
            # `len(lacuna)` was always 1 and the early break never fired.
            if lacuna[0].size == 0:
                break

        # Recreate the alias folder from scratch.
        tmp_dir = os.path.join(path_to_alias, f"tmp_{atmos_data}")
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        os.mkdir(tmp_dir)

        for i in range(len(array_file)):
            if array_file[i] != "-1":
                mytarget = array_file[i]
                mylink = os.path.join(tmp_dir, os.path.basename(array_file[i]))
                os.symlink(mytarget, mylink)
RATIO_PET_HOURLY = array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.035, 0.062, 0.079, 0.097, 0.11 , 0.117, 0.117, 0.11 , 0.097, 0.079, 0.062, 0.035, 0. , 0. , 0. , 0. , 0. ], dtype=float32)
def tz_diff(date, tz1, tz2):
59def tz_diff(date, tz1, tz2):
60    """
61    Returns the difference in hours between timezone1 and timezone2
62    for a given date.
63    """
64    date = pd.to_datetime(date)
65    return (tz1.localize(date) - tz2.localize(date).astimezone(tz1)).seconds / 3600

Returns the difference in hours between timezone1 and timezone2 for a given date.

def serialize_setup_directories(directories):
1099def serialize_setup_directories(directories):
1100
1101    if isinstance(directories, list):
1102
1103        if isinstance(directories[0], dict):
1104            dir_data = directories[0]
1105        else:
1106            raise Exception(
1107                f"Bad syntaxe for {directories} in setup. It should be a dictionnary"
1108            )
1109
1110    elif isinstance(directories, dict):
1111        dir_data = directories
1112
1113    return dir_data
def read_atmos_data(model, setup_options={}):
1116def read_atmos_data(model, setup_options={}):
1117
1118    # update attributes
1119    for key, value in setup_options.items():
1120        if hasattr(model.setup, key):
1121            setattr(model.setup, key, value)
1122
1123    if model.setup.read_qobs:
1124
1125        _read_qobs(model)
1126
1127    # with rasterio.Env():
1128    if model.setup.read_prcp:
1129
1130        if len(model.setup.prcp_directories) > 0:
1131            dir_data = serialize_setup_directories(model.setup.prcp_directories)
1132        else:
1133            dir_data = {1: model.setup.prcp_directory}
1134
1135        _read_atmos_data(model, atmos_data="prcp", directories=dir_data)
1136
1137    if model.setup.read_pet:
1138
1139        if len(model.setup.pet_directories) > 0:
1140            dir_data = serialize_setup_directories(model.setup.pet_directories)
1141        else:
1142            dir_data = {1: model.setup.pet_directory}
1143
1144        if model.setup.daily_interannual_pet:
1145            _read_interannual_pet(model, directories=dir_data, tz=model.setup.timezone)
1146        else:
1147            _read_atmos_data(model, atmos_data="pet", directories=dir_data)
1148
1149    if model.setup.read_snow:
1150
1151        if len(model.setup.snow_directories) > 0:
1152            dir_data = serialize_setup_directories(model.setup.snow_directories)
1153        else:
1154            dir_data = {1: model.setup.snow_directory}
1155
1156            _read_atmos_data(model, atmos_data="snow", directories=dir_data)
1157
1158    if model.setup.read_temp:
1159
1160        if len(model.setup.temp_directories) > 0:
1161            dir_data = serialize_setup_directories(model.setup.temp_directories)
1162        else:
1163            dir_data = {1: model.setup.temp_directory}
1164
1165            _read_atmos_data(model, atmos_data="temp", directories=dir_data)
1166
1167    if model.setup.prcp_partitioning:
1168
1169        smash.fcore._mw_atmos_statistic.compute_prcp_partitioning(
1170            model.setup, model.mesh, model._input_data
1171        )
1172
1173    if model.setup.compute_mean_atmos:
1174
1175        print("</> Computing mean atmospheric data")
1176        smash.fcore._mw_atmos_statistic.compute_mean_atmos(
1177            model.setup, model.mesh, model._input_data
1178        )
def write_atmos_data_aliases(path_to_alias, setup, input_data):
1181def write_atmos_data_aliases(path_to_alias, setup, input_data):
1182
1183    for atmos_data, directories in input_data.items():
1184
1185        atmos_data_format = getattr(setup, f"{atmos_data}_format")
1186        atmos_date_pattern = getattr(setup, f"{atmos_data}_date_pattern")
1187        prcp_access = getattr(setup, f"{atmos_data}_access")
1188        dt = getattr(setup, "dt")
1189
1190        date_range = pd.date_range(
1191            start=setup.start_time,
1192            end=setup.end_time,
1193            freq=f"{int(setup.dt)}s",
1194        )[1:]
1195
1196        array_file = None
1197        for atmos_data_directory in directories:
1198
1199            files = _get_atmos_files(
1200                atmos_data_directory,
1201                atmos_data_format,
1202                prcp_access,
1203                dt,
1204                date_range,
1205                atmos_date_pattern,
1206            )
1207
1208            if array_file is None:
1209                array_file = np.array(files, dtype=str)
1210                lacuna = np.where(array_file == "-1")
1211            else:
1212                array_file[lacuna] = np.array(files)[lacuna]
1213                lacuna = np.where(array_file == "-1")
1214
1215            if len(lacuna) == 0:
1216                break
1217
1218        if os.path.exists(os.path.join(path_to_alias, f"tmp_{atmos_data}")):
1219            shutil.rmtree(os.path.join(path_to_alias, f"tmp_{atmos_data}"))
1220
1221        if not os.path.exists(os.path.join(path_to_alias, f"tmp_{atmos_data}")):
1222            os.mkdir(os.path.join(path_to_alias, f"tmp_{atmos_data}"))
1223
1224        for i in range(len(array_file)):
1225            if array_file[i] != "-1":
1226                mytarget = array_file[i]
1227                mylink = os.path.join(
1228                    path_to_alias, f"tmp_{atmos_data}", os.path.basename(array_file[i])
1229                )
1230                os.symlink(mytarget, mylink)