Source code for julian.iso_parsers

##########################################################################################
# julian/iso_parsers.py
##########################################################################################
"""
===========
ISO Parsers
===========
"""

import numpy as np
from julian.calendar       import day_from_ymd, day_from_yd
from julian.leap_seconds   import seconds_on_day
from julian.time_of_day    import sec_from_hms
from julian.utc_tai_tdb_tt import tai_from_day_sec, tdb_from_tai, time_from_time
from julian._exceptions    import JulianParseException, JulianValidateFailure


def _count_white(string):
    """(number of leading blanks, number of trailing blanks)"""

    lstring = len(string)
    if not lstring:
        return (0, 0)   # pragma: no cover

    # Count the leading blanks
    for l0 in range(lstring):   # pragma: no branch
        if string[l0] != ' ':
            break

    # Count the trailing blanks
    for l1 in range(lstring):   # pragma: no branch
        if string[~l1] != ' ':          # "~l1" means counting from the end!
            break

    return (l0, l1)

# Lookup table of the date layouts recognized by day_from_iso().
#
# key = (stripped_length, dash_count)
#   stripped_length = number of characters in the integer part of the date, i.e.,
#                     excluding stripped blanks and anything from a decimal point onward;
#   dash_count      = number of "-" characters in the string.
#
# value = (y1, m0, d0, dlen, dash_locs)
#   y1        = width of the year field, which always begins at offset 0;
#   m0        = offset of the two-character month field; 0 means there is no month
#               field (ordinal date);
#   d0        = offset of the day field;
#   dlen      = width of the day field (2 for day of month, 3 for day of year);
#   dash_locs = tuple of the offsets at which dashes appear.
#
# All offsets are relative to the first character after any stripped leading blanks.
_ISO_DATE_FORMAT_INFO = {
    (10,2): (4, 5, 8, 2, (4,7)),        # yyyy-mm-dd
    ( 8,1): (4, 0, 5, 3, (4,) ),        # yyyy-ddd
    ( 8,2): (2, 3, 6, 2, (2,5)),        # yy-mm-dd
    ( 6,1): (2, 0, 3, 3, (2,) ),        # yy-ddd
    ( 8,0): (4, 4, 6, 2, ()   ),        # yyyymmdd
    ( 7,0): (4, 0, 4, 3, ()   ),        # yyyyddd
    ( 6,0): (2, 2, 4, 2, ()   ),        # yymmdd
    ( 5,0): (2, 0, 2, 3, ()   ),        # yyddd
}



[docs]
def day_from_iso(strings, *, validate=True, syntax=False, strip=False, proleptic=False):
    """Day number based on a parsing of a date string in the ISO 8601:1988 format.

    Recognized calendar date formats are "yyyy-mm-dd", "yyyymmdd", "yy-mm-dd", and
    "yymmdd". Supported ordinal date formats are "yyyy-ddd", "yyyyddd", "yy-ddd", and
    "yyddd". A fractional day following a decimal point is also permitted.

    This parser is much faster than the more general date parsing routines. It can also
    process lists or arrays of date strings of arbitrary shape, provided that every
    element uses the exact same format. The layout (field positions, blanks, and decimal
    point) is determined from the first element only and then applied to all elements.

    Because it can handle arrays of bytestrings, it is very efficient at processing raw
    data extracted from a column of an ASCII table.

    Parameters:
        strings (str, bytes, or array-like):
            String(s) to interpret.
        validate (bool, optional):
            True to validate the year/month/day values.
        syntax (bool, optional):
            True to check the string values more closely for conformance to the ISO
            standard; raise JulianParseException (a ValueError subclass) on error. This
            verifies that the dashes, decimal point, blanks, and null padding of every
            element match those of the first element.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False to
            use the Julian calendar for earlier dates.

    Returns:
        int, float, or array: Day number(s) relative to January 1, 2000. Values are
        floating-point if the strings contain a decimal point.

    Raises:
        JulianParseException:
            If the first string does not match a recognized format, if a numeric field
            cannot be converted to an integer, or if `syntax` is True and any element
            deviates from the layout of the first.
        JulianValidateFailure:
            If `validate` is True and a year, month, or day value is out of range.
    """

    # Convert to bytestring if necessary, replace Unicode
    strings = np.array(strings).astype('S')

    # The first element serves as the template for the layout of every element
    first_index = len(strings.shape) * (0,)
    first = strings[first_index].decode('latin8')
    lfirst = len(first)

    # Count characters to strip
    w2 = strings.itemsize - lfirst              # padding bytes beyond the first string

    # w0 = number of blanks before
    # w1 = number of blanks after
    if strip:
        (w0, w1) = _count_white(first)
    else:
        (w0, w1) = (0, 0)
        test = first.replace('-', '').replace('.', '')
        if not test.isdecimal():
            raise JulianParseException(f'unrecognized ISO date format: "{first}"')

    # Check for a dot
    lstripped = lfirst - w0 - w1                # length without padding
    kdot = max(0, first.find('.'))              # 0 means no dot
    kend = w0 + lstripped                       # index of the first char after the date
    kints = kdot if kdot else kend              # index of first char after all integers

    # Identify the format
    ndashes = first.count('-')
    try:
        (y1, m0, d0, dlen, dashes) = _ISO_DATE_FORMAT_INFO[(kints - w0, ndashes)]
    except KeyError:
        raise JulianParseException(f'unrecognized ISO date format: "{first}"') from None

    # Construct the dtype dictionary; each entry is (field dtype, byte offset), so the
    # fields act as overlapping windows onto the raw bytes of each string
    dtype_dict = {}
    dtype_dict['y'] = (f'|S{y1}', w0)
    dtype_dict['d'] = (f'|S{dlen}', w0 + d0)    # d is just the integer part
    if m0:
        dtype_dict['m'] = ('|S2', w0 + m0)

    if w0:
        dtype_dict['white0'] = (f'|S{w0}', 0)
    for i, dash in enumerate(dashes):
        dtype_dict[f'dash{i}'] = ('|S1', w0 + dash)

    flen = 0                                    # number of digits after the dot
    if kdot:
        dtype_dict['dot'] = ('|S1', kdot)
        flen = kend - kdot - 1
        if flen:
            dtype_dict['f'] = (f'|S{flen}', kdot + 1)
    if w1:
        dtype_dict['white1'] = (f'|S{w1}', kend)
    if w2:
        dtype_dict['nulls'] = ('|S1', lfirst)

    if syntax:
        dtype_dict['data'] = (f'|S{kend-w0}', w0)

    # Extract year, month, day, and fraction; JulianParseException on failure
    strings = strings.view(np.dtype(dtype_dict))
    try:
        y = strings['y'].astype('int')
        d = strings['d'].astype('int')
        m = strings['m'].astype('int') if 'm' in dtype_dict else 0
        f = strings['f'].astype('int') if 'f' in dtype_dict else 0
    except ValueError as e:
        raise JulianParseException(str(e)) from e

    # Validate syntax if necessary
    if syntax:
        if 'dash0' in dtype_dict and np.any(strings['dash0'] != b'-'):
            raise JulianParseException('inconsistent dashes in ISO date')
        if 'dash1' in dtype_dict and np.any(strings['dash1'] != b'-'):
            raise JulianParseException('inconsistent dashes in ISO date')
        # The dot position comes from the first element only, so confirm that every
        # element really has a decimal point there
        if 'dot' in dtype_dict and np.any(strings['dot'] != b'.'):
            raise JulianParseException('inconsistent decimal points in ISO date')
        if 'white0' in dtype_dict and np.any(strings['white0'] != w0 * b' '):
            raise JulianParseException('inconsistent white space in ISO date')
        if 'white1' in dtype_dict and np.any(strings['white1'] != w1 * b' '):
            raise JulianParseException('inconsistent white space in ISO date')
        if 'nulls' in dtype_dict and np.any(strings['nulls'] != b'\0'):
            raise JulianParseException('inconsistent null termination in ISO date')

        # No blanks are allowed anywhere inside the date itself
        data = bytearray(strings['data'])
        if b' ' in data:
            raise JulianParseException('invalid blank character in ISO date')

        # A dash inside a numeric field would otherwise be read as a minus sign
        for key in ('y', 'd', 'm', 'f'):
            if key in dtype_dict:
                if b'-' in bytearray(strings[key]):
                    raise JulianParseException('invalid negative value in ISO date')

    # Convert to day
    if m0:
        day = day_from_ymd(y, m, d, validate=validate, proleptic=proleptic)
    else:
        day = day_from_yd(y, d, validate=validate, proleptic=proleptic)

    # Add fraction if needed
    if kdot:
        if np.shape(day):
            day = day + f/10.**(flen)
        else:
            day = day + float(f)/10.**(flen)

    return day


########################################


[docs]
def sec_from_iso(strings, *, validate=True, leapsecs=True, strip=False, syntax=False):
    """Accumulated number of seconds into a day, based on a parsing of a time string in
    ISO 8601:1988 "extended" format (but using a decimal point for fractional seconds
    rather than a comma).

    The format required is "hh:mm:ss[.s...][Z]". This parser is much faster than the more
    general time parsing routines. It can also process lists or arrays of time strings of
    arbitrary shape, provided that every element uses the exact same format. The layout
    (field positions, blanks, decimal point, and "Z") is determined from the first element
    only and then applied to all elements.

    Because it can handle arrays of bytestrings, it is very efficient at processing raw
    data extracted from a column of an ASCII table.

    Parameters:
        strings (str, bytes, or array-like[str or bytes]):
            Strings to interpret. If an array is provided, all values must use the same
            format.
        validate (bool, optional):
            True to check the hour/minute/second values more carefully; raise
            JulianValidateFailure (a ValueError subclass) on error.
        leapsecs (bool, optional):
            True to tolerate leap second values during validation.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        syntax (bool, optional):
            True to check the string values more closely for conformance to the ISO
            standard; raise JulianParseException (a ValueError subclass) on error.

    Returns:
        int, float, or array:
            Elapsed seconds since beginning of day. Values are integral if the strings
            contain no decimal point.

    Raises:
        JulianParseException:
            If the first string does not match the expected format, if a numeric field
            cannot be converted to an integer, or if `syntax` is True and any element
            deviates from the layout of the first.
        JulianValidateFailure: If `validate` is True and an hour, minute, or second value
            is out of range.
    """

    # Convert to bytestring if necessary, replace Unicode
    strings = np.array(strings).astype('S')

    # The first element serves as the template for the layout of every element
    first_index = len(strings.shape) * (0,)
    first = strings[first_index].decode('latin8')
    lfirst = len(first)

    # Count characters to strip
    w2 = strings.itemsize - lfirst              # padding bytes beyond the first string

    # w0 = number of blanks before
    # w1 = number of blanks after
    if strip:
        (w0, w1) = _count_white(first)
    else:
        (w0, w1) = (0, 0)
        test = first.replace(':', '').replace('.', '').rstrip('Z')
        if not test.isdecimal():
            raise JulianParseException(f'unrecognized ISO time format: "{first}"')

    # An empty or all-blank string has no time to parse; rejecting it here keeps the
    # "Z" test below from indexing outside the string
    lstripped = lfirst - w0 - w1
    if lstripped <= 0:
        raise JulianParseException(f'unrecognized ISO time format: "{first}"')

    # Check for "Z"
    wz = int(first[w0 + lstripped - 1] == 'Z')  # wz = 0 or 1
    lstripped -= wz                             # width of time string without extras
    kend = w0 + lstripped                       # index of the first char after the time

    # Locate colons and dots
    first_array = np.array(list(first))
    kcolons = np.where(first_array == ':')[0]
    if kcolons.size > 2:
        raise JulianParseException('unrecognized ISO time format; too many colons: '
                                   f'"{first}"')

    kdots = np.where(first_array == '.')[0]
    if kdots.size > 1:
        raise JulianParseException('unrecognized ISO time format; too many decimals: '
                                   f'"{first}"')
    kdot = int(kdots[0]) if kdots.size == 1 else 0      # 0 means no dot

    kints = kdot if kdot else kend              # index of first char after all integers

    # Identify the h, m, s, and fraction field locations and widths
    if kcolons.size:
        kcolons = [w0-1] + list(kcolons)        # colon locations plus fake one in front
        khms = np.array(kcolons) + 1            # start locations of fields
        khms1 = list(kcolons[1:]) + [kints]     # end locations of all integer fields
        widths = np.array(khms1) - khms         # widths of fields
        if np.any(widths != 2):
            raise JulianParseException(f'invalid field width in ISO time: "{first}"')
    else:
        # Without colons, the integer part must be one to three two-digit fields
        width = kints - w0
        fields = width // 2
        if fields > 3 or width != fields * 2:
            raise JulianParseException('invalid text width in ISO time format: '
                                       f'"{first}"')
        khms = w0 + 2 * np.arange(fields)       # start locations of fields
        widths = fields * [2]

    # Construct the dtype dictionary; each entry is (field dtype, byte offset), so the
    # fields act as overlapping windows onto the raw bytes of each string
    dtype_dict = {}
    for i, w in enumerate(widths):
        key = 'hms'[i]
        dtype_dict[key] = (f'|S{w}', khms[i])

    if w0:
        dtype_dict['white0'] = (f'|S{w0}', 0)
    for i, kcolon in enumerate(kcolons[1:]):    # skip fake colon in front
        dtype_dict[f'colon{i}'] = ('|S1', kcolon)

    flen = 0                                    # number of digits after the dot
    if kdot:
        dtype_dict['dot'] = ('|S1', kdot)
        flen = kend - kdot - 1
        if flen:
            dtype_dict['f'] = (f'|S{flen}', kdot + 1)
    if wz:
        dtype_dict['z'] = ('|S1', kend)
    if w1:
        dtype_dict['white1'] = (f'|S{w1}', kend + wz)
    if w2:
        dtype_dict['nulls'] = ('|S1', lfirst)

    if syntax:
        dtype_dict['data'] = (f'|S{kend-w0}', w0)

    # Extract hours, minutes, seconds; JulianParseException on failure
    strings = strings.view(np.dtype(dtype_dict))
    try:
        h = strings['h'].astype('int')
        m = strings['m'].astype('int') if 'm' in dtype_dict else 0
        s = strings['s'].astype('int') if 's' in dtype_dict else 0
    except ValueError as e:
        raise JulianParseException(str(e)) from e

    if kdot:
        if 'f' in dtype_dict:
            f = strings['f'].astype('int') / 10.**flen
        else:
            f = 0.

        # The fraction belongs to whichever field is the last one present
        if 's' in dtype_dict:
            s = s + f
        elif 'm' in dtype_dict:
            m = m + f
        else:
            h = h + f

    # Validate if necessary
    if syntax:
        if 'white0' in dtype_dict and np.any(strings['white0'] != w0 * b' '):
            raise JulianParseException('inconsistent white space in ISO time')
        if 'colon0' in dtype_dict and np.any(strings['colon0'] != b':'):
            raise JulianParseException('inconsistent colons in ISO time')
        if 'colon1' in dtype_dict and np.any(strings['colon1'] != b':'):
            raise JulianParseException('inconsistent colons in ISO time')
        if 'dot' in dtype_dict and np.any(strings['dot'] != b'.'):
            raise JulianParseException('inconsistent decimal points in ISO time')
        if 'z' in dtype_dict and np.any(strings['z'] != b'Z'):
            raise JulianParseException('inconsistent "Z" usage in ISO time')
        if 'white1' in dtype_dict and np.any(strings['white1'] != w1 * b' '):
            raise JulianParseException('inconsistent white space in ISO time')
        if 'nulls' in dtype_dict and np.any(strings['nulls'] != b'\0'):
            raise JulianParseException('inconsistent null termination in ISO time')

        # Neither blanks nor minus signs are allowed anywhere inside the time itself
        data = bytearray(strings['data'])
        if b' ' in data or b'-' in data:
            raise JulianParseException('invalid blank character in ISO time')

    return sec_from_hms(h, m, s, validate=validate, leapsecs=leapsecs)


########################################


[docs]
def day_sec_from_iso(strings, *, validate=True, syntax=False, strip=False,
                     proleptic=False, leapsecs=True):
    """Day and second based on a parsing of the string in ISO date-time format.

    This function parses date-time strings in the fixed ISO format, using "yyyy-mm-dd"
    or "yyyy-ddd" for the date, a single space or "T", and a time as "hh:mm:ss[.s...][Z]".
    It is much faster than the more general date parsing routines. It can also process
    lists or arrays of date strings of arbitrary shape, provided that every element uses
    the exact same format. The position of the separator is determined from the first
    element only and then applied to all elements.

    If no separator is found, the strings are parsed as dates alone and the returned
    seconds value is zero.

    Because it can handle arrays of bytestrings, it is very efficient at processing raw
    data extracted from a column of an ASCII table.

    Parameters:
        strings (str, bytes, or array-like):
            Strings to interpret. If an array is provided, all values must use the same
            format.
        validate (bool, optional):
            True to validate the ranges of the date and time values.
        syntax (bool, optional):
            True to check the string values more closely for conformance to the ISO
            standard; raise JulianParseException (a ValueError subclass) on error.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False to
            use the Julian calendar for earlier dates.
        leapsecs (bool, optional):
            True to tolerate leap second values during validation.

    Returns:
        tuple (day, sec):

        - **day** (*int or array*): Day number(s) relative to January 1, 2000.
        - **sec** (*int, float, or array*): Elapsed seconds since beginning of day. Values
          are integral if the time strings contain no decimal point.

    Raises:
        JulianParseException:
            If the date or time cannot be parsed, or if `syntax` is True and the
            separator character is not the same in every element.
        JulianValidateFailure:
            If `validate` is True and any numeric value is out of range.
    """

    # Convert to an array of strings, replace Unicode
    strings = np.array(strings).astype('S')

    # The first element serves as the template for the layout of every element
    first_index = len(strings.shape) * (0,)
    first = strings[first_index].decode('latin8')
    lfirst = len(first)

    # Check for a T or blank separator
    csep = 'T'
    isep = first.find(csep)
    if isep == -1:
        # Search for a blank after any leading blanks; a match at the start of the
        # trailing blanks is padding, not a separator
        w0, w1 = _count_white(first)
        csep = ' '
        isep = first.find(csep, w0)
        if isep == lfirst - w1:
            isep = -1

    # If no separator is found, it is just a date; forward every option so that the
    # date is interpreted the same way as it would be in a full date-time string
    if isep == -1:
        day = day_from_iso(strings, validate=validate, syntax=syntax, strip=strip,
                           proleptic=proleptic)
        return (day, 0)

    # Otherwise, parse the date and time separately; each entry is
    # (field dtype, byte offset) within the raw bytes of each string
    dtype_dict = {'date': ('|S' + str(isep), 0),
                  'time': ('|S' + str(lfirst - isep - 1), isep + 1),
                  'sep' : ('|S1', isep)}

    strings = strings.view(np.dtype(dtype_dict))
    day = day_from_iso(strings['date'], validate=validate, syntax=syntax, strip=strip,
                       proleptic=proleptic)
    sec = sec_from_iso(strings['time'], validate=validate, syntax=syntax, strip=strip,
                       leapsecs=leapsecs)

    if syntax:
        if np.any(strings['sep'] != csep.encode('latin8')):
            raise JulianParseException('invalid ISO date-time punctuation')

    # Compare the seconds against the length of the specific day given
    if validate:
        if np.any(sec >= seconds_on_day(day)):
            raise JulianValidateFailure('seconds value is outside allowed range')

    return (day, sec)


########################################


[docs]
def tai_from_iso(strings, *, validate=True, strip=False, proleptic=False):
    """Convert an ISO date or date-time string to a time in seconds TAI.

    The string is split into a day number and seconds by `day_sec_from_iso()`, and that
    pair is then converted by `tai_from_day_sec()`.

    Parameters:
        strings (str, bytes, or array-like):
            Strings to interpret. If an array is provided, all values must use the same
            format.
        validate (bool, optional):
            True to validate the date and time values.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False to
            use the Julian calendar for earlier dates.

    Returns:
        int, float, or array: Time in seconds TAI.

    Raises:
        JulianValidateFailure:
            If `validate` is True and a value embedded in the date or time is out of
            range.
    """

    options = dict(validate=validate, strip=strip, proleptic=proleptic)
    day, sec = day_sec_from_iso(strings, **options)
    return tai_from_day_sec(day, sec)




[docs]
def tdb_from_iso(strings, *, validate=True, strip=False, proleptic=False):
    """Convert an ISO date or date-time string to a time in seconds TDB.

    The string is split into a day number and seconds by `day_sec_from_iso()`; that pair
    is converted to TAI by `tai_from_day_sec()` and then to TDB by `tdb_from_tai()`.

    Parameters:
        strings (str, bytes, or array-like):
            Strings to interpret. If an array is provided, all values must use the same
            format.
        validate (bool, optional):
            True to validate the date and time values.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False to
            use the Julian calendar for earlier dates.

    Returns:
        int, float, or array: Time in seconds TDB.

    Raises:
        JulianValidateFailure:
            If `validate` is True and a value embedded in the date or time is out of
            range.
    """

    options = dict(validate=validate, strip=strip, proleptic=proleptic)
    day, sec = day_sec_from_iso(strings, **options)
    tai = tai_from_day_sec(day, sec)
    return tdb_from_tai(tai)




[docs]
def time_from_iso(strings, timesys='TAI', *, validate=True, strip=False, proleptic=False):
    """Convert an ISO date or date-time string to a time in the requested time system.

    The string is first converted to seconds TAI by `tai_from_iso()`, and the result is
    then converted from "TAI" to `timesys` by `time_from_time()`.

    Parameters:
        strings (str, bytes, or array-like[str or bytes]):
            Strings to interpret. If an array is provided, all values must use the same
            format.
        timesys (str):
            Name of the time system, "UTC", "TAI", "TDB", or "TT".
        validate (bool, optional):
            True to validate the date and time values.
        strip (bool, optional):
            True to skip over leading and trailing blanks.
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False to
            use the Julian calendar for earlier dates.

    Returns:
        int, float, or array: Time in seconds in the specified time system.

    Raises:
        JulianValidateFailure:
            If `validate` is True and a value embedded in the date or time is out of
            range.
    """

    return time_from_time(
        tai_from_iso(strings, validate=validate, strip=strip, proleptic=proleptic),
        'TAI',
        newsys=timesys,
    )


##########################################################################################