Source code for julian.date_parsers

##########################################################################################
# julian/date_parsers.py
##########################################################################################
"""
============
Date Parsers
============
"""

import pyparsing
import re

from julian.calendar      import day_from_yd, day_from_ymd
from julian.date_pyparser import date_pyparser
from julian.formatters    import format_day
from julian.mjd_jd        import day_from_mjd
from julian._exceptions   import JulianParseException, JulianValidateFailure

_PRE_FILTER = True      # set False for some performance tests

##########################################################################################
# General date parser
##########################################################################################

def day_from_string(string, order='YMD', *, doy=True, mjd=False, weekdays=False,
                    extended=False, proleptic=False):
    """Day number based on the parsing of a free-form string.

    Parameters:
        string (str):
            String to interpret.
        order (str):
            One of "YMD", "MDY", or "DMY", defining the default order for day, month, and
            year in situations where it might be ambiguous.
        doy (bool, optional):
            True to recognize dates specified as year and day-of-year.
        mjd (bool, optional):
            True to recognize Modified Julian Dates.
        weekdays (bool, optional):
            True to allow dates including weekdays.
        extended (bool, optional):
            True to support extended year values: signed (with at least four digits) and
            those involving "CE", "BCE", "AD", "BC".
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False
            to use the Julian calendar for earlier dates.

    Returns:
        int: Day number relative to January 1, 2000.

    Raises:
        JulianParseException:
            If `string` was not recognized as a valid date expression. The underlying
            pyparsing exception is attached as the cause.
        JulianValidateFailure:
            If the date string contains invalid or contradictory information.
    """

    parser = date_pyparser(order=order, strict=False, doy=doy, mjd=mjd,
                           weekdays=weekdays, extended=extended, padding=True,
                           embedded=False)

    # Only the parse itself is guarded; validation failures raised further down must
    # propagate to the caller unchanged.
    try:
        parse_list = parser.parse_string(string).as_list()
    except pyparsing.ParseException as err:
        # Chain explicitly so the traceback shows the parser failure as the cause rather
        # than as an unrelated error raised during exception handling.
        raise JulianParseException(f'unrecognized date format: "{string}"') from err

    # The parser yields (key, value) pairs; index them by key
    parse_dict = {key: value for key, value in parse_list}

    return _day_from_dict(parse_dict, proleptic=proleptic, validate=True)
########################################################################################## # Date scraper ##########################################################################################
def days_in_strings(strings, order='YMD', *, doy=False, mjd=False, weekdays=False,
                    extended=False, proleptic=False, substrings=False, first=False):
    """Day numbers found by scanning one or more strings for text that looks like a
    formatted date.

    Parameters:
        strings (str, list, tuple, or array):
            Strings to interpret. A single string is treated as a one-element list.
        order (str):
            One of "YMD", "MDY", or "DMY", defining the default order for day, month, and
            year in situations where it might be ambiguous.
        doy (bool, optional):
            True to recognize dates specified as year and day-of-year.
        mjd (bool, optional):
            True to recognize Modified Julian Dates.
        weekdays (bool, optional):
            True to allow dates including weekdays.
        extended (bool, optional):
            True to support extended year values: signed (with at least four digits) and
            those involving "CE", "BCE", "AD", "BC".
        proleptic (bool, optional):
            True to interpret all dates according to the modern Gregorian calendar, even
            those that occurred prior to the transition from the Julian calendar. False
            to use the Julian calendar for earlier dates.
        substrings (bool, optional):
            True to also return the substring containing each identified date.
        first (bool, optional):
            True to return the first date found rather than a list of dates. In this
            case, None is returned if no date is found rather than an empty list.

    Returns:
        int, tuple (day, substring), list[int or tuple], or None:
            If `first=False`, a list of dates is returned; otherwise, a single date is
            returned or None if no date was found. If `substrings` is False, each date is
            represented by a day number relative to January 1, 2000. If `substrings` is
            True, each date is represented by a tuple, where the first element is the day
            number and the second is the substring found that defines this date.

    Raises:
        JulianValidateFailure:
            If a matched date string contains invalid or contradictory information.
    """

    if isinstance(strings, str):
        strings = [strings]

    parser = date_pyparser(order=order, doy=doy, mjd=mjd, weekdays=weekdays,
                           extended=extended, strict=True, floating=False,
                           iso_only=False, padding=True, embedded=True)

    found = []
    for text in strings:

        # Cheap regex screen: don't run the parser on text that is clearly date-less
        if not _date_pattern_filter(text, doy=doy, mjd=mjd):
            continue

        # Repeatedly pull the next match off the front of whatever text remains
        info, matched, remaining = _search_in_string(text, parser)
        while info:
            day = _day_from_dict(info, proleptic=proleptic, validate=True)
            entry = (day, matched) if substrings else day

            # With first=True the very first hit is the answer
            if first:
                return entry

            found.append(entry)
            info, matched, remaining = _search_in_string(remaining, parser)

    # Reaching this point with first=True means nothing matched anywhere
    return None if first else found
##########################################################################################
# Utilities
##########################################################################################

# A "word" is any run of ASCII letters, digits, and periods. Date matches are only
# attempted starting at the first character of a word.
_WORDS = re.compile('([A-Za-z0-9.]+)')


def _search_in_string(string, parser):
    """Parse dictionary derived from the first matching pattern in the string.

    Parameters:
        string (str): String to interpret.
        parser (pyparsing.ParserElement): Parser to use.

    Returns:
        tuple (dict, match, remainder):

        - **dict** (*dict*): A dictionary of information about the first matching
          string. If no match was found, this dictionary is empty.
        - **match** (*str*): The text that matched, stripped of surrounding whitespace;
          empty string on failure.
        - **remainder** (*str*): The remainder of the string following the match; empty
          string on failure.
    """

    # To speed things up, a parse is only attempted at the start of each word, always
    # against the text running from that word to the end of the string.
    for word in _WORDS.finditer(string):
        candidate = string[word.start():]
        try:
            parsed = parser.parse_string(candidate)
        except pyparsing.ParseException:
            continue

        info = dict(parsed.as_list())

        # The '~' entry is used as the offset within the candidate text at which the
        # match ends
        end = info['~']
        return (info, candidate[:end].strip(), candidate[end:])

    return ({}, '', '')


# Value of (day number % 7) for each weekday abbreviation
_WEEKDAYS = {'SAT':0, 'SUN':1, 'MON':2, 'TUE':3, 'WED':4, 'THU':5, 'FRI':6}

# Full weekday names, used in error messages
_WEEKDAY_NAMES = {'SAT':'Saturday', 'SUN':'Sunday', 'MON':'Monday', 'TUE':'Tuesday',
                  'WED':'Wednesday', 'THU':'Thursday', 'FRI':'Friday'}


def _day_from_dict(parse_dict, proleptic=True, validate=True):
    """Day number based on the contents of a parse dictionary.

    The dictionary must contain "YEAR" and "DAY"; "MONTH" and "WEEKDAY" are optional. A
    "YEAR" value of "MJD" means that "DAY" holds a Modified Julian Date. Without a
    "MONTH", "DAY" is passed to `day_from_yd` together with the year. When `validate` is
    True and a "WEEKDAY" is present, JulianValidateFailure is raised if it disagrees with
    the date.
    """

    year = parse_dict['YEAR']
    day_value = int(parse_dict['DAY'] // 1.)    # floor any fractional part

    # An MJD needs no calendar conversion
    if year == 'MJD':
        return day_from_mjd(day_value)

    # Convert from year/month/day or from year/day
    has_month = 'MONTH' in parse_dict
    if has_month:
        day = day_from_ymd(year, parse_dict['MONTH'], day_value, validate=validate,
                           proleptic=proleptic)
    else:
        day = day_from_yd(year, day_value, validate=validate, proleptic=proleptic)

    # Nothing more to do unless a weekday has to be checked
    if not validate or 'WEEKDAY' not in parse_dict:
        return day

    weekday = parse_dict['WEEKDAY']
    if day % 7 == _WEEKDAYS[weekday]:
        return day

    date = format_day(day, order=('YMD' if has_month else 'YD'))
    name = _WEEKDAY_NAMES[weekday]
    raise JulianValidateFailure(f'Date {date} is not a {name}')


# Three-letter month or weekday abbreviation, not preceded by another letter
_DATE_WORDS_REGEX = re.compile(r'(?<![A-Z])('
                               'JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|'
                               'DEC|MON|TUE|WED|THU|FRI|SAT|SUN)', re.I)

# Stand-alone four-digit number beginning with 1 or 2
_YEAR12_REGEX = re.compile(r'(?<!\d)[12]\d\d\d(?!\d)')

# Any stand-alone four-digit number
_FOUR_DIGITS = re.compile(r'(?<!\d)\d\d\d\d(?!\d)')

# Any stand-alone one- or two-digit number
_ONE_OR_TWO_DIGITS = re.compile(r'(?<!\d)\d\d?(?!\d)')

# Two- or four-digit number, one to four non-digits, then a number of up to three digits
# whose leading digit, when there are three, is 0-3
_YEAR_DOY_REGEX = re.compile(r'(?<!\d)(\d\d)?\d\d[^\d]{1,4}[0-3]?\d?\d(?!\d)')

# "JD", "JED", "JTD", each optionally preceded by "M", not embedded in a longer word
_MJD_REGEX = re.compile(r'(?<![A-Z])M?J[ET]?D(?![A-Z])', re.I)


def _date_pattern_filter(string, doy=False, mjd=False):
    """True if this string _might_ contain a date.

    This is a quick screen built from regular expressions. It speeds up the search by
    letting the caller skip the parser for strings that clearly do not contain a date.
    The `doy` and `mjd` flags enable the extra tests for year/day-of-year and MJD text.
    """

    if not _PRE_FILTER:
        return True     # pragma: no cover

    # A month or weekday name, or a four-digit year starting with 1 or 2, is enough
    if _DATE_WORDS_REGEX.search(string) or _YEAR12_REGEX.search(string):
        return True

    # Count the stand-alone one- or two-digit numbers
    short_numbers = len(_ONE_OR_TWO_DIGITS.findall(string))

    # Three or more of them could be a purely numeric date
    if short_numbers >= 3:
        return True

    # So could two of them alongside any four-digit number
    if short_numbers >= 2 and _FOUR_DIGITS.search(string):
        return True

    # For the day-of-year case, the last remaining option is a year-like number followed
    # closely by a day-of-year-like number
    if doy and _YEAR_DOY_REGEX.search(string):
        return True

    # Finally, look for an MJD-style label
    if mjd and _MJD_REGEX.search(string):
        return True

    return False

##########################################################################################